In [10]:
import os

import pandas as pd

##### ----------

In [11]:
import os

import pandas as pd



# Define a function to load and merge CSV data with parquet data

def load_and_process_data(csv_path, parquet_dir):

    # Step 1: Load CSV data

    csv_data = pd.read_csv(csv_path)

    

    # Step 2: Locate all .parquet files with unique ids

    parquet_files = [os.path.join(parquet_dir, f'{file_id}') for file_id in os.listdir(parquet_dir)]

    

    # Step 3: Define a function to process each file by calculating the mean of relevant features

    def process_parquet_files(files):

        batch_data = []

        for file_path in files:

            df = pd.read_parquet(file_path)

            

            # Aggregate relevant features

            mean_summary = df.agg({

                'step': 'mean',                # Mean of time steps if relevant

                'X': 'mean',                   # Mean acceleration along X axis

                'Y': 'mean',                   # Mean acceleration along Y axis

                'Z': 'mean',                   # Mean acceleration along Z axis

                'enmo': 'mean',                # Mean ENMO

                'anglez': 'mean',              # Mean Angle-Z

                'time_of_day': 'mean',                # Mean hour from time_of_day

                'light': 'mean',               # Mean ambient light

                'non-wear_flag': 'mean',       # Percentage of time watch is not worn

                'weekday': 'mean',             # Mode for weekday

                'quarter': 'mean',             # Mode for quarter

                'relative_date_PCIAT': 'mean'  # Mean relative days to PCIAT

            }).to_frame().T  # Transpose to have a single row

            

            # Calculate modes for specific fields

            mean_summary['weekday'] = df['weekday'].mode()[0] if not df['weekday'].mode().empty else None

            mean_summary['quarter'] = df['quarter'].mode()[0] if not df['quarter'].mode().empty else None

            mean_summary['non-wear_flag'] = df['non-wear_flag'].mode()[0] if not df['non-wear_flag'].mode().empty else None

            mean_summary['relative_date_PCIAT'] = df['relative_date_PCIAT'].mode()[0] if not df['relative_date_PCIAT'].mode().empty else None

            

            # Remove 'id=' prefix from the filename and add to DataFrame

            unique_id = os.path.basename(file_path).replace("id=", "")

            mean_summary['id'] = unique_id

            batch_data.append(mean_summary)

        

        return pd.concat(batch_data, ignore_index=True)

    

    # Step 4: Batch load files in chunks

    batch_size = 50  # Adjust based on memory capacity

    final_data = []

    

    for i in range(0, len(parquet_files), batch_size):

        batch_files = parquet_files[i:i+batch_size]

        batch_result = process_parquet_files(batch_files)

        final_data.append(batch_result)  # Append processed batch data

    

    # Combine all batches into one final DataFrame

    final_parquet_data = pd.concat(final_data, ignore_index=True)

    

    # Step 5: Ensure 'id' in both dataframes is of the same type (string in this case)

    csv_data['id'] = csv_data['id'].astype(str)

    final_parquet_data['id'] = final_parquet_data['id'].astype(str)

    

    # Step 6: Merge CSV data with the parquet data using a left join on 'id'

    merged_data = csv_data.merge(final_parquet_data, on='id', how='left')

    

    return merged_data



# Process train and test data

train_merged = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

test_merged = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')



# Display the resulting DataFrames to verify

print("Train Merged Data:")



print("\nTest Merged Data:")

test_merged


Train Merged Data:

Test Merged Data:


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,63.1265,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,47.2211,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,50.4767,,,Fall,1.27,,,,Fall,2.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,,Summer,2.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,


In [4]:
train_merged

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


In [12]:
# Drop rows where 'sii' is NaN

#train_filtered_data = train_merged.dropna(subset=['sii'])



# Additional filtering: Drop rows where the count of NaN values is 70 or more

train_filtered_data = train_merged.dropna(subset=['sii'])



# Display the resulting DataFrame to verify

train_filtered_data

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,ff6c2bb8,Fall,8,0,,,Fall,17.139810,52.5,67.2,...,2.0,2.0,1.0,22.0,Fall,41.0,58.0,Fall,2.0,0.0
3954,ff759544,Summer,7,1,,,Summer,13.927006,48.5,46.6,...,3.0,3.0,0.0,33.0,Summer,48.0,67.0,Summer,0.0,1.0
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0


In [5]:
train_filtered_data

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,ff6c2bb8,Fall,8,0,,,Fall,17.139810,52.5,67.2,...,2.0,2.0,1.0,22.0,Fall,41.0,58.0,Fall,2.0,0.0
3954,ff759544,Summer,7,1,,,Summer,13.927006,48.5,46.6,...,3.0,3.0,0.0,33.0,Summer,48.0,67.0,Summer,0.0,1.0
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0


In [6]:
test_merged

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,63.1265,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,47.2211,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,50.4767,,,Fall,1.27,,,,Fall,2.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,,Summer,2.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,


In [13]:
import pandas as pd

from sklearn.impute import KNNImputer

from sklearn.preprocessing import OrdinalEncoder

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.metrics import accuracy_score



# Function to preprocess training data

def preprocess_train_data(data, parquet_path):

    # Drop 'id' and separate numeric and categorical data

    data_no_id = data.drop(columns=['id'])

    numeric_data = data_no_id.select_dtypes(include=['number']).copy()

    categorical_data = data_no_id.select_dtypes(exclude=['number']).copy()

    
    
    # Drop PCIAT-PCIAT_Total and sii before applying KNN

    numeric_data = numeric_data.drop(columns=['PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02',

       'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06',

       'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10',

       'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14',

       'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18',

       'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'Physical-BMI'], errors='ignore')

    

    # Impute categorical data with mode

    for col in categorical_data.columns:

        categorical_data[col].fillna(categorical_data[col].mode()[0], inplace=True)

    

    # Encode categorical data

    encoder = OrdinalEncoder()

    categorical_encoded = encoder.fit_transform(categorical_data)

    categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=categorical_data.columns)

    categorical_encoded_df = categorical_encoded_df.drop(columns=['PCIAT-Season'], errors = 'ignore')

    # Apply KNN Imputer to numeric data

    knn_imputer = KNNImputer(n_neighbors=10)

    numeric_imputed = pd.DataFrame(knn_imputer.fit_transform(numeric_data), columns=numeric_data.columns)

    def calculate_bmi(row):
        height_m = row["Physical-Height"] / 100  # Convert height to meters
        weight = row["Physical-Weight"]

        if row["Basic_Demos-Age"] < 18:  # For minors
            return weight / (height_m ** 2) * 1.1  # Add a factor for growth
        elif 18 <= row["Basic_Demos-Age"] < 40:  # For adults
            return weight / (height_m ** 2)
    numeric_imputed['BMI'] = numeric_imputed.apply(calculate_bmi, axis = 1)    
    numeric_imputed['Internet_Hours_Age'] = numeric_imputed['PreInt_EduHx-computerinternet_hoursday'] * numeric_imputed['Basic_Demos-Age']
    numeric_imputed['Physical-Waist_Age'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Waist_Circumference']
    numeric_imputed['BMI_Age'] = numeric_imputed['BMI'] * numeric_imputed['Basic_Demos-Age']
    numeric_imputed['Physical-Height_Age'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Height']
    numeric_imputed['SDS_InternetHours'] = numeric_imputed['SDS-SDS_Total_T'] * numeric_imputed['PreInt_EduHx-computerinternet_hoursday']
    numeric_imputed['SDS_BMI'] = numeric_imputed['BIA-BIA_BMI'] * numeric_imputed['SDS-SDS_Total_T']
    numeric_imputed['CGAS_SDS'] = numeric_imputed['CGAS-CGAS_Score'] * numeric_imputed['SDS-SDS_Total_T']
    numeric_imputed['CGAS_Endurance_Mins'] = numeric_imputed['CGAS-CGAS_Score'] * numeric_imputed['Fitness_Endurance-Time_Mins']
    numeric_imputed['SDS_Activity'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['SDS-SDS_Total_T']

    numeric_imputed['BMI_Systolic_BP'] = numeric_imputed['BIA-BIA_BMI'] * numeric_imputed['Physical-Systolic_BP']
    numeric_imputed['Age_Systolic_BP'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Systolic_BP']
    numeric_imputed['PreInt_Systolic_BP'] = numeric_imputed['Physical-Systolic_BP'] * numeric_imputed['PreInt_EduHx-computerinternet_hoursday']
    numeric_imputed['PAQ_A_Activity'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['PAQ_A-PAQ_A_Total']
    numeric_imputed['Activity_CU_PU'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['FGC-FGC_CU'] * numeric_imputed['FGC-FGC_PU']   
    
    # Combine numeric and categorical data

    processed_data = pd.concat([data['id'].reset_index(drop=True), numeric_imputed, categorical_encoded_df], axis=1)

    

    return processed_data



# Function to preprocess test data

def preprocess_test_data(data, parquet_path):

    # Drop 'id' and separate numeric and categorical data

    data_no_id = data.drop(columns=['id'])

    numeric_data = data_no_id.select_dtypes(include=['number']).copy()

    categorical_data = data_no_id.select_dtypes(exclude=['number']).copy()

    numeric_data = numeric_data.drop(columns=['Physical-BMI'], errors='ignore')

    # Impute categorical data with mode

    for col in categorical_data.columns:

        categorical_data[col].fillna(categorical_data[col].mode()[0], inplace=True)

    

    # Encode categorical data

    encoder = OrdinalEncoder()

    categorical_encoded = encoder.fit_transform(categorical_data)

    categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=categorical_data.columns)

    

    # Apply KNN Imputer to numeric data

    knn_imputer = KNNImputer(n_neighbors=10)

    numeric_imputed = pd.DataFrame(knn_imputer.fit_transform(numeric_data), columns=numeric_data.columns)

    def calculate_bmi(row):
        height_m = row["Physical-Height"] / 100  # Convert height to meters
        weight = row["Physical-Weight"]

        if row["Basic_Demos-Age"] < 18:  # For minors
            return weight / (height_m ** 2) * 1.1  # Add a factor for growth
        elif 18 <= row["Basic_Demos-Age"] < 40:  # For adults
            return weight / (height_m ** 2)
    numeric_imputed['BMI'] = numeric_imputed.apply(calculate_bmi, axis = 1)    
    numeric_imputed['Internet_Hours_Age'] = numeric_imputed['PreInt_EduHx-computerinternet_hoursday'] * numeric_imputed['Basic_Demos-Age']
    numeric_imputed['Physical-Waist_Age'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Waist_Circumference']
    numeric_imputed['BMI_Age'] = numeric_imputed['BMI'] * numeric_imputed['Basic_Demos-Age']
    numeric_imputed['Physical-Height_Age'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Height']
    numeric_imputed['SDS_InternetHours'] = numeric_imputed['SDS-SDS_Total_T'] * numeric_imputed['PreInt_EduHx-computerinternet_hoursday']
    numeric_imputed['SDS_BMI'] = numeric_imputed['BIA-BIA_BMI'] * numeric_imputed['SDS-SDS_Total_T']
    numeric_imputed['CGAS_SDS'] = numeric_imputed['CGAS-CGAS_Score'] * numeric_imputed['SDS-SDS_Total_T']
    numeric_imputed['CGAS_Endurance_Mins'] = numeric_imputed['CGAS-CGAS_Score'] * numeric_imputed['Fitness_Endurance-Time_Mins']
    numeric_imputed['SDS_Activity'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['SDS-SDS_Total_T']

    numeric_imputed['BMI_Systolic_BP'] = numeric_imputed['BIA-BIA_BMI'] * numeric_imputed['Physical-Systolic_BP']
    numeric_imputed['Age_Systolic_BP'] = numeric_imputed['Basic_Demos-Age'] * numeric_imputed['Physical-Systolic_BP']
    numeric_imputed['PreInt_Systolic_BP'] = numeric_imputed['Physical-Systolic_BP'] * numeric_imputed['PreInt_EduHx-computerinternet_hoursday']
    numeric_imputed['PAQ_A_Activity'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['PAQ_A-PAQ_A_Total']
    numeric_imputed['Activity_CU_PU'] = numeric_imputed['BIA-BIA_Activity_Level_num'] * numeric_imputed['FGC-FGC_CU'] * numeric_imputed['FGC-FGC_PU']    
    # Combine numeric and categorical data

    processed_data = pd.concat([data['id'].reset_index(drop=True), numeric_imputed, categorical_encoded_df], axis=1)

    

    return processed_data



# Preprocess training and testing data

train_processed = preprocess_train_data(train_filtered_data, '/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet')

test_processed = preprocess_test_data(test_merged, '/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet')



# Train model using only necessary features and target 'sii'

y_train = train_processed['sii']

X_train = train_processed.drop(columns=['id', 'sii'])

print(y_train)

X_test = test_processed.drop(columns=['id'], errors='ignore')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  categorical_data[col].fillna(categorical_data[col].mode()[0], inplace=True)


0       2.0
1       0.0
2       0.0
3       1.0
4       1.0
       ... 
2731    0.0
2732    1.0
2733    1.0
2734    1.0
2735    0.0
Name: sii, Length: 2736, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  categorical_data[col].fillna(categorical_data[col].mode()[0], inplace=True)


In [14]:
X_train

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,...,Basic_Demos-Enroll_Season,CGAS-Season,Physical-Season,Fitness_Endurance-Season,FGC-Season,BIA-Season,PAQ_A-Season,PAQ_C-Season,SDS-Season,PreInt_EduHx-Season
0,5.0,0.0,51.0,46.0,50.8,22.2,65.4,83.8,114.6,4.0,...,0.0,3.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0
1,9.0,0.0,59.0,48.0,46.0,22.0,75.0,70.0,122.0,5.8,...,2.0,1.0,0.0,1.0,0.0,3.0,3.0,0.0,0.0,2.0
2,10.0,1.0,71.0,56.5,75.6,26.4,65.0,94.0,117.0,5.0,...,2.0,0.0,0.0,0.0,0.0,2.0,3.0,2.0,0.0,2.0
3,9.0,0.0,71.0,56.0,81.6,26.4,60.0,97.0,117.0,6.0,...,3.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0
4,13.0,1.0,50.0,59.5,112.2,27.0,60.0,73.0,102.0,4.8,...,1.0,3.0,2.0,1.0,2.0,2.0,3.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,8.0,0.0,63.2,52.5,67.2,25.0,60.0,65.0,112.0,5.4,...,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0
2732,7.0,1.0,61.0,48.5,46.6,23.0,65.0,75.0,105.0,5.2,...,2.0,1.0,2.0,1.0,2.0,0.0,3.0,1.0,2.0,2.0
2733,13.0,0.0,60.0,59.5,82.4,26.8,71.0,70.0,104.0,5.6,...,0.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0,3.0,0.0
2734,11.0,0.0,68.0,60.0,109.8,26.8,79.0,99.0,116.0,4.6,...,0.0,1.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0,0.0


In [15]:
X_test

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,...,Basic_Demos-Enroll_Season,CGAS-Season,Physical-Season,Fitness_Endurance-Season,FGC-Season,BIA-Season,PAQ_A-Season,PAQ_C-Season,SDS-Season,PreInt_EduHx-Season
0,5.0,0.0,51.0,46.0,50.8,25.4,63.8,78.4,109.2,5.0,...,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.0,0.0,65.6,48.0,46.0,22.0,75.0,70.0,122.0,5.0,...,2.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,2.0
2,10.0,1.0,71.0,56.5,75.6,25.4,65.0,94.0,117.0,5.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0
3,9.0,0.0,71.0,56.0,81.6,25.4,60.0,97.0,117.0,6.0,...,3.0,0.0,2.0,2.0,2.0,1.0,0.0,3.0,2.0,3.0
4,18.0,1.0,61.8,58.3,92.32,25.4,63.2,79.6,117.6,5.0,...,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,13.0,1.0,50.0,59.5,112.2,25.4,60.0,73.0,102.0,5.0,...,1.0,3.0,2.0,1.0,2.0,1.0,0.0,1.0,2.0,1.0
6,10.0,0.0,67.8,55.0,84.6,25.4,123.0,83.0,163.0,5.0,...,0.0,2.0,0.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0
7,10.0,1.0,67.8,59.25,84.2,27.0,71.0,90.0,116.0,5.0,...,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,15.0,0.0,61.8,58.6,90.32,25.4,74.8,80.0,126.8,5.0,...,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0
9,19.0,1.0,61.8,58.3,92.32,25.4,63.2,79.6,117.6,5.0,...,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.utils.class_weight import compute_sample_weight

import numpy as np



# Split the data into 80% training and 20% validation

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train_split = X_train
X_val = X_train

y_train_split = y_train
y_val = y_train

# Manually define class weights (higher weight for rare classes 2 and 3)

class_weights = {0: 20, 1: 1, 2: 80, 3: 100}   # Example: increasing weight on 2 and 3

sample_weights = compute_sample_weight(class_weight=class_weights, y=y_train_split)



# Define each model with hyperparameters



# LightGBM with sample weights

lgbm_regressor = LGBMRegressor(

    boosting_type='gbdt',

    objective='regression',

    random_state=42,

    n_estimators=8000,

    learning_rate=0.001

)



# CatBoost with loss weight adjustment

catboost_regressor = CatBoostRegressor(

    iterations=8000,

    learning_rate=0.001,

    depth=6,

    loss_function='RMSE',

    random_seed=42,

    verbose=0

)



# XGBoost does not have a direct class_weight parameter, so we use sample weights in fit

xgboost_regressor = XGBRegressor(

    objective='reg:squarederror',

    n_estimators=8000,

    learning_rate=0.001,

    random_state=42

)



# Train each model with class/sample weights

lgbm_regressor.fit(X_train_split, y_train_split, sample_weight=sample_weights)

catboost_regressor.fit(X_train_split, y_train_split, sample_weight=sample_weights)

xgboost_regressor.fit(X_train_split, y_train_split, sample_weight=sample_weights)



# Predict on the validation set and average predictions

y_val_pred_lgbm = lgbm_regressor.predict(X_val)

y_val_pred_catboost = catboost_regressor.predict(X_val)

y_val_pred_xgboost = xgboost_regressor.predict(X_val)



# Ensemble the predictions by averaging

y_val_pred_ensemble = (y_val_pred_lgbm + y_val_pred_catboost + y_val_pred_xgboost) / 3

y_val_pred_ensemble_rounded = y_val_pred_ensemble.round().clip(0, 3).astype(int)



# Calculate metrics for the ensemble on the validation set

val_mse = mean_squared_error(y_val, y_val_pred_ensemble_rounded)

val_r2 = r2_score(y_val, y_val_pred_ensemble_rounded)

print(f"Validation Mean Squared Error (Ensemble): {val_mse:.2f}")

print(f"Validation R-squared (Ensemble): {val_r2:.2f}")



# Predict on the test set and average predictions

y_test_pred_lgbm = lgbm_regressor.predict(X_test)

y_test_pred_catboost = catboost_regressor.predict(X_test)

y_test_pred_xgboost = xgboost_regressor.predict(X_test)



# Ensemble the predictions by averaging

y_test_pred_ensemble = (y_test_pred_lgbm + y_test_pred_catboost + y_test_pred_xgboost) / 3

y_test_pred_ensemble_rounded = y_test_pred_ensemble.round().clip(0, 3).astype(int)



# Prepare submission with rounded predictions for test set

test_processed['sii'] = y_test_pred_ensemble_rounded  # Assuming 'test_processed' has an 'id' column



# Count the occurrences of each sii value in the predictions

sii_counts = test_processed['sii'].value_counts()

print("Count of each sii value in predictions:")

print(sii_counts)



# Save submission to CSV

submission = test_processed[['id', 'sii']]

submission_path = 'submission.csv'

submission.to_csv(submission_path, index=False)

print(f"Submission has been saved to {submission_path}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6900
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 59
[LightGBM] [Info] Start training from score 1.081913
