In [15]:
# !pip install skimpy --quiet
# !pip install wordcloud --quiet
# !pip install category_encoders --quiet
# !pip install shap
!pip install colorama
from colorama import Fore, Style
# Basic imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
#import skimpy
import re
import time
import random
import datetime
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="Parameters: { 'verbose' } are not used.")

##################### Preprocessing imports 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.inspection import permutation_importance
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from IPython.display import clear_output

##################### Models
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

##################### optuna library import
import optuna
import shap
# Seed used for the shuffled StratifiedKFold split in TrainML
random_state = 42
# Number of cross-validation folds (also the number of per-fold test predictions averaged)
n_splits = 5

#!pip install opendatasetS



In [None]:
######## Load the competition data (read from the local Kaggle input directory)
# Single place to retarget the notebook at another copy of the dataset
DATA_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
data_dict = pd.read_csv(f"{DATA_DIR}/data_dictionary.csv")
sample = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

<h3> Basic configuration </h3>
<li>Map sii - map the target column to make the target data clearer</li>
<li>Perform basic actions - such as dropping the id column from the datasets and applying the map to the dataset</li>

In [None]:
# Define the target feature name and keep its values as a standalone Series
target = 'sii'
Target_series = train['sii']

# Drop the id column from train and test.
# errors='ignore' keeps the cell safe to re-run once the column is gone, without
# swallowing unrelated failures (e.g. a missing 'sii' column or frames that were
# never loaded), which a bare `except:` would silently hide.
if 'id' not in train.columns:
    print("Already dropped id or name column")
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

# Keep only the data-dictionary rows whose Field does not contain 'id'
data_dict = data_dict[data_dict['Field'].str.contains('id') == False]

# Create a copy
train_copy = train.copy()
    

<h3>SelectKBest</h3>
Find the best categorical features using SelectKBest. This function selects the features with the highest chi-squared scores relative to the target column.

In [None]:
def Kbest(cat_dummies, k=10):
    """Select the dummy columns with the highest chi-squared score against 'sii'.

    Parameters
    ----------
    cat_dummies : DataFrame
        Dummy-encoded features plus a 'sii' column used as the target.
    k : int, default 10
        Number of features to keep. It is capped at the number of available
        feature columns so that SelectKBest is never asked for more than exist.

    Returns
    -------
    DataFrame
        The selected feature columns (without 'sii'), ordered by descending
        score, with a fresh 0..n-1 index.
    """
    # Split once; the target must not be counted or scored as a feature
    features = cat_dummies.drop("sii", axis=1)
    k = min(k, features.shape[1])

    chi2_features = SelectKBest(chi2, k=k)
    X_kbest_features = chi2_features.fit_transform(features, cat_dummies["sii"])

    # Create a DataFrame to visualize the results, best score first
    feature_scores = pd.DataFrame({'Feature': features.columns, 'Score': chi2_features.scores_})
    feature_scores = feature_scores.sort_values(by='Score', ascending=False)

    # Print the top k features
    print(feature_scores.head(k))
    # Reduced features (the target column is excluded from both counts)
    print('Original feature number:', features.shape[1])
    print('Reduced feature number:', X_kbest_features.shape[1])

    # drop=True discards the old index directly instead of materialising it as a
    # column and dropping it by the name 'index'
    top_features = list(feature_scores.head(k).Feature)
    return cat_dummies[top_features].reset_index(drop=True)

In [None]:
#### Appends the target series to a dataframe as an extra column
def concat_df_a_target(df,target):
    """Return `df` and `target` joined column-wise (rows are matched on the index)."""
    pieces = [df, target]
    return pd.concat(pieces, axis="columns")

def drop_Nans(train,subset):
    """Drop the rows that have a missing value in `subset` and renumber the index.

    Parameters
    ----------
    train : DataFrame
        Frame to filter; it is not modified.
    subset : column label or list of labels
        Columns inspected for missing values (forwarded to `DataFrame.dropna`).

    Returns
    -------
    DataFrame
        The remaining rows with a fresh 0..n-1 index.
    """
    # drop=True throws the old index away directly. Turning it into a column and
    # dropping 'index' by name would delete a real column called 'index' and
    # would raise when the index carries a different name.
    return train.dropna(subset=subset).reset_index(drop=True)


# Builds a dummy-encoded dataframe from the non-numeric features
def cat_of_dummies(train):
    """Return `pd.get_dummies` applied to the non-numeric columns of `train`."""
    return pd.get_dummies(train.select_dtypes(exclude='number'))

def process_cat_of_dummies(train,test):
    '''
    Build the dummy-encoded categorical frames for train and test.

    train, test - raw dataframes; their non-numeric columns are dummy-encoded.

    Returns (train_cat, test_cat):
      train_cat - the dummy columns that exist in both frames, plus the 'sii'
                  column taken from the global Target_series, with rows lacking
                  'sii' removed and the index renumbered
      test_cat  - all dummy columns of the test frame (not restricted)
    '''
    # Create a train and a test sets containing only categorical
    train_cat = cat_of_dummies(train)
    test_cat = cat_of_dummies(test)

    # Keep only the columns that appear in both train and test. Walking train's
    # columns (instead of intersecting two sets) keeps the column order identical
    # from run to run; set iteration order of strings changes between kernels.
    test_columns = set(test_cat.columns)
    common_cols = [col for col in train_cat.columns if col in test_columns]
    train_cat = train_cat[common_cols]

    train_cat = concat_df_a_target(train_cat,Target_series)
    train_cat = drop_Nans(train_cat,"sii")

    return train_cat, test_cat

train_cat, test_cat = process_cat_of_dummies(train, test) 

In [None]:
# Keep the highest-scoring dummy columns of the train set
train_cat_best = Kbest(train_cat)
# Restrict the test dummies to exactly the columns selected for train
test_cat_best = test_cat.loc[:, train_cat_best.columns]

<h1>Preprocessing action</h1>
<p>In this section I create all the necessary functions for the feature engineering part of the tabular dataset.</p>

In [None]:
def feature_difference(train, test):
    """Return the set of column names found in `train` but not in `test`."""
    return set(train.columns).difference(test.columns)


def cap_outliers(train, columns, method='iqr', threshold=1.5):
    '''
    Clip each requested column to [Q1 - threshold*IQR, Q3 + threshold*IQR].

    Only the columns listed in `columns` are touched, and the target column
    'sii' is always skipped so that capping can never alter the labels.
    The quartiles are computed on `train`; a clipped copy is returned and
    `train` itself is left unchanged.

    Note: `method` is accepted but not consulted by the code below.
    '''

    capped = train.copy()

    for name in columns:
        # Never touch the target, even when it is listed explicitly
        if name == 'sii':
            continue

        series = train[name]
        q_low = series.quantile(0.25)
        q_high = series.quantile(0.75)
        spread = q_high - q_low

        capped[name] = np.clip(series, q_low - threshold * spread, q_high + threshold * spread)

    return capped


# Optional domain-knowledge clean-up (not called anywhere in the visible notebook)
def correct_outliers_dk(df):
    """Raise implausibly low physical measurements to fixed floors on a copy of `df`.

    After flooring, rows whose diastolic pressure exceeds the systolic pressure
    get the two values swapped. `df` itself is not modified.
    """
    fixed = df.copy()

    # (column, floor, True when a value equal to the floor is also rewritten)
    floors = (
        ('Physical-BMI', 7, True),
        ('Physical-Weight', 35, True),
        ('Physical-Diastolic_BP', 35, False),
        ('Physical-Systolic_BP', 65, False),
        ('Physical-HeartRate', 45, False),
    )
    for column, floor, inclusive in floors:
        if inclusive:
            too_low = fixed[column] <= floor
        else:
            too_low = fixed[column] < floor
        fixed.loc[too_low, column] = floor

    # Diastolic above systolic is treated as the two readings being exchanged
    diastolic, systolic = 'Physical-Diastolic_BP', 'Physical-Systolic_BP'
    inverted = fixed[diastolic] > fixed[systolic]
    fixed.loc[inverted, [diastolic, systolic]] = fixed.loc[inverted, [systolic, diastolic]].values

    return fixed

def handle_outliers(train): 
    '''
    Cap statistical outliers in every numeric column of `train`.

    train - the train dataframe set

    The capping is delegated to cap_outliers with its default settings; the
    summary statistics of the capped frame are displayed and the capped copy
    is returned.
    '''

    numeric_columns = train.select_dtypes(include='number').columns
    capped = cap_outliers(train, numeric_columns)
    display(capped.describe())

    return capped


def high_correlation_pairs(train, threshold=0.95):
    """List the numeric feature pairs whose |Pearson correlation| exceeds `threshold`.

    The pairs are printed and returned as a DataFrame with the columns
    'Feature 1', 'Feature 2' and 'Correlation'.
    """
    corr = train.select_dtypes(include='number').corr("pearson")

    # True strictly above the diagonal, so each unordered pair appears once
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)

    # Long format: one row per (row feature, column feature) kept by the mask
    pairs = corr.where(upper).stack().reset_index()
    pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

    # Strong positive and strong negative correlations both qualify
    strong = pairs[pairs['Correlation'].abs() > threshold]

    print("Highly correlated feature pairs (|correlation| > {}):".format(threshold))
    print(strong.to_string(index=False))

    return strong


# Find features whose absolute correlation with the target is below the threshold
def low_correlated_features(train, target_column, threshold=0.1):
    """Return the labels of the columns with |corr(column, target_column)| < threshold."""
    abs_corr_with_target = train.corr()[target_column].abs()
    is_weak = abs_corr_with_target < threshold
    return abs_corr_with_target[is_weak].index

In [None]:
def feature_engineering(df,tst):
    '''
        Clean the numeric part of the data and make sure it is ready for modeling.
        input:
            df: the train dataframe
            tst: the test dataframe
        output:
            (train, test) - cleaned copies; the inputs are not modified.
            train keeps the 'sii' target column (taken from the global
            Target_series) and only rows where 'sii' is present.
    '''

    train = df.copy()
    test= tst.copy()
    ### Drop the columns that exist only in train (later add the target series as it was dropped)
    cols_diff = feature_difference(train, test)
    train = train.drop(list(cols_diff),axis=1)
    train = concat_df_a_target(train, Target_series)

    display(train)
    ### Feature creation
    #feature_creation(train,test)

    ### Handle missing values
    # drop=True discards the old index directly instead of creating an 'index'
    # column that then has to be dropped by name
    train = train.dropna(subset=['sii']).reset_index(drop=True)

    ### Drop high correlation pairs
    high_corr_pairs = high_correlation_pairs(train)
    # Take the second feature from each pair and drop it from both frames.
    # The target must never be removed: it is needed for training and does not
    # exist in test, where dropping it would raise a KeyError.
    features_to_remove = [
        feature for feature in high_corr_pairs['Feature 2'].unique() if feature != 'sii'
    ]
    train = train.drop(features_to_remove, axis=1)
    test = test.drop(features_to_remove, axis=1)

    ### Drop features with low correlation to target
    low_corr_cols = low_correlated_features(train.select_dtypes(include='number'),'sii')
    train = train.drop(low_corr_cols,axis=1)
    test = test.drop(low_corr_cols,axis=1)

    ### Deal with outliers (train only)
    train = handle_outliers(train)

    ### Drop categorical columns
    cat_cols = train.select_dtypes(exclude='number').columns
    train = train.drop(cat_cols,axis=1)
    test = test.drop(cat_cols,axis=1)

    return train, test 

In [None]:
# Run the numeric clean-up; feature_engineering works on copies of both frames
train_processed, test_processed = feature_engineering(df=train, tst=test)

<h2>Combine sets</h2>
Combine the categorical features dataset with the numerical features dataset

In [None]:
# Join the numerical frame and the selected categorical dummies side by side
# (rows are matched on the index, which is kept as is)
train = pd.concat([train_processed, train_cat_best], axis="columns")
test = pd.concat([test_processed, test_cat_best], axis="columns")

<h1> Evaluation </h1>
Evaluate the model using the quadratic weighted kappa (QWK) metric

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between `y_true` and `y_pred`, computed with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Turn continuous predictions into the labels 0-3 using three cut points.

    A value gets the label of the first cut point it is strictly below
    (checked in the order thresholds[0], thresholds[1], thresholds[2]);
    anything that is below none of them gets 3.
    """
    below_cut = [
        oof_non_rounded < thresholds[0],
        oof_non_rounded < thresholds[1],
        oof_non_rounded < thresholds[2],
    ]
    # np.select takes the first matching condition, like the nested np.where chain
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated QWK of the thresholded predictions.

    The sign is flipped so that a minimizer ends up maximizing the kappa.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, labels)
    return -kappa


def TrainML(model_class, test_data):
    """Cross-validate a regressor on the global `train` frame and build a submission.

    Parameters
    ----------
    model_class : estimator
        Despite the name this is an estimator instance: it is passed to
        `clone` once per fold and the clone is fitted.
    test_data : DataFrame
        Features to predict for the submission; passed to `model.predict`.

    Returns
    -------
    DataFrame
        Columns 'id' (from the global `sample`) and 'sii' (thresholded mean of
        the per-fold test predictions).

    Raises
    ------
    AssertionError
        If the threshold optimizer does not report success.

    Notes
    -----
    Reads the module-level names `train`, `sample`, `n_splits` and `random_state`.
    """
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Per-fold QWK scores on the training part and on the held-out part
    train_S = []
    test_S = []
    
    # Out-of-fold predictions: every row is filled by the fold that held it out
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    # One column of test predictions per fold; averaged after the loop
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # Fresh, unfitted copy so that folds do not share fitted state
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        # Plain rounding is used only for the per-fold report below
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        # Clears the cell output so that the per-fold lines do not accumulate
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Tune the three cut points on the out-of-fold predictions, starting from
    # the plain rounding boundaries; evaluate_predictions returns the negated QWK
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the fold models' test predictions, then apply the tuned cut points
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    # NOTE(review): assumes the rows of test_data are in the same order as sample['id'] - confirm
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [None]:
# Base learners, created with library defaults apart from the verbosity settings
Light = LGBMRegressor(verbose=-1)
XGB_Model = XGBRegressor()
CatBoost_Model = CatBoostRegressor(verbose=False)

# Ensemble of the three boosted-tree regressors
voting_model = VotingRegressor(
    estimators=[
        ('lightgbm', Light),
        ('xgboost', XGB_Model),
        ('catboost', CatBoost_Model),
    ]
)

# Cross-validate the ensemble and build the submission frame
Submission = TrainML(model_class=voting_model, test_data=test)

Submission

In [None]:
# Write the submission file without the dataframe index
Submission.to_csv(path_or_buf="submission.csv", index=False)