In [1]:
### Imports for the data preprocessing and modelling
# Standard library
import gc
import os
from typing import Tuple

# Third-party
# `plt` must be the pyplot module: `import matplotlib as plt` binds the
# top-level package, which has no plot()/figure()/show() functions.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


In [2]:
# Make the project root the working directory.
#
# Later cells import `utils.loader_steve` and read `data/processed/...` with
# relative paths, so the working directory must be the folder that contains
# both `utils/` and `data/`.
def find_project_root(start: str) -> str:
    """Return the nearest directory at or above ``start`` that holds the project.

    A directory counts as the project root when it contains both a ``utils``
    and a ``data`` sub-directory. Searching upward (instead of blindly taking
    the parent) keeps this cell idempotent: re-running it after the working
    directory has already been changed does not climb one level further.

    Parameters
    ----------
    start : str
        Directory from which the upward search begins.

    Returns
    -------
    str
        Absolute path of the project root, or the parent of ``start`` when no
        ancestor matches (the behaviour this cell had before the search was
        added).
    """
    origin = os.path.abspath(start)
    candidate = origin
    while True:
        if all(os.path.isdir(os.path.join(candidate, name)) for name in ("utils", "data")):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the filesystem root without finding the markers.
            return os.path.dirname(origin)
        candidate = parent


wd = find_project_root(os.getcwd())
os.chdir(wd)
print("Working Directory: ", os.getcwd())

Working Directory:  n:\MASTER_DS\Code\Kaggle_competition\Kaggle-seminar\student-performance


In [24]:
from utils.loader_steve import load_data, load_labels, load_all_X_y

# Load the "flattened" feature set together with the labels the loader returns.
# NOTE(review): `df` is used below as a mapping from level-group key to a
# DataFrame (`df.keys()`, `df[key]`); confirm against `load_all_X_y`.
df, labels = load_all_X_y(data="flattened")
keys = list(df.keys())

# Compact dtypes for the two label columns.
dtypes_labels = {'correct': np.uint8, 'q': np.uint8}

# One label file per level group, read in a fixed order.
lab_1, lab_2, lab_3 = (
    pd.read_csv(label_path, dtype=dtypes_labels)
    for label_path in (
        'data/processed/labels_q1-3.csv',
        'data/processed/labels_q4-13.csv',
        'data/processed/labels_q14-18.csv',
    )
)

# Label frame for each level-group key.
labels_dict = dict(zip(("0_4", "5_12", "13_22"), (lab_1, lab_2, lab_3)))

In [26]:
# Build one training table per question.
#
# For every level group the feature frame is joined with its label frame on
# the session id, the session columns are dropped, and the result is split by
# the question number `q`. Each piece is stored in `output_dict` under
# "<level group key>_<q>" (e.g. "0_4_1").
keys = list(df.keys())
labels_dict = {
    "0_4": lab_1,
    "5_12": lab_2,
    "13_22": lab_3
}

# Maps "<level group key>_<q>" -> DataFrame holding the rows of that question.
output_dict = {}

for key in keys:
    train_data = df[key].sort_values('session_id_1')

    # `rename` without `inplace` returns a new frame, so the shared label
    # frames (lab_1/lab_2/lab_3) keep their original column names and this
    # cell gives the same result however often it is re-run. A dedicated local
    # name also avoids overwriting the `labels` object returned by the loader.
    group_labels = (
        labels_dict[key]
        .rename(columns={'session': 'session_id_1'})
        .sort_values('session_id_1')
    )

    # Inner join: keep only sessions present in both features and labels.
    merged_df = pd.merge(train_data, group_labels, on="session_id_1", how="inner")

    # Drop every column whose name contains "session" so the session id
    # cannot be used as a feature.
    session_columns = [col for col in merged_df.columns if 'session' in col]
    merged_df = merged_df.drop(columns=session_columns)

    # `level` is the value of the `q` column, i.e. the question number.
    for level, group_data in merged_df.sort_values('q').groupby('q'):
        output_dict[f"{key}_{level}"] = group_data



{'0_4_1':       level_group_1  event_name_1  name_1  fqid_1  room_fqid_1  text_fqid_1  \
0               0-4             6       3       5            1            6   
35369           0-4             6       3       7            1            8   
35364           0-4             5       3       4            1            4   
35361           0-4             6       3       5            1            5   
35360           0-4             5       3       5            1            5   
...             ...           ...     ...     ...          ...          ...   
172             0-4             5       3       5            1            6   
5504            0-4             7       3       5            1            6   
316             0-4             5       3       5            1            6   
16653           0-4             5       3       5            1            5   
5722            0-4             7       3       6            1            7   

       fullscreen_1  hq_1  music_1  hover

In [29]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd

# Tune, fit and evaluate one XGBoost classifier per entry of `output_dict`
# (one entry per question, as produced by the previous cell).

# Columns that must not be fed to the model: the question id, the target and
# the level-group identifier.
NON_FEATURE_COLUMNS = ['q', 'correct', 'level_group_1']

# Maps group key -> best hyperparameters, CV score, fitted model and metrics.
results = {}

# Hyperparameters to be tuned.
# NOTE(review): this grid has 3*3*3*3*2*2*3 = 972 combinations, i.e. 4860 fits
# per group with 5-fold CV. The recorded run of this cell ended in a
# KeyboardInterrupt; consider a smaller grid, RandomizedSearchCV or `n_jobs`.
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 5, 10]
}

for group_key, group_df in output_dict.items():
    # Each entry already holds the rows of a single question, so features and
    # target can be taken directly; no further grouping by `q` is needed.
    X_group = group_df.drop(columns=NON_FEATURE_COLUMNS)
    y_group = group_df['correct']

    # Stratified folds keep the class ratio of `correct` in every split.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    model = xgb.XGBClassifier(random_state=42)

    # Exhaustive search over `param_grid`, scored by macro-averaged F1.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='f1_macro')
    grid_search.fit(X_group, y_group)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # With the default `refit=True`, GridSearchCV has already refitted the
    # best configuration on the full data, so a second fit is unnecessary.
    best_model = grid_search.best_estimator_

    # Evaluate this group's model on this group's data, inside the same
    # iteration, so the metrics can never be paired with another group's
    # model or data.
    # NOTE: these are training-set metrics and therefore optimistic; the
    # cross-validated 'Best F1 Score' is the honest estimate.
    y_pred = best_model.predict(X_group)

    results[group_key] = {
        'Best Params': best_params,
        'Best F1 Score': best_score,
        'Model': best_model,
        'F1 Score': f1_score(y_group, y_pred),
        'Precision': precision_score(y_group, y_pred),
        'Recall': recall_score(y_group, y_pred),
        'Confusion Matrix': confusion_matrix(y_group, y_pred),
    }

# Print the evaluation metrics and best hyperparameters once, after all
# groups have been processed.
for group_key, metrics in results.items():
    print('Group:', group_key)
    print('Best Params:', metrics['Best Params'])
    print('Best F1 Score:', metrics['Best F1 Score'])
    print('\n')

KeyboardInterrupt: 