In [1]:
### Imports for the Data Preprocessing
import numpy as np
import pandas as pd
import matplotlib as plt
import os
import gc
from typing import Tuple
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix




In [2]:
# get working directory
wd = os.path.dirname(os.getcwd())
# wd = "N:\MASTER_DS\Code\Kaggle_competition\Kaggle-seminar\student-performance"
os.chdir(wd)
print("Working Directory: ", os.getcwd())

Working Directory:  n:\MASTER_DS\Code\Kaggle_competition\Kaggle-seminar\student-performance


In [3]:
from utils.loader_steve import load_data, load_labels , load_all_X_y

df, labels = load_all_X_y( data= "flattened", )
keys = list(df.keys())

dtypes_labels= {
    'correct': np.uint8, 
    'q':np.uint8}

lab_1 = pd.read_csv('data/processed/labels_q1-3.csv', dtype=dtypes_labels)
lab_2 = pd.read_csv('data/processed/labels_q4-13.csv', dtype=dtypes_labels)
lab_3= pd.read_csv('data/processed/labels_q14-18.csv', dtype=dtypes_labels)

labels_dict = { "0_4": lab_1,
        "5_12": lab_2,
        "13_22": lab_3
}

In [4]:
keys = list(df.keys())
labels_dict = {
    "0_4": lab_1,
    "5_12": lab_2,
    "13_22": lab_3
}

# Create an empty dictionary to store the output DataFrames
output_dict = {}

# Iterate over each key in the original dictionary
for key in keys:
    train_data = df[key]
    train_data = train_data.sort_values('session_id_1')
    labels = labels_dict[key]
    labels.rename(columns={'session': 'session_id_1'}, inplace=True)
    labels = labels.sort_values('session_id_1')
    merged_df = pd.merge(train_data, labels, on="session_id_1", how="inner")
    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if 'session' in col])
    grouped_df = merged_df.sort_values('q')
    
    # Group the merged DataFrame by the "q" column
    grouped_by_level = grouped_df.groupby('q')
    
    # Iterate over each unique level in the grouped DataFrame
    for level, group_data in grouped_by_level:
        # Create a unique key for the output dictionary
        output_key = f"{level}"
        # Drop the "level_group" and "q" columns from the group_data DataFrame
        group_data = group_data.drop(columns=["level_group_1", "q"])
        # Store the group_data DataFrame in the output dictionary
        output_dict[output_key] = group_data



In [5]:
from sklearn.model_selection import train_test_split

# Split ratio for train, test, and validation sets
train_ratio = 0.7
test_ratio = 0.2
val_ratio = 0.1

# Create dictionaries for train, test, and validation sets
train_dict = {}
test_dict = {}
val_dict = {}

# Iterate over each key in the output_dict
for key, data in output_dict.items():
    # Split data into train and remaining data (test + validation)
    data_train, data_remaining = train_test_split(data, test_size=(test_ratio + val_ratio), random_state=42)
    
    # Split remaining data into test and validation
    data_test, data_val = train_test_split(data_remaining, test_size=val_ratio / (test_ratio + val_ratio), random_state=42)
    
    # Store the data in respective dictionaries
    train_dict[key] = data_train
    test_dict[key] = data_test
    val_dict[key] = data_val


In [6]:
check = train_dict["1"]
vars_to_keep = ['train_dict', 'test_dict', 'val_dict']
all_vars = list(globals().keys())
vars_to_drop = [var for var in all_vars if var not in vars_to_keep]

for var in vars_to_drop:
    del globals()[var]

In [None]:
'''import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm  # for progress tracking

# Dictionary to store results
results_dict = {}
test_data_dict = {}
#index to stop early
index = 0
limit = 1
# Iterate over each key (model) in the output_dict
for key, data in tqdm(output_dict.items(), desc='GridSearch Progress', total=len(output_dict)):
    X = data.drop("correct", axis=1)  # Features
    y = data["correct"]  # Labels

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    test_data_dict[key] = {
        "X_test": X_test,
        "y_test": y_test
    }

    # Define hyperparameter values to search for each model
    param_grid = {
        'max_depth': [3, 6, 9],
        'learning_rate': [0.1, 0.01, 0.001],
        'n_estimators': [100, 200, 300],
        'gamma': [0, 1, 5],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'scale_pos_weight': [1, 5, 10]
    }

    # Create XGBoost classifier
    model = xgb.XGBClassifier()

    # Perform grid search with cross-validation for the current model
    grid_search = GridSearchCV(model, param_grid, cv=10, scoring='f1')
    grid_search.fit(X_train, y_train)

    # Get the grid search results for each hyperparameter combination
    cv_results = grid_search.cv_results_

    # Store results for the current model
    results_dict[key] = {
        "cv_results": cv_results,
        "best_params": grid_search.best_params_,
        "best_score": grid_search.best_score_,
        "f1_score": f1_score(y_test, grid_search.predict(X_test))
    }
    index += 1
    if index == limit:
        break
'''

In [8]:
import xgboost as xgb
import numpy as np
from hyperopt import fmin, hp, tpe, Trials
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd

# Define search space for hyperparameters
param_space = {
    'max_depth': hp.choice('max_depth', [3, 6, 9]),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1)),
    'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
    'gamma': hp.choice('gamma', [0, 1, 5]),
    'subsample': hp.uniform('subsample', 0.8, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.8, 1.0),
    'scale_pos_weight': hp.choice('scale_pos_weight', [1, 5, 10])
}

# Dictionary to store results
results_dict = {}
test_data_dict = {}
best_models = {}  # Dictionary to store best models

# Define the objective function for hyperopt
def objective(params):
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred)
    return -score  # maximize F1 score

# Iterate over each key (model) in the train_dict
for key, train_data in tqdm(train_dict.items()):
    val_data = val_dict[key]
    test_data = test_dict[key]

    X_train = train_data.drop("correct", axis=1)  # Features for training
    y_train = train_data["correct"]  # Labels for training

    X_val = val_data.drop("correct", axis=1)  # Features for validation
    y_val = val_data["correct"]  # Labels for validation
    
    X_test = test_data.drop("correct", axis=1)  # Features for testing
    y_test = test_data["correct"]  # Labels for testing

    # Run hyperparameter optimization using hyperopt
    trials = Trials()
    best_params = fmin(objective, param_space, algo=tpe.suggest, max_evals=100, trials=trials)

    # Fit the model with the best hyperparameters using train and validation data
    model = xgb.XGBClassifier(**best_params)
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])
    model.fit(X_train_val, y_train_val)

    # Evaluate on the test set
    y_pred = model.predict(X_test)
    best_score = f1_score(y_test, y_pred)

    # Store results for the current model
    results_dict[key] = {
        "best_params": best_params,
        "best_score": best_score
    }
    
    # Store the best model
    best_models[key] = model


  0%|          | 0/18 [00:00<?, ?it/s]

100%|██████████| 100/100 [14:53<00:00,  8.93s/trial, best loss: -0.8436421555614548]

  6%|▌         | 1/18 [14:53<4:13:04, 893.22s/it]


100%|██████████| 100/100 [07:29<00:00,  4.50s/trial, best loss: -0.9899292907649453]

 11%|█         | 2/18 [22:22<2:48:36, 632.26s/it]


100%|██████████| 100/100 [10:54<00:00,  6.54s/trial, best loss: -0.969590899146795]

 17%|█▋        | 3/18 [33:17<2:40:34, 642.28s/it]


100%|██████████| 100/100 [14:32<00:00,  8.72s/trial, best loss: -0.8942261038330908]


 22%|██▏       | 4/18 [47:49<2:51:02, 733.06s/it]

100%|██████████| 100/100 [17:02<00:00, 10.23s/trial, best loss: -0.7262826024648895]

 28%|██▊       | 5/18 [1:04:52<3:01:28, 837.56s/it]


100%|██████████| 100/100 [08:50<00:00,  5.31s/trial, best loss: -0.8793315310887196]

 33%|███▎      | 6/18 [1:13:42<2:26:38, 733.21s/it]


100%|██████████| 100/100 [15:19<00:00,  9.20s/trial, best loss: -0.8393166625402327]

 39%|███▉      | 7/18 [1:29:02<2:25:37, 794.31s/it]


100%|██████████| 100/100 [14:03<00:00,  8.43s/trial, best loss: -0.7641633728590251]

 44%|████▍     | 8/18 [1:43:06<2:14:59, 809.98s/it]


100%|██████████| 100/100 [09:57<00:00,  5.97s/trial, best loss: -0.8443775100401607]

 44%|████▍     | 8/18 [1:53:03<2:21:19, 847.94s/it]







XGBoostError: [20:20:53] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_colmaker.cc:177: Check failed: param_.max_depth > 0 (0 vs. 0) : exact tree method doesn't support unlimited depth.

In [9]:
# Iterate over each key-value pair in results_dict
for key, value in results_dict.items():
    best_score = value["best_score"]
    print(f"Model: {key}, Best Score: {best_score}")

Model: 1, Best Score: 0.0
Model: 2, Best Score: 0.0
Model: 3, Best Score: 0.0
Model: 4, Best Score: 0.0
Model: 5, Best Score: 0.7024438573315721
Model: 6, Best Score: 0.0
Model: 7, Best Score: 0.8466527964753396
Model: 8, Best Score: 0.0
