In [12]:
import tensorflow as tf

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, StratifiedKFold

from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.dictionary_based import MUSE

In [2]:
print("TensorFlow v" + tf.__version__)
# get working directory and remove last folder
wd = os.path.dirname(os.getcwd())
os.chdir(wd)
print("Working Directory: ", os.getcwd())

TensorFlow v2.11.1
Working Directory:  /Users/nzuchna/Desktop/Drive/2. Areas/University/Master-TUC/M2/4_Forschungsmodul/student-performance


# Load the raw data

In [3]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('data/raw/train.csv', dtype=dtypes)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (26296946, 20)


In [4]:
# Display the first 5 examples
dataset_df.head(5)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


# Load the labels

In [5]:
labels = pd.read_csv('data/raw/train_labels.csv')

labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

# Display the first 5 examples
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


# Pre processing: Feature engineering

In [6]:
CATEGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y',
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [7]:
dataset_df = feature_engineer(dataset_df)
print("Full prepared dataset shape is {}".format(dataset_df.shape))

Full prepared dataset shape is (70686, 22)


In [8]:
def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset_df.index.unique()
    split = int(len(USER_LIST) * (1 - test_ratio))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


# Training

In [9]:
# Fetch the unique list of user sessions in the validation dataset. We assigned
# `session_id` as the index of our feature engineered dataset. Hence fetching
# the unique values in the index column will give us a list of users in the
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is:
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s.
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [17]:
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected level group.
    X_train = train_x.loc[train_x.level_group == grp]
    train_users = X_train.index.values
    X_val = valid_x.loc[valid_x.level_group == grp]
    valid_users = X_val.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.

    #train_df["correct"] = train_labels["correct"]
    #valid_df["correct"] = valid_labels["correct"]

    y_train = train_labels["correct"]
    y_val = valid_labels["correct"]

    # There's one more step required before we can train the model.
    # We need to convert the datatset from Pandas format (pd.DataFrame)
    # into TensorFlow Datasets format (tf.data.Dataset).
    # TensorFlow Datasets is a high performance data loading library
    # which is helpful when training neural networks with accelerators like GPUs and TPUs.
    # We are omitting `level_group`, since it is not needed for training anymore.

    #train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df.loc[:, train_df.columns != 'level_group'], label="correct")
    #valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df.loc[:, valid_df.columns != 'level_group'], label="correct")

    # We will now create the Gradient Boosted Trees Model with default settings.
    # By default the model is set to train for a classification task.

    #gbtm = tfdf.keras.GradientBoostedTreesModel(verbose=0)
    #gbtm.compile(metrics=["accuracy"])

    # Train the model.

    #gbtm.fit(x=train_ds)

    # Store the model

    #models[f'{grp}_{q_no}'] = gbtm

    rocket1 = RocketClassifier()

    rocket1.fit(X_train, y_train)

    rocket_preds = rocket1.predict(X_val)
    accuracy_temp = metrics.accuracy_score(y_val, rocket_preds)
    print(str(q_no) + " - Rocket Accuracy: " + str(accuracy_temp))
    print(str(q_no) + " - Rocket Accuracy: " + str(metrics.f1_score(y_val, rocket_preds)))

    # Evaluate the trained model on the validation dataset and store the
    # evaluation accuracy in the `evaluation_dict`.

    #inspector = gbtm.make_inspector()
    #inspector.evaluation()
    #evaluation = gbtm.evaluate(x=valid_ds,return_dict=True)
    evaluation_dict[q_no] = accuracy_temp

    # Use the trained model to make predictions on the validation dataset and
    # store the predicted values in the `prediction_df` dataframe.

    #predict = gbtm.predict(x=valid_ds)
    prediction_df.loc[valid_users, q_no-1] = rocket_preds.flatten() # WARNING: .flatten compatible?

### q_no 1 grp 0-4


TypeError: X is not of a supported input data type.X must be in a supported mtype format for Panel, found <class 'pandas.core.frame.DataFrame'>Use datatypes.check_is_mtype to check conformance with specifications.

In [18]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(18849, 22) (18849,) (4713, 22) (4713,)


In [None]:
for name, value in evaluation_dict.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

In [None]:
%%time
cv3 = StratifiedShuffleSplit(n_splits=6, train_size=0.75, random_state=42)
cv_results3 = cross_val_score(rocket1, X=X1, y=y1, cv=cv3)

print(cv_results3)
print(str(cv_results3.mean()) + " +/-" + str(cv_results3.std()))
rocket_preds3 = rocket1.predict(X_test1)
print("Rocket Accuracy: " + str(metrics.accuracy_score(y_test1, rocket_preds3)))

In [None]:
disp = ConfusionMatrixDisplay.from_predictions(y_test1, rocket_preds, display_labels=rocket1.classes_, xticks_rotation='45')
fig, ax = plt.subplots(figsize=(10,10))
disp.plot(ax=ax)
plt.show()
disp.ax_.set_title('The Rocket Algorithm')
print('The Rocket Algorithm')
print(disp.confusion_matrix)
plt.show()