<a href="https://colab.research.google.com/github/LeechXDD/9417_Pro_Project/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel; -q keeps the install log out of the output.
%pip install -q torch transformers



In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import torch
import pandas as pd
import numpy as np
import tensorflow as tf

## Data preprocessing

In [None]:
# Column -> dtype mapping handed to `pd.read_csv` when loading train.csv.
# Numeric columns use compact numpy types; everything else is 'category'.
dtypes = dict(
    elapsed_time=np.int32,
    event_name='category',
    name='category',
    level=np.uint8,
    # Coordinates and hover time are read as 32-bit floats.
    room_coor_x=np.float32,
    room_coor_y=np.float32,
    screen_coor_x=np.float32,
    screen_coor_y=np.float32,
    hover_duration=np.float32,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen='category',
    hq='category',
    music='category',
    level_group='category',
)

In [None]:
from google.colab import drive

# Mount Google Drive so the data files under /content/drive are readable.
drive.mount('/content/drive')

# Single place to change where the unzipped data lives (previously this long
# path was repeated in every read_csv call).
DATA_DIR = '/content/drive/MyDrive/9417project/predict-student-performance-from-game-play.zip (Unzipped Files)'

# Raw event log, read with the explicit column dtypes defined above.
dataset_df = pd.read_csv(f'{DATA_DIR}/train.csv', dtype=dtypes)
# Label table; it is expected to carry `session_id` and `correct` columns,
# which later cells read.
labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Split each `session_id` string on '_' once and derive two integer columns:
#   session - the first piece parsed as an int
#   q       - the last piece with its first character dropped, parsed as an int
id_parts = labels['session_id'].str.split('_')
labels['session'] = id_parts.str[0].map(int)
labels['q'] = id_parts.str[-1].str[1:].map(int)

In [None]:
# Columns summarised by their number of distinct values per group.
CATEGORICAL = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Columns summarised by their mean and standard deviation per group.
NUMERICAL = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

In [None]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df, categorical=None, numerical=None):
    """Aggregate an event log into one feature row per (session_id, level_group).

    For every categorical column the number of distinct values is computed
    (column ``<name>_nunique``); for every numerical column the mean (column
    ``<name>``) and the standard deviation (column ``<name>_std``).

    Args:
        dataset_df: DataFrame with ``session_id`` and ``level_group`` columns
            plus the columns listed in ``categorical`` and ``numerical``.
        categorical: Column names to summarise with ``nunique``. Defaults to
            the module-level ``CATEGORICAL`` list.
        numerical: Column names to summarise with ``mean`` and ``std``.
            Defaults to the module-level ``NUMERICAL`` list.

    Returns:
        DataFrame indexed by ``session_id`` with a ``level_group`` column
        followed by the aggregated features. Missing aggregates (e.g. the std
        of a single-row group) are replaced by -1.
    """
    if categorical is None:
        categorical = CATEGORICAL
    if numerical is None:
        numerical = NUMERICAL

    # Build the grouping once and reuse it for every aggregate.
    # observed=True: `level_group` is read as a categorical column, and without
    # it pandas versions whose default is observed=False emit a row for every
    # (session_id, category) combination, including ones that never occur in
    # the data, which would then be filled with -1 below.
    grouped = dataset_df.groupby(['session_id', 'level_group'], observed=True)

    dfs = []
    for c in categorical:
        dfs.append(grouped[c].agg('nunique').rename(f'{c}_nunique'))
    for c in numerical:
        dfs.append(grouped[c].agg('mean'))
    for c in numerical:
        dfs.append(grouped[c].agg('std').rename(f'{c}_std'))

    features = pd.concat(dfs, axis=1)
    # Fill while the group keys are still in the index, so only the feature
    # columns are touched.
    features = features.fillna(-1)
    features = features.reset_index()
    features = features.set_index('session_id')
    return features

In [None]:
# Replace the raw event log with the aggregated features (one row per
# session_id / level_group pair, indexed by session_id).
# NOTE: this overwrites `dataset_df`; re-running this cell would feed the
# already-aggregated frame back into feature_engineer, so reload the CSV first.
dataset_df = feature_engineer(dataset_df)

In [None]:
def split_dataset(dataset, test_ratio=0.20):
    """Split a frame into train/validation parts by unique index value.

    The unique index values (session ids) are taken in order of first
    appearance; the first ``1 - test_ratio`` share goes to the training part
    and the remainder to the validation part, so all rows of one session stay
    together. No shuffling is performed.

    Args:
        dataset: DataFrame whose index identifies the session of each row.
        test_ratio: Fraction of unique sessions to hold out for validation.

    Returns:
        Tuple ``(train, valid)`` of DataFrames.
    """
    USER_LIST = dataset.index.unique()
    # Honour the caller's ratio (it used to be ignored in favour of a
    # hard-coded 0.20).
    split = int(len(USER_LIST) * (1 - test_ratio))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

# Hold out the last 20% of unique session ids (the default test_ratio) for
# validation. The split follows the order of the index; it is not randomised.
train_x, valid_x = split_dataset(dataset_df)

In [None]:
# `session_id` is the index of the feature-engineered frame, so the unique
# index values of the validation split are exactly the validation sessions.
VALID_USER_LIST = valid_x.index.unique()

# Table of predictions: one row per validation session (indexed by its
# `session_id`) and one column per question (18 in total). Every cell starts
# at zero and is overwritten once the corresponding question has been scored.
prediction_df = pd.DataFrame(
    data=np.zeros((len(VALID_USER_LIST), 18)),
    index=VALID_USER_LIST,
)

# Trained model per question, filled in by the training loop.
models = dict()

# Validation score per question, filled in by the training loop.
evaluation_dict = dict()

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame with a 'label' column into a batched tf.data.Dataset.

    The input frame is copied, so the caller's frame keeps its 'label' column.
    Each dataset element is a ``(features, label)`` pair where ``features`` is
    a dict keyed by the remaining column names.

    Args:
        dataframe: Source frame; must contain a 'label' column (the target).
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        The batched (and optionally shuffled) dataset.
    """
    features = dataframe.copy()
    targets = features.pop('label')  # target column is removed from the features
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Iterate through questions 1 to 18 to train a model for each question,
# evaluate the trained model and store the predicted values.
for q_no in range(1, 19):
    # Select level group for the question based on the q_no.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    else:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected level group.
    # .copy() gives independent frames, so adding the label column below does
    # not write into a slice of train_x / valid_x (SettingWithCopyWarning).
    train_df = train_x.loc[train_x.level_group == grp].copy()
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp].copy()
    valid_users = valid_df.index.values

    # Select the labels for the related q_no, indexed by session so they can
    # be looked up (and aligned) by the session ids of each split.
    question_labels = labels.loc[labels.q == q_no].set_index('session')
    train_labels = question_labels.loc[train_users]
    valid_labels = question_labels.loc[valid_users]

    # Add the label to the filtered datasets (aligned on the session index).
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

    # Drop the 'correct' and 'level_group' columns from the features.
    train_features = train_df.drop(columns=["correct", "level_group"])
    valid_features = valid_df.drop(columns=["correct", "level_group"])

    # Render every feature value as text and join each row into one
    # space-separated string, which is what the tokenizer consumes.
    train_sequences = train_features.astype(str).apply(lambda x: ' '.join(x), axis=1)
    valid_sequences = valid_features.astype(str).apply(lambda x: ' '.join(x), axis=1)

    # Encode the text sequences. This converts the text into tokens according
    # to the tokenizer's vocabulary and returns input ids and attention masks.
    train_encodings = tokenizer(train_sequences.tolist(), truncation=True, padding=True, max_length=512)
    valid_encodings = tokenizer(valid_sequences.tolist(), truncation=True, padding=True, max_length=512)

    # Convert our labels into numpy arrays.
    train_labels = train_df['correct'].to_numpy()
    valid_labels = valid_df['correct'].to_numpy()

    # Convert our encodings and labels into TensorFlow Dataset objects.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
        train_labels
    ))
    valid_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': valid_encodings['input_ids'], 'attention_mask': valid_encodings['attention_mask']},
        valid_labels
    ))

    # Shuffle and batch the datasets. The shuffle buffer spans the whole
    # training set; a small fixed buffer only mixes neighbouring rows.
    batch_size = 8
    train_dataset = train_dataset.shuffle(max(len(train_labels), 1)).batch(batch_size)
    valid_dataset = valid_dataset.batch(batch_size)

    # Load pre-trained model with a two-class classification head.
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Train the model.
    # - The head emits two raw logits per example (see the softmax below) and
    #   the labels are integer class ids, so the matching loss is sparse
    #   categorical cross-entropy on logits, not 'binary_crossentropy'.
    # - The accuracy metric is spelled out explicitly so it is computed as
    #   class-id accuracy and still reported under the 'accuracy' key.
    # - A small learning rate is used for fine-tuning; Keras' default Adam
    #   rate (1e-3) is far too large for pre-trained BERT weights.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
    )
    model.fit(train_dataset, epochs=3, validation_data=valid_dataset)

    # NOTE(review): this keeps one full BERT model per question alive in
    # memory; consider saving to disk instead if memory becomes a problem.
    models[f'{grp}_{q_no}'] = model

    # Evaluate the trained model on the validation dataset.
    evaluation = model.evaluate(valid_dataset, return_dict=True)
    evaluation_dict[q_no] = evaluation["accuracy"]

    # Use the trained model to make predictions on the validation dataset.
    predictions = model.predict(valid_dataset)
    # The model returns logits, so apply a softmax to get probabilities and
    # convert to a numpy array before writing into the DataFrame.
    probabilities = tf.nn.softmax(predictions.logits, axis=-1).numpy()

    # Store the predicted probabilities for the positive class in the
    # `prediction_df` dataframe (column q_no-1 holds question q_no).
    prediction_df.loc[valid_users, q_no-1] = probabilities[:, 1]



### q_no 1 grp 0-4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["correct"] = train_labels["correct"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df["correct"] = valid_labels["correct"]
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and in

Epoch 1/3


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7fd0112581f0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7fd0112581f0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
  46/2357 [..............................] - ETA: 10:24:47 - loss: 7.5881 - accuracy: 0.6168

In [None]:
# Per-question validation accuracy.
for name, value in evaluation_dict.items():
    print(f"question {name}: accuracy {value:.4f}")

# Average over the questions that were actually evaluated rather than a fixed
# 18, so an interrupted training loop does not deflate the figure. max(..., 1)
# keeps the empty case at 0 instead of dividing by zero.
n_evaluated = max(len(evaluation_dict), 1)
print("\nAverage accuracy", sum(evaluation_dict.values()) / n_evaluated)

In [None]:
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')