## Performance Prediction model

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
with open("/content/quiz_1_features.json", "r") as file:
    features_data = json.load(file)

In [None]:
df = pd.DataFrame(features_data)

In [None]:
print(df.head())

  user_id question_id                             category difficulty  \
0    U001        Q060  Road Safety and Accident Prevention     Medium   
1    U001        Q032            Driving Ethics and Safety       Hard   
2    U001        Q059  Road Safety and Accident Prevention       Hard   
3    U001        Q029        Traffic Rules and Regulations       Easy   
4    U001        Q026        Traffic Rules and Regulations       Easy   

   difficulty_numeric  overall_accuracy  category_performance  \
0                   2             0.325              0.272727   
1                   3             0.325              0.333333   
2                   3             0.325              0.272727   
3                   1             0.325              0.333333   
4                   1             0.325              0.333333   

   importance_weight  is_correct  
0                  2       False  
1                  1       False  
2                  2       False  
3                  2       Fal

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               2000 non-null   object 
 1   question_id           2000 non-null   object 
 2   category              2000 non-null   object 
 3   difficulty            2000 non-null   object 
 4   difficulty_numeric    2000 non-null   int64  
 5   overall_accuracy      2000 non-null   float64
 6   category_performance  2000 non-null   float64
 7   importance_weight     2000 non-null   int64  
 8   is_correct            2000 non-null   bool   
dtypes: bool(1), float64(2), int64(2), object(4)
memory usage: 127.1+ KB
None


In [None]:
X = df[["overall_accuracy", "category_performance", "importance_weight", "difficulty_numeric"]]
y = df["is_correct"].astype(int)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
print(y.value_counts())

is_correct
0    1493
1     507
Name: count, dtype: int64


In [None]:
# Resampling the minority class
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

print(y.value_counts())

is_correct
0    1493
1    1493
Name: count, dtype: int64


In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
params = {
    "objective": "binary",
    "metric": "binary_error",
    "boosting_type": "gbdt",
    "num_leaves": 15,
    "learning_rate": 0.2,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_data_in_leaf": 35,
    "lambda_l1": 1.0,
    "lambda_l2": 1.0,
    "verbose": 0,
    "n_jobs": -1,
    "random_state": 42,
}

model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=1000)



In [None]:
y_pred_train = model.predict(X_train)

y_pred_labels = [1 if prob >= 0.5 else 0 for prob in y_pred_train]

print(classification_report(y_train, y_pred_labels))
print("Train Accuracy:", accuracy_score(y_train, y_pred_labels))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91       893
           1       0.95      0.86      0.90       898

    accuracy                           0.91      1791
   macro avg       0.91      0.91      0.91      1791
weighted avg       0.91      0.91      0.91      1791

Train Accuracy: 0.9056393076493578


In [None]:
y_pred_val = model.predict(X_val)

y_pred_labels = [1 if prob >= 0.5 else 0 for prob in y_pred_val]

print(classification_report(y_val, y_pred_labels))
print("Val Accuracy:", accuracy_score(y_val, y_pred_labels))

              precision    recall  f1-score   support

           0       0.74      0.81      0.77       306
           1       0.78      0.69      0.73       291

    accuracy                           0.75       597
   macro avg       0.76      0.75      0.75       597
weighted avg       0.76      0.75      0.75       597

Val Accuracy: 0.7537688442211056


In [None]:
y_pred_test = model.predict(X_test)

y_pred_labels = [1 if prob >= 0.5 else 0 for prob in y_pred_test]

print(classification_report(y_test, y_pred_labels))
print("Test Accuracy:", accuracy_score(y_test, y_pred_labels))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77       294
           1       0.79      0.74      0.76       304

    accuracy                           0.77       598
   macro avg       0.77      0.77      0.77       598
weighted avg       0.77      0.77      0.77       598

Test Accuracy: 0.7692307692307693


In [None]:
with open("answer_prediction_model.pkl", "wb") as file:
    pickle.dump(model, file)