In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import lightgbm as lightgbm
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
# Read the three CSV inputs: the training data and the public/private test splits.
train, test_public, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "task1_allData.csv",
        "task1_public_test_data.csv",
        "task1_private_test_data.csv",
    )
)

In [4]:
# Fold the public test rows into the private test frame. No `on=` is given,
# so pandas joins on every column name the two frames have in common.
test = test.merge(test_public, how="outer")

In [5]:
# Split each frame into the label (IsCorrect) and the feature matrix; AnswerId
# is dropped together with the label so it is not used as a feature.
y_train, x_train = train["IsCorrect"], train.drop(columns=["IsCorrect", "AnswerId"])
y_test, x_test = test["IsCorrect"], test.drop(columns=["IsCorrect", "AnswerId"])

In [6]:
# Columns cast to pandas `category` dtype before they are handed to LightGBM.
categorical_features = [
    "UserId",
    "QuestionId",
    "QuizId",
    "GroupId",
    "SchemeOfWorkId",
    "Gender",
    "PremiumPupil",
    "Confidence",
    "MathType",
    "SchoolShift",
]

# Apply the same cast to the train and test feature matrices.
for frame in (x_train, x_test):
    for feature in categorical_features:
        frame[feature] = frame[feature].astype("category")

In [9]:
# Hyper-parameters unpacked directly into `lightgbm.LGBMClassifier(**params)`.
# Values are plain scalars: the constructor expects one value per parameter,
# whereas one-element lists are the grid format used by search utilities such
# as GridSearchCV and only work here by accident of LightGBM's serialization.
params = {
    # "max_bin": 1500,        # alternative setting, currently disabled
    "num_leaves": 600,
    "learning_rate": 0.1,     # 0.15 was also tried
    # "n_estimators": 600,    # alternative setting, currently disabled
    "max_depth": 18,
    "n_jobs": 4,
    "verbose": 2,             # values above 1 enable LightGBM's [Debug] log lines
}

In [7]:
# Baseline: fit on the training set as-is, then score on the merged test set.
model = lightgbm.LGBMClassifier(**params).fit(x_train, y_train)

y_predict = model.predict(x_test)
print('private ', accuracy_score(y_true=y_test, y_pred=y_predict))

[LightGBM] [Info] Number of positive: 10202239, number of negative: 5665611
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.914306
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.191397
[LightGBM] [Debug] init for col-wise cost 0.196562 seconds, init for row-wise cost 0.838038 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 58105
[LightGBM] [Info] Number of data points in the train set: 15867850, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.642950 -> initscore=0.588192
[LightGBM] [Info] Start training from score 0.588192
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree wi

In [9]:
# Per-class precision / recall / F1 for the most recent predictions.
# `target_names` supplies the display names for the two IsCorrect classes.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_predict,
        target_names=["Wrong Answer", "Correct Answer"],
    )
)

                precision    recall  f1-score   support

  Wrong Answer       0.65      0.59      0.62   1415961
Correct Answer       0.78      0.83      0.80   2551002

      accuracy                           0.74   3966963
     macro avg       0.72      0.71      0.71   3966963
  weighted avg       0.74      0.74      0.74   3966963



In [10]:
# Write the predictions out as a single-column CSV (header row, no index column).
df = pd.DataFrame({"LightGBM": y_predict})
df.to_csv("LGBM1.csv", index=False, header=True)

In [6]:
import pickle

# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises part-way through.
with open("LightGBM.sav", "wb") as model_file:
    pickle.dump(model, model_file)

____

Undersampling & Oversampling Data Balancing Techniques

______

In [13]:
# Columns cast to pandas `category` dtype before they are handed to LightGBM.
categorical_features = [
    "UserId",
    "QuestionId",
    "QuizId",
    "GroupId",
    "SchemeOfWorkId",
    "Gender",
    "PremiumPupil",
    "Confidence",
    "MathType",
    "SchoolShift",
]

# Cast directly on the full train/test frames (labels included) so that the
# resampled frames built from them carry the categorical dtypes.
for frame in (train, test):
    for feature in categorical_features:
        frame[feature] = frame[feature].astype("category")

In [15]:
# Undersampling: keep every IsCorrect == 0 row and draw an equally sized random
# subset of the IsCorrect == 1 rows, giving a 50/50 training frame.
ZerosData = train[train["IsCorrect"] == 0]
OnesData = train[train["IsCorrect"] == 1]

# Sampling without replacement requires at least as many 1-rows as 0-rows;
# `random_state` makes the draw reproducible across kernel restarts.
OnesDataSample = OnesData.sample(n=ZerosData.shape[0], random_state=42)

# Stack the two subsets row-wise. `pd.concat` is the direct way to build a
# union of rows; an outer merge on all columns would perform a full join over
# every column just to reach the same set of rows.
undersampleTrain = pd.concat([ZerosData, OnesDataSample], ignore_index=True)

In [16]:
# Refit with the same hyper-parameters on the undersampled frame and score it
# against the same test set as the baseline.
model = lightgbm.LGBMClassifier(**params).fit(
    undersampleTrain.drop(columns=["IsCorrect", "AnswerId"]),
    undersampleTrain["IsCorrect"],
)

y_predict = model.predict(x_test)
print('Undersampling ', accuracy_score(y_true=y_test, y_pred=y_predict))

[LightGBM] [Info] Number of positive: 5665611, number of negative: 5665611
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.912742
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.191343
[LightGBM] [Debug] init for col-wise cost 0.110061 seconds, init for row-wise cost 0.647257 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 57913
[LightGBM] [Info] Number of data points in the train set: 11331222, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trai

In [34]:
# Oversampling: duplicate randomly chosen IsCorrect == 0 rows until both
# classes have the same number of rows.
ZerosData = train[train["IsCorrect"] == 0]
OnesData = train[train["IsCorrect"] == 1]

# Number of extra 0-rows needed to match the 1-row count.
diff = OnesData.shape[0] - ZerosData.shape[0]

# Draw with replacement only when more duplicates are required than there are
# distinct 0-rows (sampling without replacement would raise in that case).
# `random_state` makes the draw reproducible across kernel restarts.
ZerosSample = ZerosData.sample(
    n=diff,
    replace=diff > ZerosData.shape[0],
    random_state=42,
)

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and likewise keeps the original index labels.
overersampleTrain = pd.concat([train, ZerosSample])

In [39]:
# Shuffle all rows (frac=1 returns every row in random order) so the appended
# duplicates are not clustered at the end; seeded so a re-run gives the same order.
overersampleTrain = overersampleTrain.sample(frac=1, random_state=42)

In [40]:
# Refit with the same hyper-parameters on the oversampled frame and score it
# against the same test set as the baseline.
model = lightgbm.LGBMClassifier(**params).fit(
    overersampleTrain.drop(columns=["IsCorrect", "AnswerId"]),
    overersampleTrain["IsCorrect"],
)

y_predict = model.predict(x_test)
print('Oversampling ', accuracy_score(y_true=y_test, y_pred=y_predict))

[LightGBM] [Info] Number of positive: 10202239, number of negative: 10202239
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.913000
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.191268
[LightGBM] [Debug] init for col-wise cost 0.196245 seconds, init for row-wise cost 1.121236 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 57674
[LightGBM] [Info] Number of data points in the train set: 20404478, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 600 and depth = 18
[LightGBM] [Debug] Tr