In [1]:
import pickle
import pandas as pd

from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three input tables in one pass: the training set, the public
# test split, and the private test split (kept in that read order).
train, test_public, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "task1_allData.csv",
        "task1_public_test_data_.csv",
        "task1_private_test_data.csv",
    )
)

In [3]:
# Combine the private and public test splits into a single evaluation frame.
# No `on=` is given, so the outer join keys on every column the two frames share.
# NOTE(review): if the goal is simply to stack the two files, pd.concat would
# be cheaper -- confirm nothing downstream relies on the join's row order or
# on its handling of duplicate rows before switching.
test = test.merge(right=test_public, how="outer")

In [4]:
# Columns that must not be fed to the model: the label itself and the
# per-row answer identifier.
non_feature_columns = ["IsCorrect", "AnswerId"]

# Feature matrices: everything except the label and the identifier.
x_train = train.drop(columns=non_feature_columns)
x_test = test.drop(columns=non_feature_columns)

# Targets: the "IsCorrect" column of each frame.
y_train = train["IsCorrect"]
y_test = test["IsCorrect"]

In [5]:
# Columns to be stored with pandas' "category" dtype.
categorical_features = [
    "UserId",
    "QuestionId",
    "QuizId",
    "GroupId",
    "SchemeOfWorkId",
    "Gender",
    "PremiumPupil",
    "Confidence",
    "MathType",
    "SchoolShift",
]

# Cast each listed column in place, train frame first and then test frame.
# NOTE(review): the two frames are cast independently, so their category sets
# may differ; scikit-learn trees presumably read the underlying values as
# numbers rather than using the categorical dtype -- confirm whether an
# explicit encoding step was intended.
for feature in categorical_features:
    for frame in (x_train, x_test):
        frame[feature] = frame[feature].astype("category")

In [6]:
# Hyperparameters for the decision tree, gathered in one place so they are
# easy to inspect; random_state is fixed so repeated fits are reproducible.
tree_params = {
    "max_depth": 12,
    "criterion": "entropy",
    "min_samples_split": 6,
    "min_samples_leaf": 4,
    "random_state": 0,
}
model = tree.DecisionTreeClassifier(**tree_params)

In [7]:
# Train the tree on the training features/labels; the result of fit() is
# bound back to `model`, which also keeps the cell from echoing a repr.
model = model.fit(X=x_train, y=y_train)

In [8]:
# Predicted label for every row of the combined test features.
y_predict = model.predict(X=x_test)

In [9]:
# Overall accuracy on the test set; left as the cell's last expression so the
# notebook displays the value.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.7282281180842877

In [10]:
# Per-class precision/recall/F1 with human-readable class names.
# NOTE(review): the names are attached in sorted-label order, so this assumes
# the smaller label value means "wrong" and the larger "correct" -- confirm.
class_names = ["Wrong Answer", "Correct Answer"]
report = classification_report(y_test, y_predict, target_names=class_names)
print(report)

                precision    recall  f1-score   support

  Wrong Answer       0.64      0.55      0.59   1415961
Correct Answer       0.77      0.83      0.80   2551002

      accuracy                           0.73   3966963
     macro avg       0.70      0.69      0.69   3966963
  weighted avg       0.72      0.73      0.72   3966963



In [11]:
# Persist the predictions as a single-column CSV (header row, no index).
# NOTE(review): rows carry no identifier, so consumers must rely on the row
# order of `test` to line predictions up -- confirm that is intended.
df = pd.DataFrame({"DecisionTree": y_predict})
df.to_csv("DecisionTree1.csv", index=False, header=True)