# **CATBOOST**

In [None]:
#installing requirments
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

# KAGGLE DATA DOWNLOAD

In [None]:
! pip install -q kaggle

In [None]:
from google.colab import files

# Upload kaggle.json (the next cell copies it into ~/.kaggle).
# The call's return value is bound to a name instead of being left as the cell's
# last expression: a bare call would echo whatever upload() returns into the saved
# notebook output (presumably the uploaded file contents, i.e. the API token -- confirm).
uploaded = files.upload()
print('Uploaded files:', list(uploaded))

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c riiid-test-answer-prediction
! unzip train.csv.zip -d train

# DATA PREPROCESSING

In [None]:
import pandas as pd
import numpy as np
# Explicit, compact dtypes for the columns of train.csv listed here; any column
# not listed is left to pandas' type inference.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
}

# Only the first 10**6 rows are read.
trainData = pd.read_csv(
    '/content/train/train.csv',
    low_memory=False,
    nrows=10**6,
    dtype=train_dtypes,
)

In [None]:
# Load the question and lecture lookup tables (joined onto the interactions in the next cell).
question, lecture = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/questions.csv', '/content/lectures.csv')
)

In [None]:
# Attach the "part" metadata of questions and lectures to every interaction.
# The resulting column layout is unchanged: question_id, part_x, lecture_id, part_y.
train1 = pd.merge(trainData, question[['question_id', 'part']], left_on='content_id', right_on='question_id', how='left')
train1 = pd.merge(train1, lecture[['lecture_id', 'part']], left_on='content_id', right_on='lecture_id', how='left')

# content_id is the join key for BOTH lookup tables, so the plain left merges above
# also pair a row with the table of the other content type whenever the numeric ids
# happen to coincide. content_type_id tells the two kinds apart
# (assumed: 0 = question, 1 = lecture -- confirm against the dataset description),
# so matches of the wrong type are blanked out here. The NaNs introduced are filled
# by the later fillna step like any other missing value.
not_question = train1['content_type_id'] != 0
not_lecture = train1['content_type_id'] != 1
for column in ('question_id', 'part_x'):
    train1[column] = train1[column].mask(not_question)
for column in ('lecture_id', 'part_y'):
    train1[column] = train1[column].mask(not_lecture)

In [None]:
# Remove the content_type_id column, then display the shape of the merged frame.
# Note: train1 is rebound here, so re-running this cell without re-running the
# merge cell raises a KeyError (the column is already gone).
train1 = train1.drop(columns=['content_type_id'])
train1.shape

(1000000, 13)

In [None]:
# Fill every missing value with False, then cast all columns to integer
# (so filled cells become 0 and float columns are truncated to whole numbers).
train1 = train1.fillna(value=False).astype(int)

In [None]:
# Split into train and validation: hold out the 5 latest interactions of every user.
# sort_values returns a NEW frame (it does not sort in place), so it is chained
# directly into the groupby; a stable sort keeps the file order for equal timestamps.
valid_split = (
    train1
    .sort_values('timestamp', kind='mergesort')
    .groupby('user_id')
    .tail(5)
)
# Everything that was not held out becomes training data (matched by row_id).
train_data = train1[~train1.row_id.isin(valid_split.row_id)]
print(valid_split.shape, train_data.shape)

(19104, 13) (980896, 13)


In [None]:
# Full feature matrix / target over all rows (used by the cross-validation cell).
y = train1['answered_correctly']
X = train1.drop(columns=['answered_correctly'])

In [None]:
# Separate features and target for the training and validation partitions.
# NOTE(review): every remaining column (including row_id and user_answer) is kept
# as a feature -- confirm that this is intended.
target_col = 'answered_correctly'

X_train, y_train = train_data.drop(columns=[target_col]), train_data[target_col]
X_validation, y_validation = valid_split.drop(columns=[target_col]), valid_split[target_col]

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

**MODEL TRAINING**

In [None]:
# Configure the baseline classifier; it is trained in the next cell.
# logging_level='Info' makes training print per-split diagnostics, and
# task_type='GPU' requires a GPU runtime.
baseline_params = {
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
    'logging_level': 'Info',
    'task_type': 'GPU',
}
model = CatBoostClassifier(**baseline_params)

In [None]:
#training the simple model(you should use GPU for trainning this part)
%%time
model.fit(
    X_train, y_train,
    eval_set=(X_validation, y_validation),
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Best split for depth 4: 1 / 97 (>1.09884e+10) with score -9.420274734
Best split for depth 5: 4 / 37 (>147.5) with score -12.08313751
286:	learn: 0.5776206	test: 0.5961106	best: 0.5960917 (285)	total: 5.44s	remaining: 13.5s
Best split for depth 0: 3 / 0 (>52.5) with score -2.38082695
Best split for depth 1: 1 / 17 (>1.00331e+08) with score -3.190367937
Best split for depth 2: 5 / 2 (>1.5) with score -4.476416111
Best split for depth 3: 8 / 1 (>14.5) with score -7.084567547
Best split for depth 4: 7 / 0 (>0.5) with score -10.39369583
Best split for depth 5: 4 / 6 (>10.5) with score -15.07657146
287:	learn: 0.5775364	test: 0.5960373	best: 0.5960373 (287)	total: 5.46s	remaining: 13.5s
Best split for depth 0: 3 / 2 (>173.5) with score -2.25502038
Best split for depth 1: 5 / 3 (>2.5) with score -3.576220512
Best split for depth 2: 8 / 4 (>184.5) with score -6.510389328
Best split for depth 3: 5 / 2 (>1.5) with score -8.1647720

<catboost.core.CatBoostClassifier at 0x7fcef49a3f28>

In [None]:
#cross validation
%%time
cv_params = model.get_params()
cv_params.update({
    'loss_function': 'MultiClass',
    "verbose": True,
    "depth": 3,
})
cv_data = cv(
    Pool(X, y),
    cv_params,
    plot='True'
)

In [None]:
# Report the best mean cross-validation accuracy, its standard deviation and the step it occurred on.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
best_accuracy = np.max(accuracy_mean)
best_std = cv_data['test-Accuracy-std'][best_step]

summary = 'Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    best_accuracy, best_std, best_step
)
print(summary)
# Same score again, without rounding.
print('Precise validation accuracy score: {}'.format(best_accuracy))

Best validation accuracy score: 0.68±0.00 on step 995


In [None]:
# Predicted classes and class probabilities for the validation set;
# only the first ten entries of each are printed.
predictions = model.predict(X_validation)
predictions_probs = model.predict_proba(X_validation)
for preview in (predictions, predictions_probs):
    print(preview[:10])

**TRAINING MODEL USING HYPERPARAMETERS**

In [None]:
# Hyperparameters for the tuned runs below; use_best_model is switched on
# separately for the second model trained in the next cell.
params = dict(
    iterations=10000,
    learning_rate=0.5,
    random_seed=200,
    logging_level='Info',
    use_best_model=False,
    task_type="GPU",
    eval_metric='Accuracy',
    loss_function='MultiClass',
)

# Wrap the train / validation partitions in CatBoost pools so they can be reused across fits.
train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_validation, y_validation)

In [None]:
#trainning the model using hyper parameters
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);
#f1_score(X_validation, y_pred)
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Best split for depth 0: 2 / 12 (>2.1224e+06) with score -1.826204181
Best split for depth 1: 1 / 11 (>9.81064e+06) with score -2.469763041
Best split for depth 2: 8 / 76 (>5899.5) with score -3.096323013
Best split for depth 3: 3 / 75 (>5975.5) with score -3.966955185
Best split for depth 4: 6 / 23 (>13875) with score -5.000854015
Best split for depth 5: 1 / 93 (>9.18481e+09) with score -6.170230389
9287:	learn: 0.7605842	test: 0.6858773	best: 0.6892274 (6918)	total: 2m 52s	remaining: 13.2s
Best split for depth 0: 2 / 93 (>1.53832e+07) with score -1.570928574
Best split for depth 1: 4 / 125 (>3877.5) with score -2.209856987
Best split for depth 2: 3 / 7 (>417.5) with score -2.734553337
Best split for depth 3: 6 / 74 (>32900) with score -3.309187651
Best split for depth 4: 7 / 0 (>0.5) with score -4.008243084
Best split for depth 5: 6 / 80 (>35100) with score -5.055259705
9288:	learn: 0.7606036	test: 0.6856679	best: 0.6892

In [None]:
#best result with timing
%%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 11min 17s, sys: 13min 57s, total: 25min 14s
Wall time: 14min 23s


**SVM MODEL TRAINING**

In [None]:
# Prepare the data for the SVM from the raw trainData frame.
# NOTE: this cell rebinds X, X_train and y_train, which the CatBoost cells above
# also use -- re-run the CatBoost preprocessing cells before going back to them.
from sklearn.model_selection import train_test_split

# dropna returns a new frame (it does not modify trainData), so its result must be kept.
# Only the columns the SVM actually uses are checked for missing values.
svm_columns = ['timestamp', 'content_id', 'user_answer', 'answered_correctly']
svm_data = trainData.dropna(subset=svm_columns)

X = svm_data[['timestamp', 'content_id', 'user_answer']]
# 1-D target (a Series rather than a one-column DataFrame), the shape scikit-learn expects for y.
Y = svm_data['answered_correctly']
# Fixed random_state so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Train a linear-kernel SVM and evaluate it on the held-out split (this part runs on the CPU).
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

svcclass = SVC(kernel='linear')
# np.ravel flattens the target to 1-D whether it is a Series or a one-column DataFrame.
svcclass.fit(X_train, np.ravel(y_train))
y_predict = svcclass.predict(X_test)

# Both metrics take (y_true, y_pred) in that order; swapping them exchanges
# precision with recall and transposes the confusion matrix.
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))