In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,UserID,CourseCategory,TimeSpentOnCourse,NumberOfVideosWatched,NumberOfQuizzesTaken,QuizScores,CompletionRate,DeviceType,CourseCompletion
0,5618,Health,29.979719,17,3,50.365656,20.860773,1,0
1,4326,Arts,27.80264,1,5,62.61597,65.632415,1,0
2,5849,Arts,86.820485,14,2,78.458962,63.812007,1,1
3,4992,Science,35.038427,17,10,59.198853,95.433162,0,1
4,3866,Programming,92.490647,16,0,98.428285,18.102478,0,0


In [3]:
df.columns

Index(['UserID', 'CourseCategory', 'TimeSpentOnCourse',
       'NumberOfVideosWatched', 'NumberOfQuizzesTaken', 'QuizScores',
       'CompletionRate', 'DeviceType', 'CourseCompletion'],
      dtype='object')

In [4]:
X = df.drop('CourseCompletion', axis=1)
y = df['CourseCompletion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [5]:
#1 глубокое дерево
categorical_features = ['CourseCategory']
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)
model = CatBoostClassifier(iterations=1, random_state=42, verbose=0, depth=16)
model.fit(train_pool)
y_pred = model.predict(test_pool)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9483333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      1091
           1       0.94      0.93      0.93       709

    accuracy                           0.95      1800
   macro avg       0.95      0.94      0.95      1800
weighted avg       0.95      0.95      0.95      1800

Confusion Matrix:
[[1051   40]
 [  53  656]]


In [6]:
#10 деревьев глубины 3
categorical_features = ['CourseCategory']
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)
model = CatBoostClassifier(iterations=10, random_state=42, verbose=0, depth=3)
model.fit(train_pool)
y_pred = model.predict(test_pool)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1091
           1       0.94      0.93      0.94       709

    accuracy                           0.95      1800
   macro avg       0.95      0.95      0.95      1800
weighted avg       0.95      0.95      0.95      1800

Confusion Matrix:
[[1050   41]
 [  49  660]]


In [7]:
#100 деревьев глубины 4
categorical_features = ['CourseCategory']
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)
model = CatBoostClassifier(iterations=100, random_state=42, verbose=0, depth=4)
model.fit(train_pool)
y_pred = model.predict(test_pool)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC: {roc_auc}")

Accuracy: 0.9622222222222222
ROC AUC: 0.9606874556410379


In [8]:
model.save_model("model.cbm")

In [9]:
#Проверим без ID. Вдруг человек с определенным ID - умный. И модель видя его и в теренировочных и в тестовых данных понимает
#Что он точно завершит курс. И наоборот в случае, если он не умный
df = df.drop(columns=['UserID'])

In [10]:
X = df.drop('CourseCompletion', axis=1)
y = df['CourseCompletion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [11]:
#30 деревьев глубины 3
categorical_features = ['CourseCategory']
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)
model = CatBoostClassifier(iterations=30, random_state=42, verbose=0, depth=3)
model.fit(train_pool)
y_pred = model.predict(test_pool)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9516666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1091
           1       0.96      0.92      0.94       709

    accuracy                           0.95      1800
   macro avg       0.95      0.95      0.95      1800
weighted avg       0.95      0.95      0.95      1800

Confusion Matrix:
[[1062   29]
 [  58  651]]


In [10]:
# Результат чуть хуже, но не сильно. Возможно это связано с информативностью столбика ID, а возможно с предположением описаным выше.
# Этот столбец мог помогть, так как через него данные из train частично просачиваются в test