В этом ноутбуке будет решена следующая задача: спрогнозировать вернется ли студент к ИИ-помошнику на основе имеющихся данных

Будет использован метод: "деревья решений"

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import graphviz 
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

df = pd.read_csv('../data/ai_assistant_usage_student_life.csv', parse_dates=True)
df['UsedAgain'] = df['UsedAgain'].astype(int)
df = df.drop(["SessionID", "SessionLengthMin", "AI_AssistanceLevel", "SessionDate"], axis=1)

X = df.drop("UsedAgain", axis=1)
y = df["UsedAgain"]

добавляем библиотеки и подгружаем данные, а также избавляемся от лишних переменных

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

num_features     = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

num_transformer = StandardScaler() 
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

dt_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", DecisionTreeClassifier(random_state=42, class_weight='balanced'))
])

dt_pipeline.fit(X_train, y_train)

y_pred = dt_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_proba = dt_pipeline.predict_proba(X_test)[:,1]
print("ROC AUC:", roc_auc_score(y_test, y_proba))

param_grid = {
    "clf__max_depth": [3, 5, 7, 9, None],
    "clf__min_samples_leaf": [1, 5, 10, 20],
    "clf__max_features": [None, "sqrt", "log2"]
}
grid = GridSearchCV(dt_pipeline, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Best CV AUC:", grid.best_score_)

y_pred_best = grid.predict(X_test)
print("Test accuracy (tuned):", accuracy_score(y_test, y_pred_best))
y_proba_best = grid.predict_proba(X_test)[:,1]
print("Test ROC AUC (tuned):", roc_auc_score(y_test, y_proba_best))

best_dt = grid.best_estimator_.named_steps["clf"]
feature_names = num_features + list(grid.best_estimator_.named_steps["prep"]
                                    .named_transformers_["cat"].get_feature_names_out(cat_features))

rules_text = export_text(best_dt, feature_names=feature_names)
print(rules_text)

# Экспорт в graphviz (PDF/PNG)
dot_data = export_graphviz(
    best_dt,
    out_file=None,
    feature_names=feature_names,
    class_names=["No","Yes"],
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)
graph.render("used_again_tree")  # сохранит used_again_tree.pdf в рабочей папке

Accuracy: 0.6316
              precision    recall  f1-score   support

           0       0.38      0.40      0.39       734
           1       0.74      0.73      0.74      1766

    accuracy                           0.63      2500
   macro avg       0.56      0.56      0.56      2500
weighted avg       0.64      0.63      0.63      2500

ROC AUC: 0.5650186230370208
Best params: {'clf__max_depth': 5, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 10}
Best CV AUC: 0.6858458475843815
Test accuracy (tuned): 0.7432
Test ROC AUC (tuned): 0.6665913207698552
|--- FinalOutcome_Gave Up <= 0.50
|   |--- FinalOutcome_Confused <= 0.50
|   |   |--- TaskType_Coding <= 0.50
|   |   |   |--- StudentLevel_Undergraduate <= 0.50
|   |   |   |   |--- SatisfactionRating <= 0.98
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- SatisfactionRating >  0.98
|   |   |   |   |   |--- class: 1
|   |   |   |--- StudentLevel_Undergraduate >  0.50
|   |   |   |   |--- FinalOutcome_Assignment Completed

'used_again_tree.pdf'

как можно увидеть, после улучшения обучения модель выдает 0,743 точности, а также Test ROC AUC = 0.667. если сравнить с моделью логистической регрессии, то результат практически один и тот же и самые важные признаки в дереве решений оказываются примерно теми же что и в логистической регрессии, это результат - "сдался" и "скомфужен".

In [6]:
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

num_transformer = StandardScaler() 
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

rf_pipeline = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])

rf_pipeline.fit(X_train, y_train)

y_pred = rf_pipeline.predict(X_test)
print("Base Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

y_proba = rf_pipeline.predict_proba(X_test)[:,1]
print("Base Model ROC AUC:", roc_auc_score(y_test, y_proba))

param_grid = {
    "clf__n_estimators": [50, 100, 200],
    "clf__max_depth": [3, 5, 7, 9, None],
    "clf__min_samples_leaf": [1, 5, 10, 20],
    "clf__max_features": ["sqrt", "log2", None],
    "clf__bootstrap": [True, False]
}

grid = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring="roc_auc", n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("\nBest params:", grid.best_params_)
print("Best CV AUC:", grid.best_score_)

y_pred_best = grid.predict(X_test)
print("\nTest accuracy (tuned):", accuracy_score(y_test, y_pred_best))
y_proba_best = grid.predict_proba(X_test)[:,1]
print("Test ROC AUC (tuned):", roc_auc_score(y_test, y_proba_best))

best_rf = grid.best_estimator_.named_steps["clf"]
feature_names = num_features + list(grid.best_estimator_.named_steps["prep"]
                                .named_transformers_["cat"].get_feature_names_out(cat_features))

importances = best_rf.feature_importances_
feature_importance = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values("Importance", ascending=False)

print("\nTop 10 Features by Importance:")
print(feature_importance.head(10))

sample_tree = best_rf.estimators_[0]

dot_data = tree.export_graphviz(
    sample_tree,
    out_file=None,
    feature_names=feature_names,
    class_names=["No","Yes"],
    filled=True,
    rounded=True,
    special_characters=True,
    max_depth=3  # Ограничиваем глубину для читаемости
)

graph = graphviz.Source(dot_data)
graph.render("random_forest_sample_tree")  # Сохраняем в PDF

Base Model Accuracy: 0.6984
              precision    recall  f1-score   support

           0       0.48      0.37      0.42       734
           1       0.76      0.84      0.80      1766

    accuracy                           0.70      2500
   macro avg       0.62      0.60      0.61      2500
weighted avg       0.68      0.70      0.69      2500

Base Model ROC AUC: 0.6405753854984092
Fitting 5 folds for each of 360 candidates, totalling 1800 fits

Best params: {'clf__bootstrap': False, 'clf__max_depth': 5, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 10, 'clf__n_estimators': 200}
Best CV AUC: 0.6914073561989798

Test accuracy (tuned): 0.7432
Test ROC AUC (tuned): 0.6676567066077066

Top 10 Features by Importance:
                              Feature  Importance
19              FinalOutcome_Confused    0.335929
18  FinalOutcome_Assignment Completed    0.242629
20               FinalOutcome_Gave Up    0.169406
21          FinalOutcome_Idea Drafted    0.165650
0          

'random_forest_sample_tree.pdf'

теперь проверим модель RandomForest, она получила практически туже точность и ROC AUC, что и модель одного дерева, и важные параметры получились примерно теже, в топ 3 видим все теже результат: "сдался" и "скомфужен".

можно сказать что модели дерева решений и RandomForest  подтверждают модель логистической регрессии.