In [1]:
!pip3 install -q numpy==1.22.4
!pip3 install -q pandas==1.5.3
!pip3 install -q scikit-learn
!pip3 install joblib

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dython 0.7.4 requires numpy>=1.23.0, but you have numpy 1.22.4 which is incompatible.




In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import joblib 

import time
import os

---

Чтение данных

In [3]:
path1 = 'datasets/tree/train_tree.csv'

def read_file(path):
    df = pd.DataFrame()
    if os.path.exists(path):
        df = pd.read_csv(path, sep=',')
    elif os.path.exists(path[1:]):
        df = pd.read_csv(path[1:], sep=',')
    else:
        print('No such file or directory') 
        raise FileNotFoundError('No such file or directory')
    return df

df_train = read_file(path1)

---

## Разбиение тренировочной выборки на тренировочную и владиационную

In [4]:
X_tree = df_train.drop(columns=['Transported'], axis=1)
y_tree = df_train.Transported

print(y_tree.shape[0])

8673


In [5]:
X_train_tree, X_val_tree, y_train_tree, y_val_tree = \
        train_test_split(X_tree, y_tree, test_size = 0.3, shuffle=True, random_state = 42)

print(y_train_tree.shape, y_val_tree.shape)

(6071,) (2602,)


In [6]:
new_directory = 'datasets/for_comparison'

os.makedirs(new_directory, exist_ok=True)

In [7]:
X_val_tree.to_csv('datasets/for_comparison/X_val_tree.csv', index=False)
y_val_tree.to_csv('datasets/for_comparison/y_val_tree.csv', index=False)

---

# Вспомогательные функции

In [8]:
def grid_search_cv(model, cv, param_grid, X_train_set, y_train_set, scores, refit=True):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scores, cv=cv, refit=refit)

    start_time = time.time()

    grid_search.fit(X_train_set, y_train_set)

    total_time = time.time() - start_time
    print(f"Total time spent on grid search (wall-clock): {total_time:.2f} seconds or \
            {(total_time // 60):.0f} minutes {(total_time % 60):.2f} seconds")
    
    return grid_search

In [9]:
def custom_refit(cv_results):
    sorted_indices = np.lexsort((-cv_results["mean_test_roc_auc"], -cv_results["mean_test_f1"]))
    return sorted_indices[0]

In [10]:
def print_top_low(grid_search, show_columns, sort_by = ['rank_test_score', 'index'], n_rows = 10):
    df_results = pd.DataFrame(grid_search.cv_results_)
    df_results.index.name = 'index'
    df_results = df_results.sort_values(by=sort_by).reset_index(drop=True)

    print('Top ' + str(n_rows))
    display(df_results[:n_rows][show_columns])

    print('Low ' + str(n_rows))
    display(df_results[-n_rows:][show_columns])
    
    return df_results

In [11]:
def custom_threshold_scorer(estimator, X, y, threshold=0.5, scorer=f1_score):
    probabilities = estimator.predict_proba(X)

    predictions = (probabilities[:, 1] >= threshold).astype(int)
    score = scorer(y, predictions)

    return score

In [12]:
def findBestThreshold(model, features, target):
    best_result = {'f1': 0, 'roc_auc': 0}
    best_threshold = 0

    for threshold in np.arange(0, 1.05, 0.05):
        print(f'threshold: {threshold:.2f}')
        
        f1      = custom_threshold_scorer(model, features, target, threshold, f1_score)
        roc_auc = roc_auc_score(target, model.predict_proba(features)[:, 1])
        print(f"f1 score: {f1:.6f}")

        if f1 > best_result['f1']:
            best_result['f1'] = f1
            best_result['roc_auc'] = roc_auc
            best_threshold = threshold

    return best_threshold, best_result

In [13]:
def findBestEstimators(model, features, target, threshold=0.5):
    best_result = {'f1': 0, 'roc_auc': 0}
    best_params = model.get_params()
    cur_params  = best_params.copy()
    result = {}

    for estimators in range(10, 151, 10):
        print('estimators:', estimators)
        cur_params['n_estimators'] = estimators 

        new_model = model.__class__(**cur_params)
        new_model.fit(features, target)
        f1      = custom_threshold_scorer(new_model, features, target, threshold, f1_score)
        roc_auc = roc_auc_score(target, new_model.predict_proba(features)[:, 1])

        print(f"f1 score: {f1:.6f}")

        if f1 > best_result['f1']:
            best_result['f1'] = f1
            best_result['roc_auc'] = roc_auc
            best_params['n_estimators'] = estimators

    return best_params, best_result

In [14]:
from sklearn.exceptions import NotFittedError

def is_fitted(estimator, X):
    try:
        estimator.predict(X)
        return True
    except NotFittedError as e:
        print(repr(e))
        return False

# Обучение модели

In [15]:
param_grid = [
    {
        "max_depth": np.arange(3, 11, 1),
        "criterion": ['gini', 'entropy', 'log_loss'],
        "max_features": [None, 'sqrt', 'log2'],
    }
]

rfc = RandomForestClassifier(n_estimators=30, n_jobs = 4, random_state=42)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

scores = ['f1', 'roc_auc']
scoring = {'f1': 'f1',
           'roc_auc': 'roc_auc'
          }

columns = ['param_criterion', 'param_max_depth', \
                         'mean_test_f1', 'mean_test_roc_auc', 'rank_test_f1', 'rank_test_roc_auc']

In [16]:
grid_search = grid_search_cv(rfc, sss, param_grid, X_train_tree, y_train_tree, scores, custom_refit)

Total time spent on grid search (wall-clock): 30.83 seconds or             0 minutes 30.83 seconds


In [17]:
df_results = print_top_low(grid_search, columns, ['rank_test_f1', 'index'])

Top 10


Unnamed: 0,param_criterion,param_max_depth,mean_test_f1,mean_test_roc_auc,rank_test_f1,rank_test_roc_auc
0,entropy,8,0.799423,0.874558,1,17
1,entropy,8,0.799423,0.874558,1,17
2,log_loss,8,0.799423,0.874558,1,17
3,log_loss,8,0.799423,0.874558,1,17
4,gini,9,0.798878,0.874216,5,22
5,gini,9,0.798878,0.874216,5,22
6,gini,7,0.798504,0.874362,7,21
7,entropy,8,0.798475,0.876802,8,1
8,log_loss,8,0.798475,0.876802,8,1
9,entropy,6,0.798237,0.875638,10,7


Low 10


Unnamed: 0,param_criterion,param_max_depth,mean_test_f1,mean_test_roc_auc,rank_test_f1,rank_test_roc_auc
62,log_loss,4,0.73818,0.854074,61,60
63,log_loss,4,0.73818,0.854074,61,60
64,gini,4,0.737909,0.854649,65,58
65,gini,4,0.737909,0.854649,65,58
66,gini,3,0.713241,0.846419,67,70
67,gini,3,0.713241,0.846419,67,70
68,entropy,3,0.713094,0.848146,69,64
69,entropy,3,0.713094,0.848146,69,64
70,log_loss,3,0.713094,0.848146,69,64
71,log_loss,3,0.713094,0.848146,69,64


In [18]:
is_fitted(grid_search.best_estimator_, X_train_tree)

True

In [19]:
best_estimator = findBestEstimators(grid_search.best_estimator_, X_train_tree, y_train_tree)

estimators: 10
f1 score: 0.834450
estimators: 20


f1 score: 0.835305
estimators: 30


f1 score: 0.837418
estimators: 40


f1 score: 0.836352
estimators: 50


f1 score: 0.837410
estimators: 60


f1 score: 0.838555
estimators: 70


f1 score: 0.840626
estimators: 80


f1 score: 0.839337
estimators: 90


f1 score: 0.840681
estimators: 100


f1 score: 0.841687
estimators: 110


f1 score: 0.839388
estimators: 120


f1 score: 0.838699
estimators: 130


f1 score: 0.839573
estimators: 140


f1 score: 0.840006
estimators: 150


f1 score: 0.839522


In [20]:
# %%pipeline
best_rfc_threshold = findBestThreshold(grid_search.best_estimator_, X_train_tree, y_train_tree)

threshold: 0.00
f1 score: 0.675466
threshold: 0.05
f1 score: 0.680290
threshold: 0.10
f1 score: 0.701564
threshold: 0.15
f1 score: 0.750488
threshold: 0.20
f1 score: 0.793542
threshold: 0.25


f1 score: 0.821637
threshold: 0.30
f1 score: 0.826953
threshold: 0.35
f1 score: 0.833921
threshold: 0.40
f1 score: 0.838041
threshold: 0.45
f1 score: 0.841552
threshold: 0.50
f1 score: 0.837418
threshold: 0.55
f1 score: 0.827020
threshold: 0.60


f1 score: 0.795834
threshold: 0.65
f1 score: 0.759675
threshold: 0.70
f1 score: 0.724104
threshold: 0.75
f1 score: 0.651738
threshold: 0.80
f1 score: 0.615520
threshold: 0.85
f1 score: 0.581432
threshold: 0.90
f1 score: 0.517463
threshold: 0.95


f1 score: 0.378859
threshold: 1.00
f1 score: 0.000000


In [21]:
new_directory = 'models/'

os.makedirs(new_directory, exist_ok=True)

In [22]:
rfc_model = RandomForestClassifier(**best_estimator[0]).fit(X_train_tree, y_train_tree)

joblib_logr = 'models/random_forest.pkl'
joblib.dump(rfc_model, joblib_logr)

['models/random_forest.pkl']