In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
# Shared 5-fold splitter (shuffled, fixed seed) used by the cross-validation cell.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

import datetime
import os
import pickle
import time

In [30]:
# Load the headerless, semicolon-separated dataset.
df = pd.read_csv(
    'data/data_bag.csv',
    header=None,
    sep=';',
    encoding='utf-8',
)

In [31]:
# Column 0 is taken as the target; every remaining column is a feature.
table = df.to_numpy()
X, y = table[:, 1:], table[:, 0]

In [32]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [33]:
# Train the baseline gradient-boosting classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Per-class probabilities for the held-out rows; one column per class.
y_pred = model.predict_proba(X_test)

In [34]:
# Hold-out ROC-AUC, scored on the second column of the predict_proba output
# (presumably the positive class - confirm against model.classes_).
test_auc = roc_auc_score(y_test, y_pred[:, 1])
print(test_auc)

0.8219038999520094


In [35]:
# Persist the classifier to model/GBCls with pickle.
# The output directory is created first: on a fresh checkout (or after
# Restart & Run All in a clean workspace) `model/` may not exist, and
# open(..., 'wb') would then fail with FileNotFoundError.
os.makedirs('model', exist_ok=True)
with open('model/GBCls', mode='wb') as f:
    pickle.dump(model, f)

## Тесты: подбор n_estimators и learning_rate по кросс-валидации

In [None]:
# Grid over the number of trees and the learning rate, scored by 5-fold
# cross-validated ROC-AUC on the training split.
nums = (50, 250, 500)
learns = (0.01, 0.05, 0.1)

# scores[i][j] is the mean CV ROC-AUC for nums[i] trees and learns[j] rate.
# One row per n_estimators value lets `plt.plot(nums, scores)` draw one line
# per learning rate. Previously this list was never filled, so the plot had
# nothing to draw.
scores = []
for n in nums:
    row = []
    for lr in learns:
        print(f"# {n}, {lr}", end='\t')
        # Use a separate name so the fitted `model` from the training cell is
        # not replaced by an unfitted estimator.
        cv_model = GradientBoostingClassifier(n_estimators=n, learning_rate=lr, random_state=42)
        start_time = datetime.datetime.now()
        cvl = cross_val_score(cv_model, X_train, y_train, scoring='roc_auc', cv=kf)
        mean_auc = cvl.mean()
        row.append(mean_auc)
        print(f"Res={mean_auc}, Time:{datetime.datetime.now() - start_time}")
    scores.append(row)

# NOTE(review): the recorded output shows Res=1.0 for every configuration,
# while the hold-out ROC-AUC printed earlier is about 0.82. Confirm the output
# is not stale or produced from different data before trusting these numbers.

# 50, 0.01	Res=1.0, Time:0:00:33.477444
# 50, 0.05	Res=1.0, Time:0:00:33.525316
# 50, 0.1	Res=1.0, Time:0:00:33.353812
# 250, 0.01	

In [None]:
import matplotlib.pyplot as plt

# Plot the collected scores against the n_estimators grid.
# NOTE(review): matplotlib requires `scores` to have the same first dimension
# as `nums`; confirm the cell that fills `scores` has been run.
fig, ax = plt.subplots()
ax.plot(nums, scores)
ax.set_xlabel('n_estimators')
ax.set_ylabel('score')
plt.show()