<a href="https://colab.research.google.com/github/Jiablero/notebooks/blob/master/stepik_contest_season3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import numpy as np

In [0]:
submissions = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')
events = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')

In [0]:
def prepare_stepik_data(submissions, events, period = 0):
  if period > 0:
    events_period = events.groupby('user_id').agg({'timestamp': 'min'}).rename({'timestamp':'start_time'}, axis=1).reset_index()
    events_period = events_period.merge(events, left_on='user_id', right_on='user_id', how='outer')
    events = events_period[events_period.timestamp <= events_period.start_time + period].drop('start_time', axis = 1)
    submissions_period = submissions.groupby('user_id').agg({'timestamp': 'min'}).rename({'timestamp':'start_time'}, axis=1).reset_index()
    submissions_period = submissions_period.merge(submissions, left_on='user_id', right_on='user_id', how='outer')
    submissions = submissions_period[submissions_period.timestamp <= submissions_period.start_time + period].drop('start_time', axis = 1)

  userdata = pd.DataFrame()
  events = pd.get_dummies(events)
  userdata = events.groupby('user_id').agg({'action_passed': 'sum', 'action_viewed': 'sum', 'action_started_attempt': 'sum'}).reset_index()
  
  submissions = pd.get_dummies(submissions)
  userdata = userdata.merge(submissions.groupby('user_id').agg({'submission_status_correct': 'sum', 'submission_status_wrong': 'sum'}).reset_index(), how='outer')
  userdata = userdata.fillna(0)
  userdata['passed'] = userdata.action_passed.map(lambda x: 1 if x >= 40 else 0)
  userdata = userdata.rename({'action_started_attempt': 'tried', 'action_viewed': 'viewed', 'submission_status_correct': 'correct',
                 'submission_status_wrong': 'wrong'}, axis = 1).drop('action_passed', axis = 1)

  X = userdata.drop('passed', axis = 1)
  y = userdata.passed

  return X, y

In [0]:
X, y = prepare_stepik_data(submissions, events)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [0]:
def grid_search(clf, params, X, y):
  gs = GridSearchCV(clf, params, n_jobs=-1)
  gs.fit(X, y)
  # best_params_ для отладки
  return {'best_params_': gs.best_params_, 'best_estimator_': gs.best_estimator_}

In [0]:
# Random Forest
rf_clf = RandomForestClassifier()
rf_parameters = {
    'n_estimators': range(10, 30, 5), 
    'max_depth': range(3, 10, 1), 
    'min_samples_leaf': range(1, 7), 
    'min_samples_split': range(4, 12, 2)}

In [0]:
clf_final_rf = grid_search(rf_clf, rf_parameters, X_train, y_train)

In [64]:
clf_final_rf.get('best_estimator_').score(X_test, y_test)

0.9935327405012127

In [0]:
pred_proba = clf_final_rf.get('best_estimator_').predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])

In [71]:
roc_score

0.9993409680207432

In [0]:
clf_cv = RandomForestClassifier(**clf_final_rf.get('best_estimator_').get_params())
cv_scores = cross_val_score(clf_cv, X, y, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)

In [84]:
mean_cv_scores

0.9978011793069745

In [0]:
# проверочные результаты из задания
submissions_full = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')
events_full = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')

In [0]:
X_full, y_full = prepare_stepik_data(submissions, events, period = 60 * 60 * 34 * 2)

In [106]:
clf = RandomForestClassifier(**clf_final_rf.get('best_estimator_').get_params())
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
proba_full = clf.predict_proba(X_full)
roc_full = roc_auc_score(y_full, proba_full[:, 1])

In [108]:
roc_full

0.999378740715228

In [0]:
result = pd.DataFrame({'user_id': X.user_id, 'is_gone': proba_full[:, 1]})

In [0]:
result.to_csv('result.csv', index = False)

In [0]:
# Your ROC score is 0.7701523928607497