In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
events_df_train = pd.read_csv('data/event_data_train.csv')

In [3]:
events_df_train['date'] = pd.to_datetime(events_df_train.timestamp, unit='s')
events_df_train['day'] = events_df_train.date.dt.date

In [4]:
events_df_train.head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15


In [5]:
users_data = events_df_train.pivot_table(index='user_id',
                                         columns='action',
                                         values='step_id', 
                                         aggfunc='count',
                                         fill_value=0).reset_index()

In [6]:
users_data.head()

action,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,10
2,3,91,87,30,192
3,5,11,11,4,12
4,7,1,1,0,1


In [7]:
is_gone_count = users_data.loc[users_data.passed >= 40].user_id.count()
all_users_count = users_data.user_id.count()
is_gone_count/all_users_count

0.24706249350109183

In [8]:
users_data['success'] = users_data.passed >= 40
users_data['success'] = users_data['success'].apply(int)

In [9]:
users_data.head()

action,user_id,discovered,passed,started_attempt,viewed,success
0,1,1,0,0,1,0
1,2,9,9,2,10,0
2,3,91,87,30,192,1
3,5,11,11,4,12,0
4,7,1,1,0,1,0


In [10]:
unique_days = events_df_train.groupby('user_id', as_index=False).agg({'day':'nunique'}).rename(columns={'day':'unique_days'})

In [11]:
users_data = users_data.merge(unique_days, on='user_id', how='outer')

In [12]:
users_data.head()

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,success,unique_days
0,1,1,0,0,1,0,1
1,2,9,9,2,10,0,2
2,3,91,87,30,192,1,7
3,5,11,11,4,12,0,2
4,7,1,1,0,1,0,1


In [13]:
users_data.isnull().sum()

user_id            0
discovered         0
passed             0
started_attempt    0
viewed             0
success            0
unique_days        0
dtype: int64

In [14]:
sub_data_train = pd.read_csv('data/submissions_data_train.csv')

In [15]:
sub_data_train.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853
3,31972,1478852164,correct,15853
4,31976,1434348123,wrong,15853


In [16]:
sub_data_train['date'] = pd.to_datetime(sub_data_train.timestamp, unit='s')
sub_data_train['day'] = sub_data_train.date.dt.date

In [17]:
sub_data_train.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id,date,day
0,31971,1434349275,correct,15853,2015-06-15 06:21:15,2015-06-15
1,31972,1434348300,correct,15853,2015-06-15 06:05:00,2015-06-15
2,31972,1478852149,wrong,15853,2016-11-11 08:15:49,2016-11-11
3,31972,1478852164,correct,15853,2016-11-11 08:16:04,2016-11-11
4,31976,1434348123,wrong,15853,2015-06-15 06:02:03,2015-06-15


In [18]:
users_sub_data = sub_data_train.pivot_table(index='user_id',
                                            columns='submission_status',
                                            values='step_id',
                                            aggfunc='count',
                                            fill_value=0).reset_index()

In [19]:
users_sub_data.head()

submission_status,user_id,correct,wrong
0,2,2,0
1,3,29,23
2,5,2,2
3,8,9,21
4,14,0,1


In [20]:
users_sub_data['accuracy'] = users_sub_data.correct / (users_sub_data.correct + users_sub_data.wrong)

In [21]:
users_sub_data.head()

submission_status,user_id,correct,wrong,accuracy
0,2,2,0,1.0
1,3,29,23,0.557692
2,5,2,2,0.5
3,8,9,21,0.3
4,14,0,1,0.0


In [22]:
users_data = users_data.merge(users_sub_data, on='user_id', how='outer').fillna(0)

In [23]:
users_data.head()

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,success,unique_days,correct,wrong,accuracy
0,1,1,0,0,1,0,1,0.0,0.0,0.0
1,2,9,9,2,10,0,2,2.0,0.0,1.0
2,3,91,87,30,192,1,7,29.0,23.0,0.557692
3,5,11,11,4,12,0,2,2.0,2.0,0.5
4,7,1,1,0,1,0,1,0.0,0.0,0.0


In [29]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [30]:
train_X = users_data.drop(columns=['success', 'user_id'])

In [31]:
train_X.head()

Unnamed: 0,discovered,passed,started_attempt,viewed,unique_days,correct,wrong,accuracy
0,1,0,0,1,1,0.0,0.0,0.0
1,9,9,2,10,2,2.0,0.0,1.0
2,91,87,30,192,7,29.0,23.0,0.557692
3,11,11,4,12,2,2.0,2.0,0.5
4,1,1,0,1,1,0.0,0.0,0.0


In [32]:
train_y = users_data.success

In [33]:
rf_clf = RandomForestClassifier(criterion='entropy')

In [34]:
params = {'n_estimators':range(30,100,10), 'max_depth':range(2,12,2), 'min_samples_split':range(2,30,5), 'min_samples_leaf':range(2,30,2)}

In [35]:
grid_scv = RandomizedSearchCV(rf_clf, params, cv=5)

In [36]:
grid_scv.fit(train_X, train_y)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(criterion='entropy'),
                   param_distributions={'max_depth': range(2, 12, 2),
                                        'min_samples_leaf': range(2, 30, 2),
                                        'min_samples_split': range(2, 30, 5),
                                        'n_estimators': range(30, 100, 10)})