In [9]:
import pandas as pd
import numpy as np
import scipy
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("impressions.сsv")
df['event_datetime_m'] = pd.to_datetime(df.event_datetime_m, yearfirst = True)
df['dayofweek'] = [d.isoweekday() for d in df.event_datetime_m]
df['hour'] = [d.hour for d in df.event_datetime_m]
df['month'] = [d.month for d in df.event_datetime_m]
df['day'] = [d.day for d in df.event_datetime_m]
df['daymonth'] = 31*df.month + df.day
df.head()

Unnamed: 0,rank,event_datetime_m,is_clicked,id_show,id_user,dayofweek,hour,month,day,daymonth
0,1,2017-01-12 12:30:00,0.0,5547,906335,4,12,1,12,43
1,3,2017-01-12 14:53:00,0.0,5547,5038628,4,14,1,12,43
2,4,2017-01-12 17:10:00,0.0,8148,5038628,4,17,1,12,43
3,2,2017-01-12 14:48:00,0.0,8131,5038628,4,14,1,12,43
4,1,2017-01-12 14:58:00,0.0,8367,5038628,4,14,1,12,43


In [3]:
df = df.drop(['event_datetime_m', 'month', 'day'], axis = 1)
train_df = df[df.daymonth < 117]
test_df = df[df.daymonth > 116]
y_train = train_df.is_clicked
X_train = train_df.drop(['is_clicked'], axis = 1)
y_valid = test_df.is_clicked
X_valid = test_df.drop(['is_clicked'], axis = 1)
X_train.head()

Unnamed: 0,rank,id_show,id_user,dayofweek,hour,daymonth
0,1,5547,906335,4,12,43
1,3,5547,5038628,4,14,43
2,4,8148,5038628,4,17,43
3,2,8131,5038628,4,14,43
4,1,8367,5038628,4,14,43


In [4]:
scaler = StandardScaler()
std_features = ['daymonth', 'rank']
train_std = scaler.fit_transform(X_train[std_features])
valid_std = scaler.transform(X_valid[std_features])
onehot = OneHotEncoder(handle_unknown = 'ignore')
onehot_features = ['dayofweek', 'hour', 'id_user', 'id_show']
train_onehot = onehot.fit_transform(X_train[onehot_features])
valid_onehot = onehot.transform(X_valid[onehot_features])

In [5]:
X_train_new = scipy.sparse.hstack([train_std, train_onehot])
X_valid_new = scipy.sparse.hstack([valid_std, valid_onehot])

In [7]:
skf = StratifiedKFold(n_splits = 5, random_state = 57, shuffle = False)
X_train_new.shape

(4242567, 478764)

In [11]:
%%time
lr = LogisticRegression(C = 0.01, random_state = 57)
lr.fit(X_train_new, y_train)
pred = lr.predict(X_valid_new)
print(log_loss(pred, y_valid))
print(accuracy_score(pred, y_valid))

6.22188365745


NameError: name 'accuraxy_score' is not defined

In [12]:
%%time
lr = LogisticRegression(C = 0.01, random_state = 57)
lr.fit(X_train_new, y_train)
pred = lr.predict(X_valid_new)
print(log_loss(pred, y_valid))
print(accuracy_score(pred, y_valid))

0.819862035285


In [13]:
tdf = pd.read_csv('test.csv', index_col="id")
tdf['event_datetime_m'] = pd.to_datetime(tdf.event_datetime_m, yearfirst = True)
tdf['dayofweek'] = [d.isoweekday() for d in tdf.event_datetime_m]
tdf['hour'] = [d.hour for d in tdf.event_datetime_m]
tdf['month'] = [d.month for d in tdf.event_datetime_m]
tdf['day'] = [d.day for d in tdf.event_datetime_m]
tdf['daymonth'] = 31*tdf.month + tdf.day
tdf.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,rank,event_datetime_m,id_show,id_user,dayofweek,hour,month,day,daymonth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2,2017-04-12 23:03:00,22286,6235895,3,23,4,12,136
1,4,2017-04-12 21:58:00,22286,6235895,3,21,4,12,136
2,4,2017-04-12 22:02:00,22286,6235895,3,22,4,12,136
3,2,2017-04-12 22:54:00,22286,6235895,3,22,4,12,136
4,3,2017-04-12 22:02:00,15994,6235895,3,22,4,12,136


In [20]:
y_train = df.is_clicked
X_train = df.drop(['is_clicked'], axis = 1)
X_test = tdf.drop(['event_datetime_m', 'month', 'day'], axis = 1)

In [21]:
train_std = scaler.fit_transform(X_train[std_features])
test_std = scaler.transform(X_test[std_features])
train_onehot = onehot.fit_transform(X_train[onehot_features])
test_onehot = onehot.transform(X_test[onehot_features])

In [22]:
X_train_new = scipy.sparse.hstack([train_std, train_onehot])
X_test_new = scipy.sparse.hstack([test_std, test_onehot])

In [36]:
%%time
lr.fit(X_train_new, y_train)
tdf['answer'] = lr.predict_proba(X_test_new)[:, 1]
print(np.mean(tdf.answer))
write_to_submission_file(tdf['answer'], 'dummy_prediction.csv')

0.14225612192
CPU times: user 49.8 s, sys: 3.54 s, total: 53.3 s
Wall time: 53.7 s


In [25]:
def write_to_submission_file(predicted_labels, out_file,
                             target='answer', index_label="id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(0, predicted_labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [31]:
pred.shape

(1150000, 2)

In [32]:
tdf.shape

(1150000, 9)

In [34]:
tdf['answer'] = pd.Series(pred[1])

In [37]:
tdf.head()

Unnamed: 0_level_0,rank,event_datetime_m,id_show,id_user,dayofweek,hour,month,day,daymonth,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,2017-04-12 23:03:00,22286,6235895,3,23,4,12,136,0.251964
1,4,2017-04-12 21:58:00,22286,6235895,3,21,4,12,136,0.121482
2,4,2017-04-12 22:02:00,22286,6235895,3,22,4,12,136,0.123076
3,2,2017-04-12 22:54:00,22286,6235895,3,22,4,12,136,0.246864
4,3,2017-04-12 22:02:00,15994,6235895,3,22,4,12,136,0.106725


In [39]:
tdf.answer.describe()

count    1.150000e+06
mean     1.422561e-01
std      1.059934e-01
min      3.382310e-04
25%      7.256584e-02
50%      1.170570e-01
75%      1.821577e-01
max      9.406108e-01
Name: answer, dtype: float64