In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score 
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('../../Desktop/ML/train_sessions.csv',
                      index_col='session_id')
test_df = pd.read_csv('../../Desktop/ML/test_sessions.csv',
                     index_col='session_id')

In [3]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [4]:
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

In [5]:
train_df = train_df.sort_values(by='time1')

In [6]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [7]:
# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

In [8]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [9]:
# загрузим словарик сайтов
with open(r"../../Desktop/ML/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

In [10]:
# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(list(site_dict.keys()), 
                          index=list(site_dict.values()), 
                          columns=['site'])
print(u'всего сайтов:', sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [11]:
# наша целевая переменная
y_train = train_df['target']

In [12]:
# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

In [13]:
# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [14]:
full_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,945,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,946,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,952,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22


In [18]:
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [16]:
from scipy.sparse import csr_matrix

In [17]:
csr_matrix?

In [19]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

In [20]:
# искомая матрица
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]

In [21]:
full_sites_sparse.shape

(336358, 48371)

In [22]:
idx_split

253561

In [23]:
x_train_sparse = full_sites_sparse[:idx_split]
x_test_sparse = full_sites_sparse[idx_split:]

In [24]:
x_train_sparse.shape, y_train.shape

((253561, 48371), (253561,))

In [25]:
x_test_sparse.shape

(82797, 48371)

In [51]:
def get_auc_lr_valid(X, y, C=1.0, ratio = 0.9, seed=17):
    '''
    X, y – выборка
    ratio – в каком отношении поделить выборку
    C, seed – коэф-т регуляризации и random_state 
              логистической регрессии
    '''
    train_len = int(ratio * X.shape[0])
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    y_valid = y[train_len:]

    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)

    logit.fit(X_train, y_train)

    valid_pred = logit.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, valid_pred)

In [52]:
%%time
get_auc_lr_valid(x_train_sparse, y_train)

Wall time: 3.42 s


0.9197949536813962

In [27]:
%%time
C = 1
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(x_train_sparse, y_train)

Wall time: 4.07 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [28]:
test_pred = logit.predict_proba(x_test_sparse)[:, 1]

In [29]:
test_pred[:10], test_df.index[:10]

(array([2.21954472e-03, 2.51892753e-09, 6.16011025e-09, 1.32267302e-08,
        2.72907698e-05, 1.51179371e-04, 4.42376221e-04, 1.01245884e-04,
        7.77291574e-04, 1.04991492e-01]),
 Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64', name='session_id'))

In [30]:
pd.Series(test_pred, index=range(1, test_pred.shape[0] + 1),
         name='target').to_csv('benchmark1.csv', header=True, index_label='session_id')

In [31]:
pd.Series?

In [30]:
pd.Series(test_pred, index=range(1, test_pred.shape[0] + 1),).head()

1    2.219545e-03
2    2.518928e-09
3    6.160110e-09
4    1.322673e-08
5    2.729077e-05
dtype: float64

In [31]:
head benchmark1.csv

SyntaxError: invalid syntax (<ipython-input-31-b89a8d01e629>, line 1)

In [32]:
time = ['time%d' % i for i in range(1,11)]
train_df[time].head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [33]:
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_test = pd.DataFrame(index=test_df.index)

In [34]:
new_feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
new_feat_test['year_month'] = test_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)

In [62]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,morning,start_hour_scaled,morning_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21669,201301,-1.744405,8,1,-1.357366,1.039061
54843,201301,-1.744405,8,1,-1.357366,1.039061
77292,201301,-1.744405,8,1,-1.357366,1.039061
114021,201301,-1.744405,8,1,-1.357366,1.039061
146670,201301,-1.744405,8,1,-1.357366,1.039061


In [36]:
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_train['year_month_scaled'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaled'] = scaler.transform(new_feat_test['year_month'].values.reshape(-1, 1))

In [37]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21669,201301,-1.744405
54843,201301,-1.744405
77292,201301,-1.744405
114021,201301,-1.744405
146670,201301,-1.744405


In [38]:
x_train_sparse_new = csr_matrix(hstack([x_train_sparse, new_feat_train['year_month_scaled'].values.reshape(-1, 1)]))

In [60]:
%%time
get_auc_lr_valid(x_train_sparse_new, y_train)

Wall time: 3.33 s


0.9198903563591923

In [40]:
x_train_sparse.shape, x_train_sparse_new.shape

((253561, 48371), (253561, 48372))

In [41]:
new_feat_train['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour)
new_feat_test['start_hour'] = test_df['time1'].apply(lambda ts: ts.hour)

In [42]:
new_feat_train['start_hour']

session_id
21669      8
54843      8
77292      8
114021     8
146670     8
          ..
12224     23
164438    23
12221     23
156968    23
204762    23
Name: start_hour, Length: 253561, dtype: int64

In [43]:
new_feat_train['morning'] = train_df['time1'].apply(lambda ts: 1 if ts.hour <= 11 else 0)
new_feat_test['morning'] = test_df['time1'].apply(lambda ts: 1 if  ts.hour <= 11 else 0 )

In [44]:
new_feat_train['morning']

session_id
21669     1
54843     1
77292     1
114021    1
146670    1
         ..
12224     0
164438    0
12221     0
156968    0
204762    0
Name: morning, Length: 253561, dtype: int64

In [45]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,morning
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21669,201301,-1.744405,8,1
54843,201301,-1.744405,8,1
77292,201301,-1.744405,8,1
114021,201301,-1.744405,8,1
146670,201301,-1.744405,8,1


In [46]:
scaler = StandardScaler()
scaler.fit(new_feat_train['start_hour'].values.reshape(-1, 1))
new_feat_train['start_hour_scaled'] = scaler.transform(new_feat_train['start_hour'].values.reshape(-1, 1))
new_feat_test['start_hour_scaled'] = scaler.transform(new_feat_test['start_hour'].values.reshape(-1, 1))

In [47]:
scaler = StandardScaler()
scaler.fit(new_feat_train['morning'].values.reshape(-1, 1))
new_feat_train['morning_scaled'] = scaler.transform(new_feat_train['morning'].values.reshape(-1, 1))
new_feat_test['morning_scaled'] = scaler.transform(new_feat_test['morning'].values.reshape(-1, 1))

In [48]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,morning,start_hour_scaled,morning_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21669,201301,-1.744405,8,1,-1.357366,1.039061
54843,201301,-1.744405,8,1,-1.357366,1.039061
77292,201301,-1.744405,8,1,-1.357366,1.039061
114021,201301,-1.744405,8,1,-1.357366,1.039061
146670,201301,-1.744405,8,1,-1.357366,1.039061


In [49]:
x_train_sparse_new_1 = csr_matrix(hstack([x_train_sparse,new_feat_train['year_month_scaled'].values.reshape(-1, 1), new_feat_train['start_hour_scaled'].values.reshape(-1, 1)]))

AttributeError: head not found

In [67]:
x_train_sparse_new_1.shape, x_train_sparse_new_2.shape

((253561, 48373), (253561, 48373))

In [53]:
%%time
get_auc_lr_valid(x_train_sparse_new_1, y_train)

Wall time: 3.48 s


0.9573322845076919

In [54]:
x_train_sparse_new_2 = csr_matrix(hstack([x_train_sparse,new_feat_train['year_month_scaled'].values.reshape(-1, 1), new_feat_train['morning_scaled'].values.reshape(-1, 1)]))

In [55]:
%%time
get_auc_lr_valid(x_train_sparse_new_2, y_train)

Wall time: 3.48 s


0.9483688104493708

In [56]:
x_train_sparse_new_3 = csr_matrix(hstack([x_train_sparse,new_feat_train['year_month_scaled'].values.reshape(-1, 1), new_feat_train['start_hour_scaled'].values.reshape(-1, 1), new_feat_train['morning_scaled'].values.reshape(-1, 1)]))

In [57]:
%%time
get_auc_lr_valid(x_train_sparse_new_3, y_train)

Wall time: 3.61 s


0.9586546380796769

In [87]:
%%time
C = 1
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(new_feat_train, y_train)



Wall time: 2.2 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [88]:
test_pred_feauters = logit.predict_proba(new_feat_test)[:, 1]

(253561, 48374)

In [86]:
test_pred_feauters = logit.predict_proba(x_test_sparse)[:, 1]

ValueError: X has 48371 features per sample; expecting 48374

In [79]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [89]:
write_to_submission_file(test_pred_feauters, 'benchmar4.csv')

In [73]:
test_pred_feauters[:10], test_df.index[:10]

(array([1.01365629e-06, 6.63229892e-08, 6.55423597e-07, 2.27006503e-06,
        3.21613519e-06, 2.47645297e-06, 1.90812205e-06, 4.01882479e-06,
        2.28940507e-06, 1.05834188e-08]),
 Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64', name='session_id'))

In [74]:
pd.Series(test_pred_feauters, index=range(1, test_pred.shape[0] + 1),
         name='target').to_csv('benchmark2.csv', header=True, index_label='session_id')

ValueError: Length of passed values is 253561, index implies 82797.

In [58]:
%%time
get_auc_lr_valid(x_train_sparse, y_train)

Wall time: 3.47 s


0.9197949536813962

In [59]:
%%time
C=1
get_auc_lr_valid(x_train_sparse, y_train)

Wall time: 3.47 s


0.9197949536813962

In [52]:
from sklearn.model_selection import GridSearchCV

In [96]:
c_values = np.logspace(-3, 1, 10)

logit_grid_search = GridSearchCV(estimator=logit, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=-1, verbose=1)
logit_grid_search.fit(new_feat_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.8s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=10.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=-1, penalty='l2',
                                          random_state=17, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-03, 2.78255940e-03, 7.74263683e-03, 2.15443469e-02,
       5.99484250e-02, 1.66810054e-01, 4.64158883e-01, 1.29154967e+00,
       3.59381366e+00, 1.00000000e+01])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=1)

In [97]:
logit_grid_search.best_score_, logit_grid_search.best_params_

(0.7930472775579379, {'C': 0.001})

In [56]:
final_model = logit_grid_search.best_estimator_

In [95]:
x_train_sparse

<253561x48371 sparse matrix of type '<class 'numpy.intc'>'
	with 1429676 stored elements in Compressed Sparse Row format>

In [78]:
GridSearchCV?

In [57]:
%%time
C=10.0
get_auc_lr_valid(x_train_sparse, y_train)

Wall time: 3.61 s


0.9197949536813962

In [64]:
%%time
C = 10.0
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(x_train_sparse, y_train)

Wall time: 3.41 s


LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [67]:
%%time
C = 10.0
logit = LogisticRegression(C=C, n_jobs=-1, random_state=17)
logit.fit(new_feat_train, y_train)

Wall time: 1.42 s


LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [69]:
second_pred = final_model.predict_proba(x_test_sparse)[:, 1]

In [71]:
second_pred[:10], test_df.index[:10]

(array([3.41908370e-04, 1.20613226e-17, 1.68497841e-18, 7.52152358e-16,
        1.16328759e-06, 1.12720097e-06, 1.73481308e-05, 7.85034264e-07,
        5.55319377e-05, 9.91827650e-02]),
 Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64', name='session_id'))

In [73]:
pd.Series(second_pred, index=range(1, second_pred.shape[0] + 1),
         name='target').to_csv('benchmark3.csv', header=True, index_label='session_id')

In [98]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,morning,start_hour_scaled,morning_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21669,201301,-1.744405,8,1,-1.357366,1.039061
54843,201301,-1.744405,8,1,-1.357366,1.039061
77292,201301,-1.744405,8,1,-1.357366,1.039061
114021,201301,-1.744405,8,1,-1.357366,1.039061
146670,201301,-1.744405,8,1,-1.357366,1.039061


In [None]:
# функция для записи прогнозов в файл
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


SyntaxError: invalid syntax (<ipython-input-92-981346b00703>, line 1)