In [62]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook


%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
train_df = pd.read_csv('train_sessions.csv')
test_df = pd.read_csv('test_sessions.csv')

In [4]:
train_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,...,,,,,,,,,,0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [5]:
test_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
0,1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,...,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
1,2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,...,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53
2,3,55,2014-12-05 15:55:12,55.0,2014-12-05 15:55:13,55.0,2014-12-05 15:55:14,55.0,2014-12-05 15:56:15,55.0,...,55.0,2014-12-05 15:56:17,55.0,2014-12-05 15:56:18,55.0,2014-12-05 15:56:19,1445.0,2014-12-05 15:56:33,1445.0,2014-12-05 15:56:36
3,4,1023,2014-11-04 10:03:19,1022.0,2014-11-04 10:03:19,50.0,2014-11-04 10:03:20,222.0,2014-11-04 10:03:21,202.0,...,3374.0,2014-11-04 10:03:22,50.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:23,3374.0,2014-11-04 10:03:23
4,5,301,2014-05-16 15:05:31,301.0,2014-05-16 15:05:32,301.0,2014-05-16 15:05:33,66.0,2014-05-16 15:05:39,67.0,...,69.0,2014-05-16 15:05:40,70.0,2014-05-16 15:05:40,68.0,2014-05-16 15:05:40,71.0,2014-05-16 15:05:40,167.0,2014-05-16 15:05:44


In [6]:
# times = ['time%s' % i for i in range(1,11)]
# train_df[times] = train_df[times].apply(pd.to_datetime)
# test_df[times] = test_df[times].apply(pd.to_datetime)
# код чтобы привпести колонки ко времени, тут не нужен

In [7]:
train_df = train_df.sort_values('time1')
train_df

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,...,,,,,,,,,,0
54842,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,,,,,,,,,,0
77291,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12223,12224,50,2014-04-30 23:33:48,50.0,2014-04-30 23:33:49,48.0,2014-04-30 23:33:52,49.0,2014-04-30 23:33:52,48.0,...,2014-04-30 23:33:53,52.0,2014-04-30 23:33:54,49.0,2014-04-30 23:33:54,303.0,2014-04-30 23:33:57,304.0,2014-04-30 23:34:00,0
164437,164438,4207,2014-04-30 23:34:15,753.0,2014-04-30 23:34:16,753.0,2014-04-30 23:34:17,52.0,2014-04-30 23:34:18,50.0,...,2014-04-30 23:35:16,3346.0,2014-04-30 23:35:29,3359.0,2014-04-30 23:36:12,3346.0,2014-04-30 23:36:42,38.0,2014-04-30 23:37:13,0
12220,12221,52,2014-04-30 23:38:08,3346.0,2014-04-30 23:38:10,784.0,2014-04-30 23:38:13,784.0,2014-04-30 23:38:18,3346.0,...,2014-04-30 23:38:24,3324.0,2014-04-30 23:38:35,7330.0,2014-04-30 23:38:35,3594.0,2014-04-30 23:38:35,3329.0,2014-04-30 23:38:36,0
156967,156968,3328,2014-04-30 23:38:36,3324.0,2014-04-30 23:38:36,3599.0,2014-04-30 23:38:38,3413.0,2014-04-30 23:38:38,753.0,...,2014-04-30 23:38:40,3599.0,2014-04-30 23:38:40,3359.0,2014-04-30 23:39:07,3359.0,2014-04-30 23:39:08,3346.0,2014-04-30 23:39:53,0


In [8]:
sites = ['site%d'% i for i in range(1,11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

with open ('site_dic.pkl','rb') as input_file :
    site_dict = pickle.load(input_file)
    
sites_dict_df = pd.DataFrame(site_dict.keys() , index = list(site_dict.values()) , columns = ['site'])
print('Всего сайтов :',sites_dict_df.shape[0])
sites_dict_df.head()

Всего сайтов : 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [9]:
y_train = train_df['target']

full_df = pd.concat([train_df.drop('target', axis =1), test_df])
idx_split = train_df.shape[0]
idx_split_test = test_df.shape[0]

In [10]:
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
21668,56,55,0,0,0,0,0,0,0,0
54842,56,55,56,55,0,0,0,0,0,0
77291,946,946,951,946,946,945,948,784,949,946
114020,945,948,949,948,945,946,947,945,946,946
146669,947,950,948,947,950,952,946,951,946,947


In [11]:
from scipy.sparse import csr_matrix

In [12]:
sites_flatten = full_sites.values.flatten()

full_sites_sparse = csr_matrix(([1]*sites_flatten.shape[0],
                               sites_flatten,
                               range(0,sites_flatten.shape[0]+10,10)))[:,1:]

full_sites_sparse.shape

(336358, 48371)

In [13]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]
X_train_sparse.shape , y_train.shape
X_train_sparse.shape, X_test_sparse.shape

((253561, 48371), (82797, 48371))

In [14]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    arr = X.shape[0]
    train_len = int(ratio*arr)
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    
    logit = LogisticRegression(n_jobs=-1,random_state=seed)
    logit.fit(X_train,y_train)
    valid_pred = logit.predict_proba(X_valid)[:,1]
    return roc_auc_score(y_valid,valid_pred) 
                   

In [15]:
get_auc_lr_valid(X_train_sparse,y_train)


0.919794802727792

# Предсказание тестового дата фрейма


In [16]:
new_ratio = test_df.shape[0]/train_df.shape[0]

In [17]:
def predict_test(X, y, C=1.0, ratio=new_ratio ,  seed=17):
    arr = X.shape[0]
    train_len = int(ratio*arr)
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    
    logit = LogisticRegression(n_jobs=-1,random_state=seed)
    logit.fit(X_train,y_train)
    return logit.predict_proba(X_valid)[:,1]

In [18]:
predictions = predict_test(X_train_sparse,y_train)

In [19]:
predictions

array([9.63498663e-05, 2.88110277e-03, 5.26580623e-05, ...,
       2.41899301e-05, 7.37977122e-05, 2.50612613e-05])

In [20]:
pd.Series(predictions, index=range(0,predictions.shape[0]), name ='target').head()

0    0.000096
1    0.002881
2    0.000053
3    0.000086
4    0.000379
Name: target, dtype: float64

In [21]:
pd.Series(predictions, index=range(1,predictions.shape[0]+1), name = 'target').to_csv('sample_submission.csv',
                                                                                    header=True , index_label='session_id')

In [22]:
!sample_submission.csv

# Дата фрейм с учетом времени

In [42]:
times = ['time%d'% i for i in range(1,11)]
date_train_df = train_df[times]
date_test_df = test_df[times]
# date_train_df = date_train_df.fillna(0)
# date_test_df = date_test_df.fillna(0)


In [43]:
date_test_df

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
0,2014-10-04 11:19:53,2014-10-04 11:19:53,2014-10-04 11:19:54,2014-10-04 11:19:54,2014-10-04 11:19:54,2014-10-04 11:19:54,2014-10-04 11:19:54,2014-10-04 11:19:54,2014-10-04 11:20:00,2014-10-04 11:20:00
1,2014-07-03 11:00:28,2014-07-03 11:00:53,2014-07-03 11:00:58,2014-07-03 11:01:06,2014-07-03 11:01:09,2014-07-03 11:01:10,2014-07-03 11:01:23,2014-07-03 11:01:29,2014-07-03 11:01:30,2014-07-03 11:01:53
2,2014-12-05 15:55:12,2014-12-05 15:55:13,2014-12-05 15:55:14,2014-12-05 15:56:15,2014-12-05 15:56:16,2014-12-05 15:56:17,2014-12-05 15:56:18,2014-12-05 15:56:19,2014-12-05 15:56:33,2014-12-05 15:56:36
3,2014-11-04 10:03:19,2014-11-04 10:03:19,2014-11-04 10:03:20,2014-11-04 10:03:21,2014-11-04 10:03:21,2014-11-04 10:03:22,2014-11-04 10:03:22,2014-11-04 10:03:22,2014-11-04 10:03:23,2014-11-04 10:03:23
4,2014-05-16 15:05:31,2014-05-16 15:05:32,2014-05-16 15:05:33,2014-05-16 15:05:39,2014-05-16 15:05:40,2014-05-16 15:05:40,2014-05-16 15:05:40,2014-05-16 15:05:40,2014-05-16 15:05:40,2014-05-16 15:05:44
...,...,...,...,...,...,...,...,...,...,...
82792,2014-10-02 18:20:09,2014-10-02 18:20:09,2014-10-02 18:20:09,,,,,,,
82793,2014-05-26 14:16:40,2014-05-26 14:16:41,2014-05-26 14:16:44,2014-05-26 14:16:44,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19
82794,2014-05-02 11:21:56,2014-05-02 11:21:56,2014-05-02 11:21:56,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:04,2014-05-02 11:22:04,2014-05-02 11:22:04
82795,2014-05-03 10:05:25,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:36,2014-05-03 10:05:37,2014-05-03 10:05:37,2014-05-03 10:05:38,2014-05-03 10:05:38,2014-05-03 10:05:38


In [44]:
date_train_df

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
21668,2013-01-12 08:05:57,2013-01-12 08:05:57,,,,,,,,
54842,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,,,,,,
77291,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114020,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146669,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22
...,...,...,...,...,...,...,...,...,...,...
12223,2014-04-30 23:33:48,2014-04-30 23:33:49,2014-04-30 23:33:52,2014-04-30 23:33:52,2014-04-30 23:33:53,2014-04-30 23:33:53,2014-04-30 23:33:54,2014-04-30 23:33:54,2014-04-30 23:33:57,2014-04-30 23:34:00
164437,2014-04-30 23:34:15,2014-04-30 23:34:16,2014-04-30 23:34:17,2014-04-30 23:34:18,2014-04-30 23:34:18,2014-04-30 23:35:16,2014-04-30 23:35:29,2014-04-30 23:36:12,2014-04-30 23:36:42,2014-04-30 23:37:13
12220,2014-04-30 23:38:08,2014-04-30 23:38:10,2014-04-30 23:38:13,2014-04-30 23:38:18,2014-04-30 23:38:22,2014-04-30 23:38:24,2014-04-30 23:38:35,2014-04-30 23:38:35,2014-04-30 23:38:35,2014-04-30 23:38:36
156967,2014-04-30 23:38:36,2014-04-30 23:38:36,2014-04-30 23:38:38,2014-04-30 23:38:38,2014-04-30 23:38:39,2014-04-30 23:38:40,2014-04-30 23:38:40,2014-04-30 23:39:07,2014-04-30 23:39:08,2014-04-30 23:39:53


In [45]:
date_df = pd.concat([date_train_df,date_test_df])

In [46]:
date_df

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
21668,2013-01-12 08:05:57,2013-01-12 08:05:57,,,,,,,,
54842,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,,,,,,
77291,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114020,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146669,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22
...,...,...,...,...,...,...,...,...,...,...
82792,2014-10-02 18:20:09,2014-10-02 18:20:09,2014-10-02 18:20:09,,,,,,,
82793,2014-05-26 14:16:40,2014-05-26 14:16:41,2014-05-26 14:16:44,2014-05-26 14:16:44,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19,2014-05-26 14:17:19
82794,2014-05-02 11:21:56,2014-05-02 11:21:56,2014-05-02 11:21:56,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:03,2014-05-02 11:22:04,2014-05-02 11:22:04,2014-05-02 11:22:04
82795,2014-05-03 10:05:25,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:27,2014-05-03 10:05:36,2014-05-03 10:05:37,2014-05-03 10:05:37,2014-05-03 10:05:38,2014-05-03 10:05:38,2014-05-03 10:05:38


In [47]:
date_df = date_df.sort_values('time1')
date_df

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
21668,2013-01-12 08:05:57,2013-01-12 08:05:57,,,,,,,,
54842,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,,,,,,
77291,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114020,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146669,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22
...,...,...,...,...,...,...,...,...,...,...
73505,2014-12-05 20:07:54,2014-12-05 20:07:54,,,,,,,,
59131,2014-12-05 20:55:17,2014-12-05 20:55:17,2014-12-05 21:23:38,2014-12-05 21:23:38,,,,,,
78587,2014-12-05 21:54:46,2014-12-05 21:54:46,,,,,,,,
26998,2014-12-05 22:26:40,2014-12-05 22:26:40,2014-12-05 22:55:10,2014-12-05 22:55:10,,,,,,


In [59]:
date_df[times].astype('datetime64')

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
21668,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54842,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77291,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114020,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146669,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22
...,...,...,...,...,...,...,...,...,...,...
73505,2014-12-05 20:07:54,2014-12-05 20:07:54,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
59131,2014-12-05 20:55:17,2014-12-05 20:55:17,2014-12-05 21:23:38,2014-12-05 21:23:38,NaT,NaT,NaT,NaT,NaT,NaT
78587,2014-12-05 21:54:46,2014-12-05 21:54:46,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
26998,2014-12-05 22:26:40,2014-12-05 22:26:40,2014-12-05 22:55:10,2014-12-05 22:55:10,NaT,NaT,NaT,NaT,NaT,NaT


AttributeError: 'Series' object has no attribute 'pd'