In [1]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
PATH_TO_DATA = ('.')
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id')

In [3]:
y = train_df['target']

In [4]:
sites = [x for x in train_df.columns if 'site' in x ]

In [5]:
train_df[sites] = train_df[sites].fillna(0)

In [6]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,0.0,,0.0,,0.0,,0.0,,...,,0.0,,0.0,,0.0,,0.0,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [7]:
train_df[sites] = train_df[sites].astype(str)

In [8]:
site1 = train_df['site1'].values.flatten()
site2 = train_df['site2'].values.flatten()
site3 = train_df['site3'].values.flatten()
site4 = train_df['site4'].values.flatten()
site5 = train_df['site5'].values.flatten()
site6 = train_df['site6'].values.flatten()
site7 = train_df['site7'].values.flatten()
site8 = train_df['site8'].values.flatten()
site9 = train_df['site9'].values.flatten()
site10 = train_df['site10'].values.flatten()

session_str = site1 + ' ' + site2 + ' ' + site3 + ' ' + site4 + ' ' + site5 + ' ' + site6 + ' ' + site7 + ' ' + site8 + ' ' + site9 + ' ' + site10 

In [9]:
session_str

array(['718 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0',
       '890 941.0 3847.0 941.0 942.0 3846.0 3847.0 3846.0 1516.0 1518.0',
       '14769 39.0 14768.0 14769.0 37.0 39.0 14768.0 14768.0 14768.0 14768.0',
       ..., '2661 15004.0 5562.0 5562.0 5562.0 0.0 0.0 0.0 0.0 0.0',
       '812 676.0 814.0 22.0 39.0 812.0 814.0 570.0 22.0 570.0',
       '34942 1429.0 34942.0 29.0 30.0 34942.0 5779.0 30.0 35.0 33.0'],
      dtype=object)

In [10]:
tf = TfidfVectorizer( ngram_range=(1, 3), max_features=100000)

In [11]:
matrix = tf.fit_transform(session_str)


In [12]:
site_feat = matrix

site_feat

<253561x100000 sparse matrix of type '<class 'numpy.float64'>'
	with 3636401 stored elements in Compressed Sparse Row format>

In [13]:
times = ['time%s' % i for i in range(1, 11)]
train_df[times]=train_df[times].apply(pd.to_datetime)
train_df['hour'] = train_df['time1'].apply(lambda x:x.hour)

In [14]:
train_df.head()
train_df['is_morning'] =  train_df['hour'].apply(lambda x: 1 if x in range(4,13) else 0)
train_df['is_day'] =  train_df['hour'].apply(lambda x: 1 if x in range(13,19) else 0)
train_df['is_evening'] =  train_df['hour'].apply(lambda x: 1 if x in range(19,25) else 0)
train_df['is_night'] =  train_df['hour'].apply(lambda x: 1 if x in range(0,4) else 0)


In [15]:
ddf = train_df.drop(times,axis=1)
ddf.head()


Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,target,hour,is_morning,is_day,is_evening,is_night
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,10,1,0,0,0
2,890,941.0,3847.0,941.0,942.0,3846.0,3847.0,3846.0,1516.0,1518.0,0,11,1,0,0,0
3,14769,39.0,14768.0,14769.0,37.0,39.0,14768.0,14768.0,14768.0,14768.0,0,16,0,1,0,0
4,782,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,0,10,1,0,0,0
5,22,177.0,175.0,178.0,177.0,178.0,175.0,177.0,177.0,178.0,0,10,1,0,0,0


In [16]:
scaler = StandardScaler()


In [17]:
time_feat = csr_matrix((scaler.fit_transform(ddf[['hour','is_morning','is_day','is_evening','is_night']])))

In [18]:
X = hstack([time_feat,site_feat])

In [19]:
scv = StratifiedKFold(n_splits=8, random_state=18, shuffle=True)

In [20]:
lr = LogisticRegression(random_state=18, n_jobs=-1)

In [22]:
cross_val_score(lr,X,y,cv=scv,scoring='roc_auc',n_jobs=-1)

array([0.95828707, 0.96931561, 0.96501352, 0.97344492, 0.9665951 ,
       0.96170331, 0.96464177, 0.97130822])