In [55]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
import scipy as sp

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [2]:
# Load the train and test session tables, using the session_id column as the index.
train_df = pd.read_csv('../train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../test_sessions.csv', index_col='session_id')

In [10]:
# Inspect column dtypes and non-null counts of the freshly loaded training table.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
site1     253561 non-null int64
time1     253561 non-null object
site2     250098 non-null float64
time2     250098 non-null object
site3     246919 non-null float64
time3     246919 non-null object
site4     244321 non-null float64
time4     244321 non-null object
site5     241829 non-null float64
time5     241829 non-null object
site6     239495 non-null float64
time6     239495 non-null object
site7     237297 non-null float64
time7     237297 non-null object
site8     235224 non-null float64
time8     235224 non-null object
site9     233084 non-null float64
time9     233084 non-null object
site10    231052 non-null float64
time10    231052 non-null object
target    253561 non-null int64
dtypes: float64(9), int64(2), object(10)
memory usage: 42.6+ MB


In [11]:
# Column names of the ten site slots and their ten matching timestamp slots.
sites = ['site{}'.format(i) for i in range(1, 11)]
times = ['time{}'.format(i) for i in range(1, 11)]

# Parse every timestamp column into datetimes, for both tables.
for _frame in (train_df, test_df):
    _frame[times] = _frame[times].apply(pd.to_datetime)

In [18]:
# Missing site slots are encoded as id 0, which lets the site columns be cast
# from float to integer.
for _frame in (train_df, test_df):
    _frame[sites] = _frame[sites].fillna(0).astype(int)

In [12]:
# Re-inspect dtypes after the timestamp parsing.
# NOTE(review): the saved output of this cell still lists the site columns as
# float64 with missing values; judging by the execution counts it was produced
# before the fillna/int cast cell was run. Re-run to refresh it.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
site1     253561 non-null int64
time1     253561 non-null datetime64[ns]
site2     250098 non-null float64
time2     250098 non-null datetime64[ns]
site3     246919 non-null float64
time3     246919 non-null datetime64[ns]
site4     244321 non-null float64
time4     244321 non-null datetime64[ns]
site5     241829 non-null float64
time5     241829 non-null datetime64[ns]
site6     239495 non-null float64
time6     239495 non-null datetime64[ns]
site7     237297 non-null float64
time7     237297 non-null datetime64[ns]
site8     235224 non-null float64
time8     235224 non-null datetime64[ns]
site9     233084 non-null float64
time9     233084 non-null datetime64[ns]
site10    231052 non-null float64
time10    231052 non-null datetime64[ns]
target    253561 non-null int64
dtypes: datetime64[ns](10), float64(9), int64(2)
memory usage: 42.6 MB


In [13]:
#train_df[sites] = train_df[sites].fillna(0).astype('int')
#test_df[sites] = test_df[sites].fillna(0).astype('int')

In [14]:
# Load the pickled site dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only use it on a
# site_dic.pkl that comes from a trusted source.
with open(r"../site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Lookup table indexed by the dictionary's values, with the dictionary's keys
# stored in the 'site' column.
sites_dict = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [19]:
# Set the labels aside and remember how many rows belong to the training set,
# so the stacked frame can be split back later.
y_train = train_df['target']
idx_split = len(train_df)

# Stack train (without the label) on top of test so that features are built
# identically for both.
df = pd.concat([train_df.drop('target', axis=1), test_df])
full_sites = df[sites]

In [20]:
# Flatten the sessions-by-site-slot table row by row into a single 1-D array
# of site ids (session 1's slots, then session 2's, and so on).
sites_flatten = full_sites.values.flatten()
sites_flatten

array([ 718,    0,    0, ..., 1098, 1098, 1098])

In [22]:
# Build a sessions-by-site-id CSR matrix directly from its three arrays:
#   data    - a 1 for every site slot,
#   indices - the site id in that slot (used as the column index),
#   indptr  - row boundaries, one row per session (every `session_len` slots).
# Column 0 is then dropped because id 0 is the placeholder for an empty slot.
n_visits = sites_flatten.shape[0]
session_len = full_sites.shape[1]
full_sites_sparse = csr_matrix(
    (np.ones(n_visits, dtype=int),
     sites_flatten,
     np.arange(0, n_visits + session_len, session_len))
)[:, 1:]

In [24]:
# Select the training set from the united dataframe (where we have the answers)
# Rows follow the concat order: the first idx_split sessions are the labelled
# training set, the remaining ones are the test set.
X_train = full_sites_sparse[:idx_split]
X_test = full_sites_sparse[idx_split:]

In [29]:
# Candidate TfidfVectorizer settings: every value is a LIST of options to try
# (a search grid), not a ready-to-use set of constructor arguments.
v_params = {'stop_words': [[' ', '  ', ','], None],
            'ngram_range': [(1, 6), (1, 4), (1, 5)],
            'max_features': [100000, 200000],
            'max_df': [0.5, 0.7, 0.9]}

# Start from a default vectorizer. The previous `TfidfVectorizer(*v_params)`
# unpacked the dict's KEYS as positional arguments (input='stop_words',
# encoding='ngram_range', ...), which misconfigures the vectorizer on old
# scikit-learn and raises TypeError on versions with keyword-only parameters.
# `**v_params` would be wrong as well, because the values are option lists.
v = TfidfVectorizer()

In [32]:
# Vectorizer configuration actually used for the session "documents":
# word n-grams of length 1 to 6, capped at 200000 features, ignoring terms
# that occur in more than half of the sessions.
v = TfidfVectorizer(
    stop_words=[' ', '  ', ','],
    ngram_range=(1, 6),
    max_features=200000,
    max_df=0.5,
)

In [27]:
# Render each session's row of site ids as one text "document" for the
# vectorizer. str() of the row yields the numpy array text, for example
# '[718   0   0   0   0   0   0   0   0   0]'.
# Changes from the previous version:
#   * "".join(str(row)) only re-joined the characters of the string, so it is dropped;
#   * the loop variable no longer reuses the name of the global `sites` list;
#   * only the first few documents are displayed instead of the whole list.
full_sites_str = [str(session_sites) for session_sites in df[sites].values]
full_sites_str[:5]

['[718   0   0   0   0   0   0   0   0   0]',
 '[ 890  941 3847  941  942 3846 3847 3846 1516 1518]',
 '[14769    39 14768 14769    37    39 14768 14768 14768 14768]',
 '[782 782 782 782 782 782 782 782 782 782]',
 '[ 22 177 175 178 177 178 175 177 177 178]',
 '[570  21 570  21  21   0   0   0   0   0]',
 '[  803    23  5956 17513    37    21   803 17514 17514 17514]',
 '[   22    21    29  5041 14422    23    21  5041 14421 14421]',
 '[668 940 942 941 941 942 940  23  21  22]',
 '[3700  229  570   21  229   21   21   21 2336 2044]',
 '[  229  1500    33  1500   391    35    29  2276 40305    23]',
 '[  37   39 3592  890   35   29   22   30   33 3592]',
 '[ 23 747  23 749  23  21 752  55  56  55]',
 '[ 262  322  625   52   37  152   21 2164 2164   30]',
 '[2883 2945  167  167  167   21    3  229   21   76]',
 '[28395   248   728  3364  3362  1434    30   102   727 28396]',
 '[ 63 167 167 359 167 167 167 360 363 362]',
 '[ 178  343  167  270  167 2726 2727 2725 2724 2732]',
 '[22736 227

In [33]:
# Fit the vectorizer on all session documents (train and test rows together)
# and transform them in one step.
tfidf = v.fit_transform(full_sites_str)

In [34]:
# Show the resulting matrix summary (shape, dtype, number of stored elements).
tfidf

<336358x200000 sparse matrix of type '<class 'numpy.float64'>'
	with 6263850 stored elements in Compressed Sparse Row format>

### Features

In [46]:
#df['YearMonth'] = df['time1'].apply(lambda x:x.strftime('%Y%m')).astype('int64')
# Calendar features derived from the timestamp of the first page of each
# session, computed with the vectorised .dt accessor.
df['Year'] = df['time1'].dt.year.astype('int64')
df['Month'] = df['time1'].dt.month.astype('int64')
df['YearDay'] = df['time1'].dt.dayofyear.astype('int64')
# Saturday (5) and Sunday (6) in the Monday=0 numbering.
df['is_weekend'] = (df['time1'].dt.dayofweek >= 5).astype('int64')
df['Hour'] = df['time1'].dt.hour.astype('int64')
# '%W' is the Monday-based week of the year (days before the first Monday are
# week 0); kept as strftime because it differs from the ISO week number.
df['Week'] = df['time1'].dt.strftime('%W').astype('int64')
# Same numbering as strftime('%w'): Sunday=0 ... Saturday=6.
df['Day_of_week'] = ((df['time1'].dt.dayofweek + 1) % 7).astype('int64')

In [52]:
# Time-of-day indicator flags (1/0) derived from the session start hour:
#   Morning 06-11, Day 12-17, Eve 18-21, Night 22-05.
df['Day'] = df.Hour.between(12, 17).astype('int64')
df['Eve'] = df.Hour.between(18, 21).astype('int64')
# Night wraps past midnight, so it needs two conditions. The previous
# `x in range(22, 6)` tested against an empty range and was always 0.
df['Night'] = ((df.Hour >= 22) | (df.Hour < 6)).astype('int64')
df['Morning'] = df.Hour.between(6, 11).astype('int64')

In [53]:
#df['is_summer'] = df['Month'].apply(lambda x: 1 if x in (6, 7, 8) else 0)

In [57]:
# One-hot encoder with default settings, used for the categorical date features.
enc = OneHotEncoder()

In [58]:
# One-hot encode each calendar feature separately. The single `enc` instance
# is refit on every call, so afterwards it only remembers the last column.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was
# removed in pandas 1.0; both return the same 2-D array.
y = enc.fit_transform(df[['Year']].values)
m = enc.fit_transform(df[['Month']].values)
hour = enc.fit_transform(df[['Hour']].values)
dayweek = enc.fit_transform(df[['Day_of_week']].values)
yday = enc.fit_transform(df[['YearDay']].values)

In [61]:
# Assemble the final feature matrix column-wise: TF-IDF of the site sequences,
# the one-hot calendar features, and the four time-of-day flags (each passed
# as a single-column 2-D array). The result is converted to CSR.
full_feat = sp.sparse.hstack([
    tfidf,
    dayweek,
    hour,
    m,
    y,
    yday,
    df[['Night']].values,
    df[['Day']].values,
    df[['Morning']].values,
    df[['Eve']].values,
]).tocsr()

In [62]:
# Split the stacked feature matrix back into its training rows (the first
# idx_split) and its test rows (everything after).
Xfull_train = full_feat[:idx_split]
Xfull_test = full_feat[idx_split:]