In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from scipy.sparse import hstack, vstack
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

Import the train/test data and create list of column names we need to process

In [2]:
# load train & test data
train_data = pd.read_csv('../mlcourse.ai/data/train_sessions.csv')
test_data = pd.read_csv('../mlcourse.ai/data/test_sessions.csv')

# setting website and time column names - maximum 10 sessions
site_cols = ['site{}'.format(i) for i in range(1,11)]
time_cols = ['time{}'.format(i) for i in range(1,11)]

Function to pre-process train/test dataframes

In [3]:
def convert_df(df):
    df[site_cols] = df[site_cols].fillna(0).astype('int')
    df[time_cols] = df[time_cols].apply(pd.to_datetime)
    return df

In [4]:
train_data = convert_df(train_data)
test_data = convert_df(test_data)

In [5]:
print('train_data shape: ', train_data.shape)
print('test_data shape: ', test_data.shape)

train_data shape:  (253561, 22)
test_data shape:  (82797, 21)


Let's see what the dataframe looks like so far

In [6]:
train_data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,0,NaT,0,NaT,0,NaT,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
1,2,890,2014-02-22 11:19:50,941,2014-02-22 11:19:50,3847,2014-02-22 11:19:51,941,2014-02-22 11:19:51,942,...,2014-02-22 11:19:51,3847,2014-02-22 11:19:52,3846,2014-02-22 11:19:52,1516,2014-02-22 11:20:15,1518,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39,2013-12-16 16:40:18,14768,2013-12-16 16:40:19,14769,2013-12-16 16:40:19,37,...,2013-12-16 16:40:19,14768,2013-12-16 16:40:20,14768,2013-12-16 16:40:21,14768,2013-12-16 16:40:22,14768,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782,2014-03-28 10:52:42,782,2014-03-28 10:53:12,782,2014-03-28 10:53:42,782,...,2014-03-28 10:54:42,782,2014-03-28 10:55:12,782,2014-03-28 10:55:42,782,2014-03-28 10:56:12,782,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177,2014-02-28 10:55:22,175,2014-02-28 10:55:22,178,2014-02-28 10:55:23,177,...,2014-02-28 10:55:59,175,2014-02-28 10:55:59,177,2014-02-28 10:55:59,177,2014-02-28 10:57:06,178,2014-02-28 10:57:11,0


Sort train_data by `time0` column to allow time-based cross-validation  

In [7]:
train_data = train_data.sort_values(by=time_cols[0])
train_data.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54842,54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77291,77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


Separating the `target` column from the train_data as `y_train`

In [8]:
y_train = train_data.pop('target')
print('y_train shape: ', y_train.shape)

y_train shape:  (253561,)


In [22]:
print('New train_data shape: ', train_data.shape)

New train_data shape:  (253561, 21)


We have a pickle file that holds website information. Let's extract it and see what the dictionary holds

In [16]:
with open("../mlcourse.ai/data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Let's see the first 5 keys and values in the dictionary
print(list(site_dict.keys())[:5])
print(list(site_dict.values())[:5])

['www.abmecatronique.com', 'groups.live.com', 'majeureliguefootball.wordpress.com', 'cdt46.media.tourinsoft.eu', 'www.hdwallpapers.eu']
[25075, 13997, 42436, 30911, 8104]


To make a Dataframe with this information, we should set the dictionary keys as the dataframe values and vice versa.

In [21]:
sites_by_id = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
sites_by_id.head()

Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


Let's create a new dataframe to investigate potential correlations and trends in the time features

Converting website data into Tf-Idf Vectorizer. This creates a sparse matrix where the output is weighed proportionally with the frequency of occurance of a text feature and inversely with the frequency of that feature over the corpus. Therefore, features that are too frequent and potentially unusable for distinguishing an example's outcome will be weighed appropriately. 

In [23]:
# Invert the site dictionary to be used when mapping the train/test site values
inverse_site_dict = {v: k for k,v in site_dict.items()}

In [27]:
train_sites = train_data[site_cols].apply(lambda col: col.map(inverse_site_dict))
test_sites = test_data[site_cols].apply(lambda col: col.map(inverse_site_dict))
train_sites.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
21668,safebrowsing.clients.google.com,safebrowsing-cache.google.com,,,,,,,,
54842,safebrowsing.clients.google.com,safebrowsing-cache.google.com,safebrowsing.clients.google.com,safebrowsing-cache.google.com,,,,,,
77291,www.apache.org,www.apache.org,download.eclipse.org,www.apache.org,www.apache.org,www.webtide.com,download.oracle.com,javadl-esd-secure.oracle.com,www.caucho.com,www.apache.org
114020,www.webtide.com,download.oracle.com,www.caucho.com,download.oracle.com,www.webtide.com,www.apache.org,public.dhe.ibm.com,www.webtide.com,www.apache.org,www.apache.org
146669,public.dhe.ibm.com,jope.ow2.org,download.oracle.com,public.dhe.ibm.com,jope.ow2.org,master.dl.sourceforge.net,www.apache.org,download.eclipse.org,www.apache.org,public.dhe.ibm.com


In [71]:
tv = TfidfVectorizer(ngram_range=(1,4), analyzer='word', max_features=10000)

In [72]:
train_tv = tv.fit(train_sites['site1'])
tv.get_feature_names()

['01',
 '01 10005',
 '01 10013',
 '01 10036',
 '01 10077',
 '01 10169',
 '01 11074',
 '01 11544',
 '01 11641',
 '01 12160',
 '01 12434',
 '01 ibm',
 '01 ibm com',
 '01net',
 '01net com',
 '02',
 '02 vty',
 '02 vty dailymotion',
 '02 vty dailymotion com',
 '0pb',
 '0pb org',
 '10',
 '10 109',
 '10 109 web1',
 '10 109 web1 im',
 '10005',
 '10013',
 '10036',
 '10077',
 '10169',
 '102',
 '102 40',
 '109',
 '109 web1',
 '109 web1 im',
 '109 web1 im weibo',
 '10fastfingers',
 '10fastfingers com',
 '11',
 '11 rutube',
 '11 rutube ru',
 '11 sfr',
 '11 sfr fr',
 '11074',
 '11544',
 '11641',
 '118',
 '12',
 '12 94',
 '12 94 web1',
 '12 94 web1 im',
 '12 rutube',
 '12 rutube ru',
 '12 sfr',
 '12 sfr fr',
 '12160',
 '123',
 '123 193',
 '123rf',
 '123rf com',
 '12434',
 '126',
 '126 com',
 '126 net',
 '127',
 '127 102',
 '127 102 40',
 '127 net',
 '12mlbe',
 '12mlbe com',
 '13',
 '13 94',
 '13 94 web1',
 '13 94 web1 im',
 '13 sfr',
 '13 sfr fr',
 '14',
 '14 01',
 '14 01 10005',
 '14 01 10013',
 '14

In [73]:
tv.stop_words_

{'www bulletins electroniques com',
 'groupe fnac',
 'reponseatout',
 'www activolcans',
 'catalogue schneider electric',
 'www seleccao fr',
 'video sekindo',
 'p6 sinaimg cn',
 '324402711 init cedexis radar',
 'franki geotechnics be',
 '518443417',
 'v10i zr4td 1322',
 'dista unibo it',
 'apps direction',
 'chu rouen fr',
 'catholique sedevacantiste com',
 'arielec',
 'www ic',
 'd3brdqgnb3d1f5 cloudfront',
 'www ts3 serveur',
 '12160 259554343',
 'cid 31b42a2ec131b78a',
 'sedevacantiste com',
 'hero corp',
 'wt',
 'arxiv web',
 'cache adfeedstrk com',
 'cid fddcddb369a85702',
 'athle com',
 'cloud moniut',
 'fedora fr',
 'monde fr',
 '11074 281316314',
 'os blog',
 'abstrait concret',
 'boutique formation',
 'www gotransit com',
 'business gov au',
 'oncfs',
 'moteur occasion com',
 'www galaxie',
 'premierconfigure',
 'w400125',
 'vicu',
 'ruhighlanders',
 'www man linux magique',
 'www cegep ste',
 '11074 208369510',
 'a66 w23 58dd0e8a',
 'peignier',
 'www codeincodeblock',
 'stic