In [4]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import pickle
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [5]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
# sklearn.cross_validation and sklearn.grid_search were deprecated in
# scikit-learn 0.18 and removed in 0.20; the same names are provided by
# sklearn.model_selection. The old modules are kept only as a fallback for
# very old installations.
try:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc,recall_score,precision_score
from xgboost import XGBClassifier
import scipy as sp
from sklearn.metrics import r2_score

In [6]:
# Load the raw session tables; the session_id column becomes the index of each frame.
train_df = pd.read_csv('../Alice/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../Alice/test_sessions.csv', index_col='session_id')

In [7]:
# Names of the ten timestamp columns: time1 .. time10.
times = ['time%s' % i for i in range(1, 11)]

# Convert the timestamp columns to datetime in both frames, column by column.
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

In [8]:
# Order the training sessions by the timestamp of their first visit (time1).
train_df = train_df.sort_values(by='time1')

In [9]:
# Preview the first rows of the time-sorted training frame.
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [10]:
# Names of the ten site-id columns: site1 .. site10.
sites = ['site%s' % i for i in range(1, 11)]

# Missing visits become id 0, and the ids are stored as integers in both frames.
for sessions in (train_df, test_df):
    sessions[sites] = sessions[sites].fillna(0).astype('int')

In [11]:
# Load the pickled site dictionary (its keys are used as site names and its
# values as the matching ids when the lookup frame is built from it).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(r"../Alice/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)


In [12]:
# Lookup frame: indexed by the dictionary values, one 'site' column with the keys.
sites_dict = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)
print(u'Websites total:', len(sites_dict))
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [13]:
# Sanity check: (rows, columns) of the test frame, then of the train frame.
print (test_df.shape, train_df.shape)

(82797, 20) (253561, 21)


In [14]:
# Number of training rows: rows [0, idx_split) of the stacked frame are train,
# the rest are test.
idx_split = len(train_df)

# Labels are kept aside before the label column is dropped.
y_train = train_df['target']

# Stack train (without its label column) on top of test so that features are
# built for both parts in one pass.
df = pd.concat([train_df.drop('target', axis='columns'), test_df])

In [15]:
# Keep only the site-id columns of the stacked train+test frame.
full_sites = df[sites]
print (full_sites.head())

            site1  site2  site3  site4  site5  site6  site7  site8  site9  \
session_id                                                                  
21669          56     55      0      0      0      0      0      0      0   
54843          56     55     56     55      0      0      0      0      0   
77292         946    946    951    946    946    945    948    784    949   
114021        945    948    949    948    945    946    947    945    946   
146670        947    950    948    947    950    952    946    951    946   

            site10  
session_id          
21669            0  
54843            0  
77292          946  
114021         946  
146670         947  


In [16]:
# Flatten row by row into one 1-D array: all site ids of the first session,
# then all of the second, and so on.
sites_flatten = full_sites.values.flatten()

In [17]:
# Build the session-by-site matrix straight from CSR components:
#   data    - a 1 for every visit slot,
#   indices - the site id in that slot (used as the column number),
#   indptr  - row boundaries, one row per session of `session_len` slots.
n_visits = sites_flatten.shape[0]
session_len = full_sites.shape[1]
visit_flags = np.ones(n_visits, dtype=int)
row_starts = np.arange(0, n_visits + session_len, session_len)
full_sites_sparse = csr_matrix((visit_flags, sites_flatten, row_starts))

# Column 0 corresponds to id 0, the filler used for missing visits, so drop it.
full_sites_sparse = full_sites_sparse[:, 1:]

In [18]:
# Display the matrix summary (shape, dtype, number of stored elements).
full_sites_sparse

<336358x48371 sparse matrix of type '<class 'numpy.int32'>'
	with 3195430 stored elements in Compressed Sparse Row format>

In [19]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio=0.8):
    """Fit logistic regression on the leading rows and return ROC AUC on the rest.

    The split is positional, not random: the first ``ratio`` share of the rows
    is used for training and the remaining rows for validation, so the rows
    must already be in the order the caller wants to split on.

    Parameters
    ----------
    X : 2-D feature matrix supporting ``X[a:b, :]`` and ``.shape``
        (for example a scipy CSR matrix or a numpy array).
    y : array-like of labels, one per row of ``X``, in the same row order.
    C : inverse regularization strength passed to ``LogisticRegression``.
    seed : ``random_state`` passed to ``LogisticRegression``.
    ratio : fraction of the rows used for training.

    Returns
    -------
    ROC AUC of the predicted class-1 probabilities on the validation rows.

    Raises
    ------
    ValueError
        If ``y`` does not have one label per row of ``X``, or if ``ratio``
        leaves the training or the validation part empty.
    """
    n_rows = X.shape[0]

    # Plain array: slices below are then positional no matter what index a
    # pandas Series passed as ``y`` carries.
    y = np.asarray(y)
    if y.shape[0] != n_rows:
        raise ValueError(
            "X has %d rows but y has %d labels" % (n_rows, y.shape[0])
        )

    # Split the data into the training and validation sets
    idx = int(round(n_rows * ratio))
    if not 0 < idx < n_rows:
        raise ValueError(
            "ratio=%r leaves an empty training or validation set for %d rows"
            % (ratio, n_rows)
        )

    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, n_jobs=-1).fit(X[:idx, :], y[:idx])
    # Prediction for validation set (column 1 = probability of the second class)
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    # Calculate the quality
    score = roc_auc_score(y[idx:], y_pred)

    return score

In [20]:
%%time
# The first idx_split rows of the stacked matrix are the training sessions
# (the part for which labels are available).
X_train = full_sites_sparse[:idx_split, :]

# ROC AUC on the validation part, using the function's default C, seed and ratio.
print(get_auc_lr_valid(X_train, y_train))

0.921492734896
Wall time: 7.28 s


In [21]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV file with a 1-based id column.

    Parameters
    ----------
    predicted_labels : array-like of predictions (numpy array, list or pandas
        Series); row ``i`` (0-based) is written with id ``i + 1``.
    out_file : path or writable text buffer passed to ``DataFrame.to_csv``.
    target : header of the prediction column.
    index_label : header of the id column.
    """
    # Convert to a plain array first: this accepts lists, and it drops any
    # pandas index so values are numbered by position instead of being
    # re-aligned against the new 1..n index.
    values = np.asarray(predicted_labels)
    row_ids = np.arange(1, values.shape[0] + 1)
    predicted_df = pd.DataFrame(values, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [22]:
# Refit on all training rows (no hold-out) with C=1.0 and random_state=17.
lr = LogisticRegression(C=1.0, random_state=17).fit(X_train, y_train)

In [23]:
# Rows from idx_split onward in the stacked matrix are the test sessions.
X_test = full_sites_sparse[idx_split:,:]
# NOTE: despite its name, y_test holds model output (column 1 of predict_proba,
# i.e. the probability of the second class), not true labels.
y_test = lr.predict_proba(X_test)[:,1]

In [24]:
# Save the predicted probabilities for the test sessions as a CSV file.
write_to_submission_file(y_test,'baseline11.csv')