In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack,csr_matrix

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
train_df = pd.read_csv('./data/all/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('./data/all/test_sessions.csv',
                      index_col='session_id')

In [4]:
# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
y_train = train_df['target']

full_df = pd.concat([train_df.drop('target', axis=1), test_df])

idx_split = train_df.shape[0]

In [6]:
full_df.shape

(336358, 20)

In [7]:
sites = ['site%s' % i for i in range(1, 11)]
full_sites = full_df[sites]

sites_flatten = full_sites.values.flatten()

full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0]  + 10, 10)))[:, 1:]

In [7]:
# %%time
# sites = ['site%s' % i for i in range(1, 11)]

# full_df[sites].fillna(0).astype('int').to_csv('./data/all/full_sessions_text.txt', 
#                                                sep=' ', index=None, header=None)

# cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)

# with open('./data/all/full_sessions_text.txt') as inp_train_file:
#     full_sites_sparse = cv.fit_transform(inp_train_file)

Wall time: 39.5 s


In [8]:
full_sites_sparse.shape

(336358, 48371)

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

Tf_trn = TfidfTransformer()

full_sites_sparse = Tf_trn.fit_transform(full_sites_sparse).tocsr()

full_sites_sparse.shape

(336358, 50000)

In [9]:
def add_time_features(df, X_sparse):
    
    full_new_feature = pd.DataFrame(index=df.index)
    hour = df['time1'].apply(lambda ts: ts.hour)
    full_new_feature['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
    full_new_feature["day"] = ((hour >= 12) & (hour <= 18)).astype('int')
    full_new_feature["evening"] = ((hour >= 19) & (hour <= 23)).astype('int')
    full_new_feature["night"] = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, full_new_feature.values])
    return X

In [10]:
%%time
full_sites_sparse = add_time_features(full_df.fillna(0), full_sites_sparse)

Wall time: 2min 28s


In [11]:
full_sites_sparse.shape

(336358, 48375)

In [12]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    # Split the data into the training and validation sets
    idx = int(round(X.shape[0] * ratio))
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, solver='liblinear').fit(X[:idx, :], y[:idx])
    # Prediction for validation set
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    # Calculate the quality
    score = roc_auc_score(y[idx:], y_pred)
    
    return score

In [88]:
def get_auc_roc_cv(X,y,best_C ={'C':1},seed=17):
    lr = LogisticRegression(C=best_C['C'], random_state=seed, solver='liblinear')
    time_split = TimeSeriesSplit(n_splits=10)    
    scores =cross_val_score(lr,X,y,cv=time_split,scoring='roc_auc')
    return (scores,scores.mean(),scores.std())

In [41]:
def cross_val_C_calc(X,y, c_values,seed=17):
    
    logit = LogisticRegression(C=1, random_state=seed)
    time_split = TimeSeriesSplit(n_splits=10)
    
    logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                      scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)
    logit_grid_searcher.fit(X, y)

    return (logit_grid_searcher.best_score_, logit_grid_searcher.best_params_, logit_grid_searcher)

In [98]:
X_train = full_sites_sparse.tocsr()[:idx_split,:]
X_test = full_sites_sparse.tocsr()[idx_split:,:]

In [21]:
c_values = np.logspace(-2, 2, 10)

best_score, best_C = cross_val_C_calc(X_train,y_train,c_values)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.7min finished


Wall time: 2min 50s


In [22]:
(best_score,best_C) #(0.9247612064520712, {'C': 1.6681005372000592})

(0.9247612064520712, {'C': 1.6681005372000592})

In [92]:
get_auc_lr_valid(X_train,y_train)#0.9529920800544904

0.9488193297151332

In [None]:
get_auc_roc_cv(X_train,y_train)

In [None]:
# logit_grid_searcher.best_score_, logit_grid_searcher.best_params_  ## 0.94615 PB ##0.92498 val

In [93]:
full_new_feature = pd.DataFrame(index=full_df.index)
full_new_feature['start_hour'] = full_df.time1.dt.hour
full_new_feature['start_month'] = full_df['time1'].apply(lambda ts:100 * ts.year + ts.month).astype('float64')

tmp_scaled_full = StandardScaler().fit_transform(full_new_feature[['start_month','start_hour']])

In [94]:
X_train = hstack([full_sites_sparse.tocsr()[:idx_split,:],tmp_scaled_full[:idx_split,:]]).tocsr()
X_test = hstack([full_sites_sparse.tocsr()[idx_split:,:],tmp_scaled_full[idx_split:,:]]).tocsr()

In [95]:
get_auc_lr_valid(X_train,y_train)

0.9679433472699255

In [17]:
c_values = np.logspace(-2, 2, 10)
best_score, best_C = cross_val_C_calc(X_train,y_train,c_values)
(best_score,best_C) #(0.915237083184964, {'C': 0.21544346900318834})

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.9min finished


(0.915237083184964, {'C': 0.21544346900318834})

In [97]:
get_auc_roc_cv(X_train,y_train)

(array([0.78740524, 0.73398389, 0.96401056, 0.9774022 , 0.89305302,
        0.95997737, 0.95151356, 0.93562682, 0.96876531, 0.96542777]),
 0.913716573717184,
 0.08063737402725764)

In [42]:
lr = LogisticRegression(C=best_C['C'], random_state=17, solver='liblinear').fit(X_train, y_train)
y_test = lr.predict_proba(X_test)[:, 1]
write_to_submission_file(y_test, 'sumb_8.csv') # LB 0.92909, c=1.66 

In [43]:
lr = LogisticRegression(C=C, random_state=17, solver='liblinear').fit(X_train, y_train)
y_test = lr.predict_proba(X_test)[:, 1]
write_to_submission_file(y_test, 'sumb_9.csv') #LB0.92780, c=1.29 

#### Add time difference between sessions

In [81]:
timedf2.head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [82]:
timedf2 = full_df[times]
timedff = timedf2.diff(axis=1,periods=1)

In [84]:
timedff.head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,NaT,00:00:00,91034 days 15:41:19.854776,00:00:00,0 days 00:00:00,0 days,00:00:00,00:00:00,00:00:00,00:00:00
54843,NaT,00:00:00,0 days 00:29:44,00:00:02,91034 days 14:40:07.854776,0 days,00:00:00,00:00:00,00:00:00,00:00:00
77292,NaT,00:00:01,0 days 00:00:01,00:00:00,0 days 00:00:01,0 days,00:00:00,00:00:00,00:00:01,00:00:00
114021,NaT,00:00:00,0 days 00:00:01,00:00:00,0 days 00:00:00,0 days,00:00:01,00:00:00,00:00:00,00:00:01
146670,NaT,00:00:00,0 days 00:00:00,00:00:01,0 days 00:00:00,0 days,00:00:00,00:00:01,00:00:00,00:00:00


In [85]:
timedff = timedff.apply(lambda x: x.dt.seconds).iloc[:,1:]

In [86]:
timedff.head()

Unnamed: 0_level_0,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21669,0,56479,0,0,0,0,0,0,0
54843,0,1784,2,52807,0,0,0,0,0
77292,1,1,0,1,0,0,0,1,0
114021,0,1,0,0,0,1,0,0,1
146670,0,0,1,0,0,0,1,0,0


In [None]:
def func3(x):
    if x.last_valid_index() is None:
        return None
    else:
        return x[x.last_valid_index()]
    
last_site =full_df[sites].apply(func2, axis=1)

In [66]:
tmp_scaled_time = StandardScaler().fit_transform(timedff)

In [73]:
X_train = hstack([full_sites_sparse.tocsr()[:idx_split,:],tmp_scaled_time[:idx_split,:],tmp_scaled_full[:idx_split,:],
                  first_site_normalize[:idx_split,:],last_site_normalize[:idx_split,:]]).tocsr()
X_test = hstack([full_sites_sparse.tocsr()[idx_split:,:],tmp_scaled_time[idx_split:,:],tmp_scaled_full[idx_split:,:],
                first_site_normalize[idx_split:,:],last_site_normalize[idx_split:,:]]).tocsr()

In [74]:
X_train.shape

(253561, 48388)

In [76]:
c_values = np.logspace(-2, 2, 10)
best_score, best_C,logit_Grid = cross_val_C_calc(X_train,y_train,c_values)
(best_score,best_C)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.7min finished


(0.9153056984488351, {'C': 0.5994842503189409})

In [77]:
lr = LogisticRegression(C=best_C['C'], random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)    
scores =cross_val_score(lr,X_train,y_train,cv=time_split,scoring='roc_auc')
(scores,scores.mean(),scores.std())

(array([0.79666801, 0.73981366, 0.96265773, 0.98151405, 0.89352197,
        0.96156984, 0.94943119, 0.93386802, 0.9681774 , 0.96583512]),
 0.915305698448835,
 0.07808378013987449)

In [78]:
lr = LogisticRegression(C=best_C['C'], random_state=17, solver='liblinear').fit(X_train, y_train)
y_test = lr.predict_proba(X_test)[:, 1]
write_to_submission_file(y_test, 'sumb_10.csv') # LB 0.92521, c=0.5994842503189409 cv:0.9153056984488351

Adding first site and last site

In [26]:
def func1(x):
    if x.first_valid_index() is None:
        return None
    else:
        return x[x.first_valid_index()]
    
def func2(x):
    if x.last_valid_index() is None:
        return None
    else:
        return x[x.last_valid_index()]
    
last_site =full_df[sites].apply(func2, axis=1)
first_site =full_df[sites].apply(func2, axis=1)

first_site.to_pickle('./data/all/first_site.pkl')
last_site.to_pickle('./data/all/last_site.pkl')

In [28]:
first_site.head()

session_id
21669      55.0
54843      55.0
77292     946.0
114021    946.0
146670    947.0
dtype: float64

In [34]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [29]:
first_site = LabelEncoder().fit_transform(first_site)
last_site = LabelEncoder().fit_transform(last_site)

In [46]:
first_site_normalize = StandardScaler().fit_transform(first_site.reshape(-1, 1))
last_site_normalize= StandardScaler().fit_transform(last_site.reshape(-1, 1))



In [47]:
X_train = hstack([full_sites_sparse.tocsr()[:idx_split,:],first_site_normalize[:idx_split,:]]).tocsr()
X_test = hstack([full_sites_sparse.tocsr()[idx_split:,:],first_site_normalize[idx_split:,:]]).tocsr()

In [48]:
lr = LogisticRegression(C=best_C['C'], random_state=17, solver='liblinear')
time_split = TimeSeriesSplit(n_splits=10)    
scores =cross_val_score(lr,X_train,y_train,cv=time_split,scoring='roc_auc')
(scores,scores.mean(),scores.std())

(array([0.87167254, 0.75774452, 0.91723685, 0.96616951, 0.90648696,
        0.94352683, 0.94998777, 0.92310685, 0.95223493, 0.946856  ]),
 0.9135022747761145,
 0.05812470028812712)

In [38]:
lr = LogisticRegression(C=best_C['C'], random_state=17, solver='liblinear').fit(X_train, y_train)
y_test = lr.predict_proba(X_test)[:, 1]
write_to_submission_file(y_test, 'sumb_9.csv') # LB 0.94013, c=0.21544346900318834, cv:0.9135877619314569