In [2]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read both session tables the same way, keyed by their session_id column.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='session_id')
    for csv_path in ("data/train_sessions.csv", "data/test_sessions.csv")
)

In [4]:
# Names of the ten site-id columns of a session: site1 .. site10.
sites = [f"site{slot}" for slot in range(1, 11)]

In [5]:
# Names of the ten timestamp columns of a session: time1 .. time10.
times = [f"time{slot}" for slot in range(1, 11)]

In [6]:
# Parse every timeN column into datetime values, column by column;
# slots without a visit show up as NaT in the parsed frame.
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

In [7]:
# Order the training sessions by their first timestamp. get_auc_lr_valid splits
# rows by position, so after this sort its validation part is the latest sessions.
train_df = train_df.sort_values(by='time1')

In [8]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [9]:
# Unused site slots are NaN; encode them as id 0 and store all ids as integers.
# Column 0 is dropped when the sparse matrix is built, so 0 acts as "no site".
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")

In [11]:
# Load the pickled site dictionary that ships with the data set.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file from a trusted source.
with open("data/site_dic.pkl", 'rb') as input_file:
    site_dict = pickle.load(input_file)

In [12]:
# Turn the dictionary into a lookup table: its values become the index and
# its keys fill the single 'site' column.
sites_dict_df = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)

In [13]:
# Labels of the training sessions, in the same (time-sorted) row order as train_df.
y_train = train_df['target']

In [14]:
# Stack train (without its label column) on top of test so both are encoded
# together; train rows come first, which idx_split relies on to separate them again.
full_df  = pd.concat((train_df.drop('target', axis=1), test_df))

In [15]:
# Number of training rows = position where the test rows start in full_df.
idx_split = len(train_df)

In [16]:
# Keep only the site-id columns (site1..site10) of the stacked frame.
full_sites = full_df[sites]

In [17]:
full_sites

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947
...,...,...,...,...,...,...,...,...,...,...
82793,812,1039,676,0,0,0,0,0,0,0
82794,300,302,302,300,300,1222,302,1218,1221,1216
82795,29,33,35,22,37,6779,30,21,23,6780
82796,5828,23,21,804,21,3350,23,894,21,961


In [18]:
# Flatten the session-by-slot id table into one 1-D array, session after session
# (flatten() uses row-major order by default), as input for the CSR constructor.
sites_flatten = full_sites.values.flatten()

In [19]:
# Build a session-by-site matrix in CSR form straight from the flattened ids:
#   data    - a 1 for every site slot (allocated as an array, not a Python list),
#   indices - the site id stored in that slot, used as the column index,
#   indptr  - row boundaries; every session owns len(sites) consecutive slots.
# The row width is taken from `sites` instead of a hard-coded 10.
# Column 0 corresponds to the placeholder id for empty slots and is sliced away.
full_sites_sparce = csr_matrix(
    (
        np.ones(sites_flatten.shape[0], dtype=int),
        sites_flatten,
        range(0, sites_flatten.shape[0] + len(sites), len(sites)),
    )
)[:, 1:]

In [20]:
# The first idx_split rows are the training sessions, the remainder the test sessions.
X_train_sparce, X_test_sparce = (
    full_sites_sparce[:idx_split],
    full_sites_sparce[idx_split:],
)

In [21]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    """Fit a logistic regression on the head of the data and score it on the tail.

    The split is positional (no shuffling): the first ``ratio`` share of the
    rows is used for training and the remaining rows for validation.

    Parameters
    ----------
    X : array, DataFrame or scipy sparse matrix
        Feature rows. Sparse input of any format is accepted; it is converted
        to CSR first because not every sparse format (e.g. COO) can be
        row-sliced.
    y : sequence or Series
        Labels, aligned by position with the rows of ``X``.
    C : float, default 1.0
        Passed to ``LogisticRegression(C=...)``.
    ratio : float, default 0.9
        Share of rows (taken from the start) used for training.
    seed : int, default 17
        Passed to ``LogisticRegression(random_state=...)``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation rows.

    Raises
    ------
    ValueError
        If ``ratio`` would leave the training or the validation part empty.
    """
    # Local import keeps the function self-contained; the notebook's import
    # cell only pulls csr_matrix and hstack from scipy.sparse.
    from scipy.sparse import issparse

    if issparse(X):
        # No-op for CSR input; makes COO/CSC/... sliceable by row.
        X = X.tocsr()

    n_rows = X.shape[0]
    train_len = int(ratio * n_rows)
    if not 0 < train_len < n_rows:
        raise ValueError(
            "ratio=%r leaves an empty train or validation part for %d rows"
            % (ratio, n_rows)
        )

    # Positional head/tail split of features and labels.
    X_train = X[:train_len]
    X_test = X[train_len:]
    y_train = y[:train_len]
    y_test = y[train_len:]

    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    logit.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    valid_pred = logit.predict_proba(X_test)[:, 1]

    return roc_auc_score(y_test, valid_pred)


In [22]:
# Baseline: holdout ROC AUC when only the sparse site features are used.
get_auc_lr_valid(X_train_sparce, y_train)

0.9197949536813961

In [23]:
# Fit on all training sessions (no holdout) before predicting the test set.
logit = LogisticRegression(random_state=17,n_jobs=-1)
logit.fit(X_train_sparce,y_train)

LogisticRegression(n_jobs=-1, random_state=17)

In [24]:
# Predicted probability of the positive class (column 1) for every test session.
test_predict = logit.predict_proba(X_test_sparce)[:,1]

In [25]:
test_predict.shape

(82797,)

In [26]:
# Write the submission file. The predictions are in test_df row order, so label
# them with the real session ids instead of assuming the ids are exactly 1..N.
pd.Series(test_predict,
          index=test_df.index,
          name='target').to_csv("data/benchmark1.csv", header=True, index_label='session_id')

In [27]:
# Empty feature frames that reuse the session_id index of train and test.
new_feat_train, new_feat_test = (
    pd.DataFrame(index=frame.index) for frame in (train_df, test_df)
)

In [28]:
# Encode the month in which a session starts as the integer YYYYMM (e.g. 201301).
# Uses the vectorised .dt accessor instead of calling a Python lambda per row.
new_feat_train['year_month'] = 100 * train_df['time1'].dt.year + train_df['time1'].dt.month
new_feat_test['year_month'] = 100 * test_df['time1'].dt.year + test_df['time1'].dt.month

In [29]:
# Standardise year_month: the scaler is fitted on the training rows only and the
# same fitted transformation is then applied to both frames.
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1,1))

# reshape(-1,1) presents the single column as a 2-D (n_rows, 1) array.
new_feat_test['year_month_scaled'] = scaler.transform(new_feat_test['year_month'].values.reshape(-1,1))
new_feat_train['year_month_scaled'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1,1))

In [30]:
# Append the scaled year_month column to the right of the sparse site matrix.
# NOTE(review): with these inputs scipy's hstack presumably returns a COO matrix,
# which cannot be row-sliced; convert with .tocsr() before slicing it — confirm.
X_train_sparce_new = hstack([X_train_sparce, new_feat_train['year_month_scaled'].values.reshape(-1,1)])

In [31]:
# Hour of day (0-23) at which each session starts, read through the vectorised
# .dt accessor instead of a per-row lambda.
new_feat_test['start_hour'] = test_df['time1'].dt.hour
new_feat_train['start_hour'] = train_df['time1'].dt.hour

# 1 when the session starts at 11:59 or earlier, 0 otherwise.
new_feat_test['morning'] = (new_feat_test['start_hour'] <= 11).astype(int)
new_feat_train['morning'] = (new_feat_train['start_hour'] <= 11).astype(int)

In [32]:
# Holdout ROC AUC using only the hand-made columns of new_feat_train.
# NOTE(review): the frame passed here holds year_month and year_month_scaled
# together with start_hour and morning, and the sparse site matrix is not part
# of it — confirm this is the intended feature set.
get_auc_lr_valid(new_feat_train, y_train)

0.8811486422025218