In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
train_df = pd.read_csv('./data/all/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('./data/all/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('./data/all/train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('./data/all/test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [5]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('./data/all/train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
    
with open('./data/all/test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
    
X_train.shape, X_test.shape

Wall time: 23.3 s


In [6]:
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [7]:
y_train = train_df.target.astype('int')

In [8]:
time_split = TimeSeriesSplit(n_splits=10)

In [None]:
# [(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

### Just using count vector

In [9]:
logit = LogisticRegression(C=1, random_state=17)

In [None]:
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

In [None]:
cv_scores, cv_scores.mean()

In [None]:
logit.fit(X_train, y_train)

### Adding more features

In [10]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [11]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

Wall time: 2min 27s


In [None]:
X_train_new.shape, X_test_new.shape

In [None]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

In [None]:
cv_scores, cv_scores.mean()

In [None]:
logit.fit(X_train_new, y_train)

In [None]:
logit_test_pred3 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'baseline_3.csv')

### finding optimum value for C

In [None]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [None]:
%%time
logit_grid_searcher.fit(X_train_new, y_train)

In [None]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

In [None]:
c=logit_grid_searcher.best_params_

In [38]:
c= 0.16

In [None]:
logit_test_pred4 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred4, 'sumb_4.csv')

### tfid

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer , TfidfTransformer

In [16]:
tf = TfidfVectorizer(ngram_range=(1,3),max_features=50000,)

In [23]:
Tf_trn = TfidfTransformer()

In [27]:
X_train_tfid = Tf_trn.fit_transform(X_train)
X_test_tfid = Tf_trn.fit_transform(X_test)

In [33]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train_tfid)
X_test_new = add_time_features(test_df.fillna(0), X_test_tfid)

Wall time: 2min 28s


In [34]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

Wall time: 17.9 s


In [35]:
cv_scores,cv_scores.mean()

array([0.87119036, 0.80638135, 0.92863515, 0.96590791, 0.9158636 ,
       0.95176604, 0.94829952, 0.93836686, 0.95197759, 0.95070139])

In [36]:
cv_scores.mean()

0.9229089769634182

In [41]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)


In [42]:
%%time
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.6min finished


Wall time: 2min 43s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=0.16, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [43]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9249802732195809, {'C': 1.6681005372000592})

In [39]:
# logit = LogisticRegression(C=c, random_state=17)
# logit.fit(X_train_new, y_train)

LogisticRegression(C=0.16, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
logit_test_pred6 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred6, 'sumb_6.csv') # 0.94615

### Adding extra features

In [None]:
def add_new_time_features(df, X_sparse):
    hour1 = df['time1'].apply(lambda ts: ts.hour)
    
#     morning = ((hour >= 7) & (hour <= 11)).astype('int')
#     day = ((hour >= 12) & (hour <= 18)).astype('int')
#     evening = ((hour >= 19) & (hour <= 23)).astype('int')
#     night = ((hour >= 0) & (hour <= 6)).astype('int')    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [None]:
top_sites = pd.Series(train_df.loc[train_df.target==1,sites].values.flatten()
                     ).value_counts().sort_values(ascending=False).head(5)
print(top_sites)
sites_dict.loc[top_sites.index]

In [12]:
len(set(train_df.loc[train_df.target==1,sites].values.flatten()))

1254

In [57]:
cust = lambda g : (train_df.loc[g.index]['time2'].dt.hour - train_df.loc[g.index]['time1'].dt.hour).mean() 
f = {'time2': {'diff':cust}}
train_df.groupby('target').agg(f)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,time2
Unnamed: 0_level_1,diff
target,Unnamed: 1_level_2
0,0.005077
1,0.001744


In [73]:
# ((train_df.time2-train_df.time1)/np.timedelta64(1, 's')).tail()

In [71]:
(train_df.time2-train_df.time1).astype('timedelta64[s]').tail()

session_id
12224     1.0
164438    1.0
12221     2.0
156968    0.0
204762    6.0
dtype: float64