In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from pathlib2 import Path

In [53]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

Read training and test sets, sort train set by session start time.

In [186]:
# Read the training and test data sets, change paths if needed
PATH_TO_DATA = Path('../../data/alice/')

times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv(PATH_TO_DATA / 'train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv(PATH_TO_DATA / 'test_sessions.csv',
                      index_col='session_id', parse_dates=times)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


Transform data into format which can be fed into CountVectorizer

In [4]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)


In [5]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


Fit CountVectorizer and transfrom data with it.

In [6]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: user 10.6 s, sys: 256 ms, total: 10.8 s
Wall time: 10.8 s


((253561, 50000), (82797, 50000))

In [7]:
cv.vocabulary_

{'56': 37327,
 '55': 36748,
 '56 55': 37369,
 '55 56': 36949,
 '56 55 56': 37399,
 '55 56 55': 36967,
 '946': 49032,
 '951': 49150,
 '945': 49024,
 '948': 49104,
 '784': 44765,
 '949': 49146,
 '946 946': 49037,
 '946 951': 49047,
 '951 946': 49153,
 '946 945': 49036,
 '945 948': 49028,
 '948 784': 49120,
 '947': 49085,
 '948 945': 49121,
 '945 946': 49025,
 '946 947': 49041,
 '947 945': 49086,
 '950': 49148,
 '952': 49162,
 '947 950': 49094,
 '948 947': 49124,
 '953': 49165,
 '955': 49206,
 '947 953': 49097,
 '953 946': 49183,
 '947 946': 49087,
 '946 953': 49049,
 '953 955': 49198,
 '955 946': 49207,
 '947 953 946': 49099,
 '953 946 947': 49185,
 '946 947 946': 49042,
 '947 946 953': 49090,
 '1033': 744,
 '953 947': 49187,
 '953 1033': 49166,
 '953 947 946': 49188,
 '953 946 953': 49186,
 '954': 49203,
 '956': 49224,
 '957': 49233,
 '946 954': 49052,
 '946 956': 49054,
 '956 946': 49225,
 '946 955': 49053,
 '946 948': 49044,
 '946 946 946': 49038,
 '946 946 948': 49040,
 '49': 34338,


Save train targets into a separate vector.

In [8]:
y_train = train_df['target'].astype('int').values

We'll be performing time series cross-validation, see sklearn TimeSeriesSplit and this dicussion on StackOverflow.

In [9]:
time_split = TimeSeriesSplit(n_splits=10)

In [10]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

Perform time series cross-validation with logistic regression

In [11]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [12]:
%%time
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=8)

CPU times: user 79.6 ms, sys: 109 ms, total: 189 ms
Wall time: 12.5 s


In [13]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64669477, 0.87991957, 0.9631551 , 0.84221742,
        0.87840596, 0.94476054, 0.85321751, 0.92987618, 0.90752852]),
 0.8677195468747974)

In [14]:
%%time
logit.fit(X_train, y_train)

CPU times: user 21.6 s, sys: 89.6 ms, total: 21.7 s
Wall time: 14.3 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288

Now we'll add some time features: indicators of morning, day, evening and night.

In [38]:
def add_time_features(df, X_sparse):
    #hour = df['time1'].apply(lambda ts: ts.hour)
    hour = df['time1'].dt.hour
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [17]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

CPU times: user 1min 38s, sys: 1.54 s, total: 1min 40s
Wall time: 1min 39s


In [18]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [19]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=8)

CPU times: user 56.3 ms, sys: 52.1 ms, total: 108 ms
Wall time: 12.6 s


In [20]:
cv_scores, cv_scores.mean()

(array([0.87652191, 0.75122947, 0.93061982, 0.97864617, 0.90399626,
        0.93831404, 0.96249244, 0.92731339, 0.94886477, 0.9404357 ]),
 0.9158433979627594)

In [21]:
logit.fit(X_train_new, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

Now we tune regularization parameter C.

In [23]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=10, cv=time_split, verbose=1)

In [24]:
%%time
logit_grid_searcher.fit(X_train_new, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    7.8s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  2.0min finished


CPU times: user 10.1 s, sys: 147 ms, total: 10.3 s
Wall time: 2min 5s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [31]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9173760760277387, {'C': 0.21544346900318834})

In [32]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242

# let's try to beat 0.95343

In [114]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from pathlib2 import Path
from mlxtend.feature_selection import ColumnSelector


In [15]:
PATH_TO_DATA = Path('../../data/alice/')


sites = ['site%s' % i for i in range(1, 11)]
times = ['time%s' % i for i in range(1, 11)]

train_df = pd.read_csv(PATH_TO_DATA / 'train_sessions.csv', index_col='session_id', parse_dates=times)
train_df = train_df.sort_values('time1')

test_df = pd.read_csv(PATH_TO_DATA / 'test_sessions.csv', index_col='session_id', parse_dates=times)
y_train = train_df['target']

In [16]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [139]:
class SiteStringer(BaseEstimator, TransformerMixin):
    """Takes raw dataframe and makes a list of strings with site IDs"""
    def fit(self, x, y=None):
        return self
    
    def transform(self, df):
        return (df.fillna(0)
                  .astype('int')
                  .to_csv(header=None, sep=' ', index=None)
                  .strip('\n')
                  .split('\n'))


class TimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Takes a dataframe of times and makes time features"""
    def fit(self, x, y=None):
        return self
        
    def transform(self, df):
        features = pd.DataFrame(index=df.index)

        hour = df['time1'].dt.hour
        features['is_morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
        features['is_day'] = ((hour >= 12) & (hour <= 18)).astype('int')
        features['is_evening'] = ((hour >= 19) & (hour <= 23)).astype('int')
        features['is_night'] = ((hour >= 0) & (hour <= 6)).astype('int')
        
        # Find sessions' starting and ending
        min_time = df[times].min(axis=1)
        max_time = df[times].max(axis=1)

        # Calculate sessions' duration in seconds
        features['session_length'] = (max_time - min_time) / np.timedelta64(1, 's')
        features['day_of_week'] = df['time1'].dt.dayofweek
        features['month'] = df['time1'].dt.month

        return features

In [140]:
# build a site_transformer
site_transformer = Pipeline(steps=[
    ('to_string', SiteStringer()),
    ('count_vec', CountVectorizer(ngram_range=(1, 3), max_features=50000)),
])
preprocessor = ColumnTransformer(transformers=[
    ('sites', site_transformer, sites),
])


In [141]:
# build a site_transformer
site_transformer = Pipeline(steps=[
    ('to_string', SiteStringer()),
    ('count_vec', CountVectorizer(ngram_range=(1, 5), max_features=50000)),
    ('tfidf', TfidfTransformer())
])

In [142]:
# build a time_transformer
time_transformer = Pipeline(steps=[
    ('basic', TimeFeatureExtractor()),
    ('select', ColumnSelector(cols=('is_morning', 'is_day', 'is_evening', 'is_night')))
])

In [128]:
# apply the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('sites', site_transformer, sites),
    ('times', time_transformer, times)
])

In [129]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [136]:
model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

In [137]:
model.steps

[('prep',
  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
           transformer_weights=None,
           transformers=[('sites', Pipeline(memory=None,
       steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=50000,...e))]), ['time1', 'time2', 'time3', 'time4', 'time5', 'time6', 'time7', 'time8', 'time9', 'time10'])])),
 ('logit',
  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='warn',
            n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
            tol=0.0001, verbose=0, warm_start=False))]

In [131]:
model.fit(train_df, y_train)

Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [54]:
preds = model.predict_proba(test_df)[:, 1]
write_to_submission_file(preds, 'subm5.csv') # 0.94683

In [56]:
time_split = TimeSeriesSplit(n_splits=10)

In [57]:
c_values = np.logspace(-2, 2, 10)

model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

model_grid_searcher = GridSearchCV(estimator=model, param_grid={'logit__C': c_values},
                                  scoring='roc_auc', n_jobs=10, cv=time_split, verbose=1)

CPU times: user 316 µs, sys: 7 µs, total: 323 µs
Wall time: 312 µs


In [58]:
%%time
model_grid_searcher.fit(train_df, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   54.3s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  3.2min finished


CPU times: user 21.2 s, sys: 590 ms, total: 21.8 s
Wall time: 3min 31s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'logit__C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [60]:
model_grid_searcher.best_score_, model_grid_searcher.best_params_

(0.9239997164996069, {'logit__C': 1.6681005372000592})

In [61]:
preds = model_grid_searcher.predict_proba(test_df)[:, 1]
write_to_submission_file(preds, 'subm6.csv') # 0.94707

In [62]:
C = 0.21544346900318834

In [63]:
logit = LogisticRegression(C=C, random_state=17, solver='liblinear')

In [64]:
model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

In [65]:
model.fit(train_df, y_train)

Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [66]:
preds = model.predict_proba(test_df)[:, 1]
write_to_submission_file(preds, 'subm7.csv') # 0.94267

In [80]:
# we need to scale time

In [85]:
# build a time_transformer
time_transformer = Pipeline(steps=[
    ('basic', TimeFeatureMaker()),
    ('picker', ColumnTransformer([
        ('session_length', StandardScaler(), ['session_length'])
    ], remainder='passthrough'))
])

In [86]:
# apply the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('sites', site_transformer, sites),
    ('times', time_transformer, times)
])

In [87]:
c_values = np.logspace(-2, 2, 10)

model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

model_grid_searcher = GridSearchCV(estimator=model, param_grid={'logit__C': c_values},
                                  scoring='roc_auc', n_jobs=10, cv=time_split, verbose=1)

In [88]:
%%time
model_grid_searcher.fit(train_df, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   51.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  3.1min finished


CPU times: user 17.7 s, sys: 645 ms, total: 18.3 s
Wall time: 3min 19s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'logit__C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [89]:
model_grid_searcher.best_score_, model_grid_searcher.best_params_

(0.9239897653734674, {'logit__C': 1.6681005372000592})

In [90]:
preds = model_grid_searcher.predict_proba(test_df)[:, 1]
write_to_submission_file(preds, 'subm8.csv') # 0.94707

# try lasso

In [91]:
logit = LogisticRegression(penalty='l1', C=1, random_state=17, solver='liblinear')

In [92]:
model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

model_grid_searcher = GridSearchCV(estimator=model, param_grid={'logit__C': c_values},
                                  scoring='roc_auc', n_jobs=10, cv=time_split, verbose=1)

In [93]:
%%time
model_grid_searcher.fit(train_df, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   50.3s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  4.9min finished


CPU times: user 18.8 s, sys: 667 ms, total: 19.4 s
Wall time: 5min 12s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l1', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'logit__C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [94]:
model_grid_searcher.best_score_, model_grid_searcher.best_params_

(0.9246915254488547, {'logit__C': 4.6415888336127775})

In [95]:
preds = model_grid_searcher.predict_proba(test_df)[:, 1]
write_to_submission_file(preds, 'subm9.csv') # 0.94215

# try category encoding

In [155]:
from category_encoders.target_encoder import TargetEncoder

In [156]:
# build a site_transformer
site_transformer = Pipeline(steps=[
    ('to_string', SiteStringer()),
    ('count_vec', CountVectorizer(ngram_range=(1, 5), max_features=50000)),
    ('tfidf', TfidfTransformer())
])

In [157]:
time_transformer = Pipeline(steps=[
    ('basic', TimeFeatureMaker()),
    ('picker', ColumnTransformer([
        ('scaler', StandardScaler(), ['session_length']),
        ('encoder', TargetEncoder(), ['day_of_week', 'month'])
    ], remainder='passthrough'))
])

In [158]:
preprocessor = ColumnTransformer(transformers=[
    ('sites', site_transformer, sites),
    ('times', time_transformer, times)
])

In [159]:
pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', logit)
])

In [160]:
c_values = np.logspace(-2, 2, 10)

grid_searcher = GridSearchCV(
    estimator=pipe,
    param_grid={'logit__C': c_values},
    scoring='roc_auc',
    n_jobs=10,
    cv=time_split,
    verbose=1
)

In [162]:
%%time
grid_searcher.fit(train_df, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  4.9min finished


CPU times: user 31.6 s, sys: 1.53 s, total: 33.1 s
Wall time: 5min 21s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('prep', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('sites', Pipeline(memory=None,
     steps=[('to_string', SiteStringer()), ('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='stric...alty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'logit__C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [163]:
grid_searcher.best_score_, grid_searcher.best_params_

(0.9169679468821296, {'logit__C': 4.6415888336127775})

In [168]:
grid_searcher.cv_results_['mean_test_score']

array([0.85429792, 0.8778973 , 0.89520675, 0.90656619, 0.91317331,
       0.91666104, 0.91696795, 0.91465547, 0.91240752, 0.90977036])

In [169]:
grid_searcher.cv_results_['std_test_score']

array([0.10938405, 0.09492241, 0.08389083, 0.07793173, 0.07216026,
       0.06709131, 0.06496537, 0.06392055, 0.06139666, 0.05857167])

In [161]:
model_pred = model.predict_proba(test_df)[:, 1]

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [244]:
model_test_pred2

array([5.96718434e-05, 7.39286651e-08, 7.13873722e-08, ...,
       2.15149267e-03, 1.79966206e-05, 3.15131966e-07])

In [160]:
logit_test_pred

array([1.46616461e-03, 2.05809873e-08, 4.68841892e-08, ...,
       2.88173848e-03, 5.58172648e-04, 1.41339344e-05])

In [245]:
model_pred

array([5.96147246e-05, 7.39316402e-08, 7.14386462e-08, ...,
       2.14930678e-03, 1.79816570e-05, 3.14517259e-07])

In [70]:
# tfidf
# 
# time of session in seconds
# day of week OHE
# start hour

In [114]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,NaT,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [None]:
def make_site_features(df):
    df[sites].fillna(0).to_csv('tmp_sessions.txt', sep=' ', index=None, header=None)
    

In [115]:
train_df[:2].to_string()

'            site1               time1  site2               time2   site3               time3  site4               time4  site5               time5   site6               time6   site7               time7   site8               time8   site9               time9  site10              time10  target\nsession_id                                                                                                                                                                                                                                                                                            \n1             718 2014-02-20 10:02:45    NaN                 NaT     NaN                 NaT    NaN                 NaT    NaN                 NaT     NaN                 NaT     NaN                 NaT     NaN                 NaT     NaN                 NaT     NaN                 NaT       0\n2             890 2014-02-22 11:19:50  941.0 2014-02-22 11:19:50  3847.0 2014-02-22 11:19:51  941.0 2014-02-22 

In [93]:
train_sessions = pd.read_table('train_sessions_text.txt', sep=' ', header=0).values

In [94]:
train_sessions

array([[  56,   55,   56, ...,    0,    0,    0],
       [ 946,  946,  951, ...,  784,  949,  946],
       [ 945,  948,  949, ...,  945,  946,  946],
       ...,
       [  52, 3346,  784, ..., 7330, 3594, 3329],
       [3328, 3324, 3599, ..., 3359, 3359, 3346],
       [ 222, 3346, 3346, ...,    0,    0,    0]])

In [73]:
transformer = TfidfTransformer(smooth_idf=False)

In [74]:
%%time
X_tfidf = transformer.fit_transform(X_train)

CPU times: user 151 ms, sys: 4.32 ms, total: 155 ms
Wall time: 129 ms


In [75]:
# time features
hour = train_df['time1'].dt.hour
morning = ((hour >= 7) & (hour <= 11)).astype('int')
day = ((hour >= 12) & (hour <= 18)).astype('int')
evening = ((hour >= 19) & (hour <= 23)).astype('int')
night = ((hour >= 0) & (hour <= 6)).astype('int')


In [61]:
day_of_week = train_df['time1'].dt.weekday
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(day_of_week.values.reshape(-1, 1))

TypeError: __init__() got an unexpected keyword argument 'drop'

In [62]:
??OneHotEncoder

array([ 8,  8,  8, ..., 23, 23, 23])

In [63]:
day_of_week_OHE = enc.transform(day_of_week.values.reshape(-1, 1))[:,1:]

In [64]:
%%time
X = hstack([X_tfidf,
            morning.values.reshape(-1, 1), 
            day.values.reshape(-1, 1),
            evening.values.reshape(-1, 1), 
            night.values.reshape(-1, 1),
            hour.values.reshape(-1,1),
            day_of_week_OHE])

CPU times: user 164 ms, sys: 12.6 ms, total: 177 ms
Wall time: 81.8 ms


In [65]:
cv_scores = cross_val_score(logit, X, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=8)

In [67]:
cv_scores, cv_scores.mean()

(array([0.75372165, 0.83503966, 0.89446062, 0.97918675, 0.90079151,
        0.97884438, 0.92108396, 0.94015448, 0.81492106, 0.97630635]),
 0.8994510407476037)

In [68]:
%%time
logit_grid_searcher.fit(X, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    5.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:   41.1s finished


CPU times: user 6.87 s, sys: 135 ms, total: 7 s
Wall time: 45.4 s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [69]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.8993688055795148, {'C': 1.6681005372000592})

# Lasso, get non-zero features

In [367]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from scipy.optimize import differential_evolution

X, y = make_classification(n_samples=2000, n_features=50, n_informative=10, random_state=10)
logit = LogisticRegression(penalty='l1', C=1.0)

target = 10

def func(C):
    logit = LogisticRegression(penalty='l1', C=C[0], solver='liblinear')
    logit.fit(X, y)
    n_nonzero = np.sum(logit.coef_ != 0)
    return (target-n_nonzero)**2

differential_evolution(func, bounds=[(0, 2)], tol=0.1, maxiter=5)

     fun: 0.0
 message: 'Maximum number of iterations has been exceeded.'
    nfev: 92
     nit: 5
 success: False
       x: array([0.02574225])

In [369]:
logit = LogisticRegression(penalty='l1', C=0.0257, solver='liblinear')
logit.fit(X, y)
np.sum(logit.coef_ != 0)

10

In [378]:
class LassoNSelector(BaseEstimator, TransformerMixin):
    """This transformer finds the best N coefficients in a Lasso"""

    def __init__(self, N=20):
        self.N = N
        self.cols = None
    
    def fit(self, x, y=None):
        def func(C):
            logit = LogisticRegression(penalty='l1', C=C[0], solver='liblinear')
            logit.fit(X, y)
            n_nonzero = np.sum(logit.coef_ != 0)
            return (self.N-n_nonzero)**2  
        
        differential_evolution(func, bounds=[(0, 2)], tol=0.1, maxiter=20)
        
        #assert np.sum(logit.coef_ !=0) == self.N

        self.cols = (logit.coef_ !=0)[0]
        return self
        
    def transform(self, X):
        return X[:, self.cols]

class PassThru(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return X
    
# data --> FeatureUnion(PassThru, Pipeline(LassoNSelector, PolynomialFeatures))

In [371]:
lns = LassoNSelector(10)
sel = np.repeat(False, 50)
sel[0] = True

In [372]:
lns.fit(X, y)

LassoNSelector(N=10)

In [375]:
lns.transform(X).shape

(2000, 10)

In [365]:
sel

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [331]:
X[:, 10]

array([-2.31670583,  0.2776982 , -0.52779681, ..., -0.01362218,
       -0.47736536, -0.31843113])

In [296]:
logit.coef_

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.41906931,
         0.        ,  0.        ,  0.        ,  0.10642132,  0.        ,
         0.        ,  0.06075621,  0.        , -0.25040566, -0.08884983,
         0.        ,  0.        ,  0.        ,  0.        ,  0.01892889,
         0.        ,  0.        ,  0.        , -0.99702976,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.23003589,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.47054174,  0.        ,  0.        ,
         0.        , -0.38939861,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [299]:
n_nonzero = np.sum(logit.coef_ != 0)

In [298]:
n_nonzero

10







     fun: 0.0
 message: 'Optimization terminated successfully.'
    nfev: 122
     nit: 7
 success: True
       x: array([0.03132074])

In [259]:
C0

array([0.5])

In [262]:
#Rosenbrock Function
def fun_rosenbrock(x):
   return np.array([10 * (x[1] - x[0]**2), (1 - x[0])])
   
from scipy.optimize import least_squares
input = np.array([4, 2])
res = least_squares(fun_rosenbrock, input)

print(res)

 active_mask: array([0., 0.])
        cost: 6.162975822039155e-31
         fun: array([-1.11022302e-15,  0.00000000e+00])
        grad: array([ 2.22044607e-14, -1.11022302e-14])
         jac: array([[-20.00000015,  10.        ],
       [ -1.        ,   0.        ]])
     message: '`gtol` termination condition is satisfied.'
        nfev: 4
        njev: 4
  optimality: 2.2204460657939253e-14
      status: 1
     success: True
           x: array([1., 1.])


In [33]:
from wordbatch.models import FM_FTRL

In [57]:
%%time
modelF = FM_FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=X_train_new.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15,
                    inv_link="sigmoid", threads=8)

CPU times: user 101 ms, sys: 36 ms, total: 137 ms
Wall time: 49.7 ms


In [58]:
modelF.fit(X_train_new, y_train)

Total e: 6493.802805323637
Total e: 4053.556609184768
Total e: 3622.9755687458073
Total e: 3362.4154739248543
Total e: 3177.9835674862807
Total e: 3033.9980971711943
Total e: 2914.1283612662137
Total e: 2810.4854671922985
Total e: 2719.002908387339
Total e: 2637.1639679429927
Total e: 2563.159171622697
Total e: 2495.6239905384255
Total e: 2433.5434798916067
Total e: 2376.126902382593
Total e: 2322.7639836313024


<wordbatch.models.fm_ftrl.FM_FTRL at 0x55e5587fdf60>

In [59]:
preds = modelF.predict(X_test_new)

In [55]:
preds_comb = (0.5*logit_test_pred3) + (0.5*preds)

In [46]:
from sklearn.metrics import roc_auc_score

In [48]:
roc_auc_score(y_train, preds)

0.9877243530025209

In [60]:
write_to_submission_file(preds, 'sub_fm_l2.csv') # 0.93792


# From Model Validation in a Competition NB