Strategy:
1. Create 6 useful features
2. Cross-validate with TimeSeriesSplit (say 10 splits)
3. Scoring with 'roc_auc'
4. GridSearchCV for regularization parameter

### Imports

In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
sns.set()

### Helper functions

In [2]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to ``out_file`` as a two-column submission CSV.

    Rows are labelled 1..N, where N is ``predicted_labels.shape[0]``. The
    index column is headed ``index_label`` and the value column ``target``.
    """
    n_rows = predicted_labels.shape[0]
    submission = pd.DataFrame(
        data=predicted_labels,
        columns=[target],
        index=np.arange(1, n_rows + 1),
    )
    submission.to_csv(out_file, index_label=index_label)

In [3]:
def add_time_features(df, X_sparse):
    """Append four time-of-day indicator columns to a feature matrix.

    The hour of ``df['time1']`` is bucketed into morning (7-11), day (12-18),
    evening (19-23) and night (0-6). One 0/1 column per bucket is stacked to
    the right of ``X_sparse``, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time1`` column holding datetimes, or values that
        ``pd.to_datetime`` can parse.
    X_sparse : sparse matrix
        Existing features, one row per row of ``df``.

    Returns
    -------
    The matrix returned by ``scipy.sparse.hstack``: ``X_sparse`` plus four
    extra columns.
    """
    # Vectorised hour extraction; to_datetime is a no-op for datetime columns
    # and lets columns that were not converted beforehand work as well.
    hour = pd.to_datetime(df['time1']).dt.hour

    # Inclusive (first_hour, last_hour) bounds: morning, day, evening, night.
    bounds = ((7, 11), (12, 18), (19, 23), (0, 6))
    flags = [
        ((hour >= lo) & (hour <= hi)).astype('int').values.reshape(-1, 1)
        for lo, hi in bounds
    ]
    return hstack([X_sparse] + flags)

In [4]:
def add_more_features(df, X_sparse, time_cols=None):
    """Append the standardised session duration as one extra column.

    The duration is the span, in seconds, between the earliest and the latest
    timestamp found in the session's time columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Sessions with datetime columns named in ``time_cols``.
    X_sparse : sparse matrix
        Existing features, one row per row of ``df``.
    time_cols : list of str, optional
        Timestamp column names. Defaults to the notebook-level ``times`` list.

    Returns
    -------
    The matrix returned by ``scipy.sparse.hstack``: ``X_sparse`` plus one
    extra column.

    Notes
    -----
    The scaler is fitted on ``df`` itself, so separate calls for train and
    test data scale each set with its own mean and standard deviation.
    """
    if time_cols is None:
        time_cols = times  # notebook-level list of timestamp column names

    stamps = df[time_cols]
    sess_time = (stamps.max(axis=1) - stamps.min(axis=1)) / np.timedelta64(1, 's')

    # StandardScaler expects (n_samples, n_features). Passing [sess_time]
    # produced a single sample with n features, so every value was scaled
    # to 0. A column vector gives one feature with one row per session.
    scaled = StandardScaler().fit_transform(sess_time.values.reshape(-1, 1))
    return hstack([X_sparse, scaled])

In [5]:
## Chronological 10-fold splitter and the baseline logistic-regression estimator
time_split = TimeSeriesSplit(n_splits=10)
logit = LogisticRegression(random_state=17, C=1)

### Extracting data

In [6]:
# Names of the ten timestamp columns: time1, ..., time10
times = ['time%s' % i for i in range(1, 11)]

# Load the training and test sessions, indexed by session_id (change paths if needed)
train_df = pd.read_csv('../../data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../../data/test_sessions.csv', index_col='session_id')

# Parse the timestamp columns of both frames into datetimes
for frame in (train_df, test_df):
    frame[times] = frame[times].apply(pd.to_datetime)

# Order the training sessions chronologically by their first timestamp
train_df = train_df.sort_values(by='time1')

In [7]:
# Load the pickled websites dictionary.
# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source.
with open("../../data/site_dic.pkl", "rb") as pkl_file:
    site_dict = pickle.load(pkl_file)

# One row per dictionary entry: the values form the index, the keys the 'site' column
site_keys = list(site_dict.keys())
site_values = list(site_dict.values())
sites_dict = pd.DataFrame(site_keys, index=site_values, columns=['site'])
print(u'Websites total:', sites_dict.shape[0])

Websites total: 48371


In [8]:
# Names of the ten site-id columns: site1, ..., site10
sites = ['site%s' % i for i in range(1, 11)]

# Dump each session as one space-separated line of site ids (missing sites -> 0),
# without index or header, for CountVectorizer to read.
for frame, out_path in ((train_df, 'train_sessions_text.txt'),
                        (test_df, 'test_sessions_text.txt')):
    frame[sites].fillna(0).astype('int').to_csv(out_path, sep=' ',
                                                index=None, header=None)


In [9]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000) # not defining max_features as 50,000 here
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)

## This only puts the sites data into a sparse matrix
## We'll need to add more feature columns later

Wall time: 11.4 s


In [10]:
# (rows, columns) of the train and test site n-gram matrices
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [11]:
## Labels are kept apart from the features
y_train = train_df['target'].astype('int')

## Train (without the label column) stacked on top of test
train_features = train_df.drop(columns='target')
full_df = pd.concat([train_features, test_df])

## Row position where the test part of full_df begins
idx_split = len(train_df)

In [None]:
## Site-id columns of the train / test / full frames, missing sites filled with 0
train_df_sites, test_df_sites, full_sites = (
    frame[sites].fillna(0).astype('int')
    for frame in (train_df, test_df, full_df)
)

In [None]:
## Timestamp columns of the train / test / full frames
train_df_times, test_df_times, full_times = (
    frame[times] for frame in (train_df, test_df, full_df)
)

In [None]:
# Calendar features of each training session's first timestamp, plus the label.
# The .dt accessors are vectorised, unlike a per-row apply(lambda ts: ...).
first_visit = train_df_times['time1']
train_df_new_features = pd.DataFrame(index=train_df.index)
train_df_new_features['hour'] = first_visit.dt.hour
train_df_new_features['day'] = first_visit.dt.day
train_df_new_features['month'] = first_visit.dt.month
train_df_new_features['target'] = train_df['target']

In [None]:
# Weekday of the session start (0 = Monday ... 6 = Sunday), via the vectorised .dt accessor
train_df_new_features['day_of_week'] = train_df_times['time1'].dt.dayofweek

In [12]:
%%time
X_train_new = add_time_features(train_df, X_train)
X_test_new = add_time_features(test_df, X_test)
## Augment X_train and X_test sparse matrices with TOU columns

Wall time: 1.38 s


In [13]:
# (rows, columns) after appending the time-of-day indicator columns
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [None]:
## Train DataFrame with each session's start, end and duration (plus the label)
train_start = train_df[times].min(axis=1)
train_end = train_df[times].max(axis=1)

time_df = pd.DataFrame(index=train_df.index)
time_df['target'] = train_df['target']
time_df['min'] = train_start
time_df['max'] = train_end

# Session duration in seconds
time_df['seconds'] = (train_end - train_start) / np.timedelta64(1, 's')

In [None]:
## Test DataFrame with each session's start, end and duration
test_start = test_df[times].min(axis=1)
test_end = test_df[times].max(axis=1)

time_df_test = pd.DataFrame(index=test_df.index)
time_df_test['min'] = test_start
time_df_test['max'] = test_end

# Session duration in seconds
time_df_test['seconds'] = (test_end - test_start) / np.timedelta64(1, 's')

In [None]:
%%time
X_train_new = add_time_features(train_df, X_train_new)
X_test_new = add_time_features(test_df, X_test_new)
print(X_train_new.shape, X_test_new.shape)

**Run this part for cross-validation:**

In [14]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)
print(cv_scores, cv_scores.mean())

[0.87652191 0.75129605 0.93062022 0.978644   0.90399606 0.93831555
 0.96249405 0.92731303 0.9488597  0.94043603] 0.915849660111882
Wall time: 37.4 s


# PLAYGROUND

In [None]:
# Preview the first rows of the calendar-feature frame
train_df_new_features.head()

In [None]:
# Number of target == 1 sessions per weekday
is_target = train_df_new_features.target == 1
sns.countplot(x='day_of_week', data=train_df_new_features[is_target])

In [None]:
# Summary statistics of the time_df columns, computed separately per target value
time_df.groupby('target').describe()

In [None]:
# Ten most frequent site ids across all target == 1 sessions
# (0 is the fillna placeholder for a missing site)
target_site_ids = train_df_sites[train_df.target == 1].values.flatten()
site_counts = pd.Series(target_site_ids).value_counts()
site_counts.sort_values(ascending=False).head(10)