In [1]:
import os
import pandas as pd
import numpy as np
import pickle

from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import eli5
from IPython.display import display_html

from sklearn.feature_selection import RFECV

# try https://www.scikit-yb.org/en/latest/index.html

In [2]:
# Seed passed as `random_state` to the logistic regression below, for reproducible fits.
SEED = 42

In [3]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    The file has two columns: `index_label`, holding 1-based row numbers, and
    `target`, holding the predictions in their original order.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row, in the order of the test set. Lists and
        other sequences are accepted and converted with `np.asarray`.
    out_file : str, path-like or file-like
        Destination passed to `DataFrame.to_csv`. When it is a path, missing
        parent directories are created first.
    target : str
        Name of the prediction column.
    index_label : str
        Name of the index column.
    """
    predicted_labels = np.asarray(predicted_labels)

    # `to_csv` refuses to write into a directory that does not exist, so create
    # it up front. Buffers and other file-like objects have no directory.
    if isinstance(out_file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(out_file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)
    
def save_obj(path, obj):
    """Pickle `obj` into the file at `path`, overwriting any existing file.

    Uses `pickle.HIGHEST_PROTOCOL`, i.e. the newest binary protocol the
    running interpreter supports.
    """
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    """Read the file at `path` and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(path, 'rb') as source:
        restored = pickle.load(source)
    return restored

### Data downloading

In [4]:
# Input file locations, given relative to the notebook's working directory.
train_data_path = 'Data/train_sessions.csv'
test_data_path = 'Data/test_sessions.csv'
dict_path = 'Data/site_dic.pkl'

In [5]:
# Load the training sessions and preview the first rows.
raw_train = pd.read_csv(train_data_path)
raw_train.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,...,,,,,,,,,,0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [6]:
# Load the test sessions and preview the first rows.
raw_test = pd.read_csv(test_data_path)
raw_test.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
0,1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,...,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
1,2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,...,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53
2,3,55,2014-12-05 15:55:12,55.0,2014-12-05 15:55:13,55.0,2014-12-05 15:55:14,55.0,2014-12-05 15:56:15,55.0,...,55.0,2014-12-05 15:56:17,55.0,2014-12-05 15:56:18,55.0,2014-12-05 15:56:19,1445.0,2014-12-05 15:56:33,1445.0,2014-12-05 15:56:36
3,4,1023,2014-11-04 10:03:19,1022.0,2014-11-04 10:03:19,50.0,2014-11-04 10:03:20,222.0,2014-11-04 10:03:21,202.0,...,3374.0,2014-11-04 10:03:22,50.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:23,3374.0,2014-11-04 10:03:23
4,5,301,2014-05-16 15:05:31,301.0,2014-05-16 15:05:32,301.0,2014-05-16 15:05:33,66.0,2014-05-16 15:05:39,67.0,...,69.0,2014-05-16 15:05:40,70.0,2014-05-16 15:05:40,68.0,2014-05-16 15:05:40,71.0,2014-05-16 15:05:40,167.0,2014-05-16 15:05:44


In [7]:
# Load the pickled site dictionary. `prepare_tfidf_features` inverts it
# (value -> key) to turn the numeric site ids in the session columns into tokens,
# so it presumably maps site name -> integer id.
# NOTE(review): unpickling executes arbitrary code; only load this file from a
# trusted source.
with open(dict_path, 'rb') as file:
    site2id_dict = pickle.load(file)

In [8]:
# Sanity check: (rows, columns) of the raw train and test frames.
print(raw_train.shape, raw_test.shape)

(253561, 22) (82797, 21)


https://www.kaggle.com/kashnitsky/model-validation-in-a-competition#Submission-3:-Example-of-overfitting

### Bag of words (TfidfVectorizer) preparation

In [9]:
def _sessions_to_text(df, site_columns, id2site):
    """Render every session row as one space-separated string of site tokens."""
    # Missing visits are NaN in the CSV; map them to id 0 before the lookup.
    site_ids = df[site_columns].fillna(0).astype('int').to_numpy().tolist()
    return [' '.join(id2site[site_id] for site_id in row) for row in site_ids]


def prepare_tfidf_features(train_df, test_df, site_dict, vectorizer_params):
    """Build TF-IDF site features plus datetime frames for train and test.

    Neither input frame is modified: the training frame is re-sorted into a
    new frame and the time columns are converted on copies.

    Parameters
    ----------
    train_df : pd.DataFrame
        Must contain `site1..site10`, `time1..time10` and `target`.
    test_df : pd.DataFrame
        Must contain `site1..site10` and `time1..time10`.
    site_dict : dict
        Mapping whose values are the integer ids used in the `site*` columns
        and whose keys are the strings to emit as tokens. Id 0, used here for
        missing visits, is rendered as 'unknown'.
    vectorizer_params : dict
        Keyword arguments forwarded to `TfidfVectorizer`.

    Returns
    -------
    tuple
        `(feature_names, X_train_tfidf, X_test_tfidf, X_train_time,
        X_test_time, y_train)`. All training outputs follow the `time1`-sorted
        row order; test outputs keep the row order of `test_df`.
    """
    site_columns = ['site{}'.format(i) for i in range(1, 11)]
    time_columns = ['time{}'.format(i) for i in range(1, 11)]

    # Invert the mapping passed in by the caller (id -> token).
    id2site_dict = {site_id: site for (site, site_id) in site_dict.items()}
    id2site_dict[0] = 'unknown'

    # Sort by session start time to avoid a data leak during time-based
    # cross-validation. `sort_values` returns a new frame, so the caller's
    # frame keeps its original order.
    train_df = train_df.sort_values('time1').reset_index(drop=True)

    train_sessions = _sessions_to_text(train_df, site_columns, id2site_dict)
    test_sessions = _sessions_to_text(test_df, site_columns, id2site_dict)

    # Fit the vocabulary on train only, then reuse it for test.
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train_tfidf = vectorizer.fit_transform(train_sessions)
    X_test_tfidf = vectorizer.transform(test_sessions)

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_feature_names = vectorizer.get_feature_names_out()
    else:
        tfidf_feature_names = vectorizer.get_feature_names()

    y_train = train_df['target'].astype('int').to_numpy()

    # Column-wise conversion yields new frames instead of writing the parsed
    # timestamps back into the input frames.
    X_train_time = train_df[time_columns].apply(pd.to_datetime)
    X_test_time = test_df[time_columns].apply(pd.to_datetime)

    return list(tfidf_feature_names), X_train_tfidf, X_test_tfidf, X_train_time, X_test_time, y_train

In [10]:
# TF-IDF settings: n-grams of 1 to 5 consecutive tokens, vocabulary capped at
# 50000 features, and a whitespace tokenizer (sessions are built as
# space-separated strings in `prepare_tfidf_features`).
vectorizer_params={'ngram_range': (1, 5), 
                    'max_features': 50000,
                    'tokenizer': lambda s: s.split()}

In [11]:
%%time
# Build the TF-IDF matrices, the datetime frames and the target, then unpack
# the returned tuple into named variables.
temp_output = prepare_tfidf_features(raw_train, raw_test, site_dict = site2id_dict, 
                                     vectorizer_params = vectorizer_params)
tfidf_feature_names, X_train_tfidf, X_test_tfidf, X_train_time, X_test_time, y_train = temp_output

CPU times: user 25.3 s, sys: 576 ms, total: 25.9 s
Wall time: 23 s


### Naive linear model

Set up cross-validation strategy

In [27]:
# Time-ordered splitter with 10 splits. It relies on the training rows already
# being sorted by `time1`, which `prepare_tfidf_features` does.
cross_val_time_split = TimeSeriesSplit(n_splits = 10)

Create logistic model

In [28]:
# Baseline logistic regression: liblinear solver, C = 1, seeded with SEED.
logistic_params = {'random_state': SEED, 
                   'solver': 'liblinear',
                   'C': 1}
logit = LogisticRegression(**logistic_params)

In [29]:
# ROC-AUC of the baseline model on each time-ordered split, using TF-IDF features only.
naive_cross_val_score = cross_val_score(logit, X_train_tfidf, y_train, 
                                        cv = cross_val_time_split, scoring = 'roc_auc')

In [None]:
# Report the per-fold scores and their mean.
print('roc-auc scores by folds: \n', naive_cross_val_score)
print()
print('mean roc-auc score: ', naive_cross_val_score.mean())

In [None]:
# Refit on the full training set, then keep column 1 of `predict_proba`
# (the probability of the class labelled 1) for every test session.
logit.fit(X_train_tfidf, y_train)
naive_logit_predictions = logit.predict_proba(X_test_tfidf)[:, 1]

In [None]:
# Save the baseline model's test probabilities as a submission CSV.
write_to_submission_file(naive_logit_predictions, 'Predictions/naive_logit_predictions.csv')

In [None]:
# Hard-coded, not computed here. Presumably the Kaggle leaderboard score of the
# baseline submission, recorded by hand (TODO confirm which leaderboard).
naive_kaggle_result = 0.91803
print(naive_kaggle_result)

In [None]:
# Inspect the fitted model's weights, labelled with the TF-IDF feature names.
# `top = 30` limits how many features are shown.
eli5.show_weights(estimator = logit,
                  feature_names = tfidf_feature_names,
                  top = 30)

### EDA

In [None]:
# Work on a copy so the EDA columns added below do not touch X_train_time.
# y_train lines up row-for-row because both come from the same time-sorted
# training frame inside `prepare_tfidf_features`.
viz_data = X_train_time.copy()
viz_data['target'] = y_train

In [None]:
# Plot the class counts straight from the value_counts() Series, whose index
# holds the class labels. This avoids depending on the column names produced by
# reset_index(), which changed in pandas 2.0 (there is no 'index' column there).
ax = viz_data['target'].value_counts().plot.bar()
ax.set_xlabel('target')
ax.set_ylabel('count')
ax.set_title('Target distribution')
plt.show()

The dataset is highly imbalanced.

In [None]:
def plot_countplots(feature_name, suptitle, figsize = (10, 5), data = None):
    """Draw side-by-side count plots of one feature, one panel per target class.

    The left panel shows rows with target == 0 and the right panel rows with
    target == 1.

    Parameters
    ----------
    feature_name : str
        Column to count; converted with `str()` before the lookup.
    suptitle : str
        Title placed above both panels.
    figsize : tuple
        Figure size passed to `plt.subplots`.
    data : pd.DataFrame, optional
        Frame holding `feature_name` and a `target` column. Defaults to the
        notebook-level `viz_data`.
    """
    if data is None:
        data = viz_data

    fig, axs = plt.subplots(1, 2, figsize = figsize)
    axs = axs.ravel()

    for target, ax in enumerate(axs):
        # Pass `data`/`x` by keyword: recent seaborn releases treat the first
        # positional argument as `data`, not as the variable to count.
        sns.countplot(data = data[data['target'] == target],
                      x = str(feature_name), ax = ax)
        ax.set_title('Target: {}'.format(target))
    fig.suptitle(str(suptitle))
    plt.show()

In [None]:
# Calendar year of the first visit in each session, compared across classes.
viz_data['start_year'] = viz_data['time1'].dt.year
plot_countplots('start_year', 'Start years')

**Time features**

In [None]:
# Month (1-12) of the first visit in each session, compared across classes.
viz_data['start_month'] = viz_data['time1'].dt.month
plot_countplots('start_month', 'Start months')

In [None]:
# Hour of day (0-23) of the first visit in each session, compared across classes.
viz_data['start_hour'] = viz_data['time1'].dt.hour
plot_countplots('start_hour', 'Start hours')

We can see that Alice prefers to start internet sessions between 12:00 - 13:00 and 16:00 - 18:00. So we can encode the start hour as a categorical feature: morning, lunch, day, evening, night. <br>
morning: 8:00 - 11:00 <br>
lunch: 12:00 - 13:00 <br>
day: 14:00 - 18:00 <br>
evening: 19:00 - 23:00 <br>
night: 00:00 - 7:00 (strange night, yeah)


In [None]:
# Day of week of the first visit (pandas convention: Monday = 0 ... Sunday = 6).
viz_data['week_day'] = viz_data['time1'].dt.weekday
plot_countplots('week_day', 'Start day of week')

**Text features**

In [None]:
# TODO: try Yellowbrick's t-SNE visualizer on the text features.