In [None]:
import json
import datetime
import os
import time

import pandas as pd
import numpy as np

from sklearn import preprocessing

In [None]:
def load(path, nrows=None):
    """Read a CSV file and expand its JSON-encoded columns into flat columns.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    nrows : int, optional
        Maximum number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The file contents where each of the ``device``, ``geoNetwork``,
        ``totals`` and ``trafficSource`` columns is replaced by one column per
        JSON key (nested keys are joined with ``.``), ``fullVisitorId`` is
        read as a string and ``date`` is parsed from ``YYYYMMDD`` into
        datetimes.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    # json_normalize lives at the top level since pandas 1.0; the old
    # pd.io.json location is only kept as a fallback for older installs.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

    # Normalize JSON columns
    for column in JSON_COLUMNS:
        # A plain list of dicts is accepted by every pandas version.
        column_as_df = normalize(df[column].tolist())
        # Keep the expanded frame aligned with the rows it was built from.
        column_as_df.index = df.index
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

    # Parse date (vectorized; pd.datetime no longer exists in pandas >= 1.0)
    df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')
    print("Loaded file {}\nShape is: {}".format(path, df.shape))
    return df

def process(train, test):
    """Engineer features on train and test together and split them back.

    Steps performed:

    * columns holding a single value in ``train`` are dropped from both frames;
    * ``diff_visitId_time`` flags rows where ``visitId`` differs from
      ``visitStartTime``; ``visitId`` and ``sessionId`` are then removed;
    * ``date`` is replaced by ``WoY``, ``month``, ``quarterMonth`` and
      ``weekday``;
    * ``visitStartTime`` (epoch seconds) is replaced by ``visitHour``, the
      hour of day in UTC;
    * ``totalVisits`` is the largest ``visitNumber`` seen for the row's
      ``fullVisitorId`` across both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing at least ``fullVisitorId``, ``visitNumber``,
        ``visitId``, ``visitStartTime``, ``sessionId`` and a datetime
        ``date`` column. Neither input frame is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed train and test rows, in their original row order.
    """
    print("Dropping constant columns...")

    # Remove columns with constant values (constancy is judged on train only).
    const_cols = [c for c in train.columns if train[c].nunique(dropna=False) == 1]
    train = train.drop(const_cols, axis=1)
    # A column that is constant in train may be missing from test altogether.
    test = test.drop(const_cols, axis=1, errors='ignore')

    train_len = train.shape[0]
    # ignore_index gives every row a unique label, so the positional split at
    # the end and the column assignments below are unambiguous.
    merged = pd.concat([train, test], sort=False, ignore_index=True)

    # Create some features.
    merged['diff_visitId_time'] = merged['visitId'] - merged['visitStartTime']
    merged['diff_visitId_time'] = (merged['diff_visitId_time'] != 0).astype(int)
    del merged['visitId']
    del merged['sessionId']

    print("Generating date columns...")
    dates = merged['date']
    merged['WoY'] = dates.apply(lambda x: x.isocalendar()[1])
    merged['month'] = dates.dt.month
    merged['quarterMonth'] = dates.dt.day // 8
    merged['weekday'] = dates.dt.weekday
    del merged['date']

    # Interpret the epoch seconds in UTC so the feature does not depend on the
    # timezone of the machine that happens to run the preprocessing.
    merged['visitHour'] = pd.to_datetime(merged['visitStartTime'], unit='s').dt.hour
    del merged['visitStartTime']

    print("Finding total visits...")
    # This could be considered an information leak as I am including information about the future when predicting
    # the revenue of a transaction. In reality, when looking at the 3rd visit we would have no way of knowing
    # that the user will actually shop X more times (or if he will visit again at all). However since this
    # info also exists in the test set we might use it.
    #
    # transform() returns one value per original row, in the original order.
    # Merging an aggregated frame back in can reorder rows, which would
    # silently mix train and test rows in the positional split below.
    merged['totalVisits'] = merged.groupby("fullVisitorId")["visitNumber"].transform("max")

    print("Splitting back...")
    train = merged[:train_len]
    test = merged[train_len:]
    return train, test

def preprocess_and_save(data_dir):
    """Run the full preprocessing on the files in ``data_dir`` and store the result.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir``, replaces the
    training ``transactionRevenue`` column by ``target`` (log1p of the revenue,
    missing values counted as 0), runs ``process`` on both frames and writes
    ``preprocessed_train.csv`` and ``preprocessed_test.csv`` back into
    ``data_dir`` without the index.
    """
    train = load(os.path.join(data_dir, "train.csv"))
    test = load(os.path.join(data_dir, "test.csv"))

    # Take the raw revenue out of the frame and store its log-transform instead.
    revenue = train.pop('transactionRevenue').fillna(0).astype(float)
    train['target'] = revenue.map(lambda value: np.log1p(value))

    train, test = process(train, test)

    outputs = (
        ("preprocessed_train.csv", train),
        ("preprocessed_test.csv", test),
    )
    for filename, frame in outputs:
        frame.to_csv(os.path.join(data_dir, filename), index=False)
    

# Call this to save the preprocessed data for later use
# preprocess_and_save("../data/")

In [None]:
# Work on a small sample of each file so the cell runs quickly; the frames are
# loaded in the same order as before (train first, then test) and handed
# straight to process().
train, test = process(
    load("../data/train.csv", nrows=15000),
    load("../data/test.csv", nrows=10000),
)