In [1]:
import pandas as pd
import dask
import dask.dataframe as dd
import featuretools as ft
import numpy as np
import json
import gc
import time
import shap
import sys, os, psutil
import cmath

from tqdm import tqdm
from pandas.io.json import json_normalize
from contextlib import contextmanager
from functools import partial
from itertools import count, repeat, cycle # repeat(10, 3) --> 10 10 10

import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

./input/train_labels.csv
./input/test.csv
./input/specs.csv
./input/train.csv
./input/sample_submission.csv


In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    

def cpuStats():
    print("########## CPU STATS ############")
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30
    print('memory GB:', memoryUse)
    print("########## CPU STATS ############")

In [3]:
def preprocess_train(dataset, dataset_event_data, cutting_sec=10000):
    game_sequence = (dataset
    .drop_duplicates(subset=['game_session'], keep='first')
        [['event_id', 'game_session', 'timestamp', 'installation_id', 'type', 'title', 'world', 'game_time']]
        .reset_index(drop=True)
    )

    game_sequence_y = (game_sequence
        .query('type == "Assessment"')
        .reset_index(drop=True)
        .copy()
    )

    game_sequence_filter = (game_sequence
        .merge(game_sequence_y, on='installation_id', how='inner', suffixes=('_x', '_y'))
        .assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())
        .query('0 <= diff < {0}'.format(cutting_sec)) # determine the cutting gap
        .reset_index(drop=True)
        [['game_session_x', 'game_session_y']]
    )
    
    dataset_df = add_event_data_info(dataset, dataset_event_data, game_sequence_filter)  
    return dataset_df

In [4]:
def preprocess_test(dataset, dataset_event_data, cutting_sec=10000):
    game_sequence_filter = (dataset
        .assign(game_session_y = lambda df: df['installation_id'])
        .merge(dataset.groupby('installation_id')['timestamp'].max().reset_index().rename(columns={'timestamp': 'timestamp_y'}), 
               on='installation_id', how='inner')
        .rename(columns={'game_session': 'game_session_x', 'timestamp': 'timestamp_x'})
        .assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())
        .query('0 <= diff < {0}'.format(cutting_sec)) # determine the cutting gap
        [['game_session_x', 'game_session_y']]
        .drop_duplicates(subset=['game_session_x', 'game_session_y'], keep='first')
    )
    
    dataset_df = add_event_data_info(dataset, dataset_event_data, game_sequence_filter)
    return dataset_df

In [5]:
def add_event_data_info(dataset, dataset_event_data, game_sequence_filter):
    def flatten_(record, sel_cols):
        return dict(map(lambda x: (x, record.get(x)), sel_cols))

    event_data_cols = ['coordinates',
                       'correct', 'duration', 'dwell_time', 'misses', 'round', 'total_duration', 'version']
    event_data_cols_meta = {
        'correct': np.bool, 
        'duration': np.float16, 
        'dwell_time': np.float32, 
        'misses': np.float32, 
        'round': np.float16, 
        'total_duration': np.float32, 
        'version': np.float16
    }
    flatten_ = partial(flatten_, sel_cols=event_data_cols)

    event_data = dataset_event_data['event_data'].to_bag().map(flatten_).to_dataframe(meta=event_data_cols_meta)
    dataset = (dd.concat([dataset, event_data], axis=1)
        .merge(game_sequence_filter, left_on='game_session', right_on='game_session_x', how='inner')
        .query('game_session_y != game_session') # filter out game_session_y data to prevent data leakage
        .drop(columns=['game_session_x'])
    )
    
    dataset_df = dataset.assign(timestamp = lambda df: df['timestamp'].dt.tz_localize(None)).compute(scheduler='threads').reset_index(drop=True)
    
    return dataset_df

In [6]:
def create_entityset(dataset_df):
    es = ft.EntitySet(id="game_session_y_data")

    es = es.entity_from_dataframe(entity_id="actions",
                                  dataframe=dataset_df,
                                  index='action_id',
                                  time_index='timestamp',
                                  make_index=True)

    es = es.normalize_entity(base_entity_id="actions",
                             new_entity_id="game_sessions",
                             index="game_session",
                             additional_variables=["title", "type", "world", "game_session_y", "installation_id"])

    es = es.normalize_entity(base_entity_id="game_sessions",
                             new_entity_id="game_session_ys",
                             index="game_session_y",
                             additional_variables=["installation_id"])

    es = es.normalize_entity(base_entity_id="game_session_ys",
                             new_entity_id="installation_ids",
                             index="installation_id")
    
    return es

In [7]:
def create_feature_matrix(dataset_es):
    feature_matrix, feature_defs = ft.dfs(entityset=dataset_es, target_entity="game_session_ys")    
    return feature_matrix

In [8]:
def train_feature_pipe(dataset, dataset_event_data):
    return create_feature_matrix(create_entityset(preprocess_train(dataset, dataset_event_data)))

In [9]:
def test_feature_pipe(dataset, dataset_event_data):
    return create_feature_matrix(create_entityset(preprocess_test(dataset, dataset_event_data)))

### valid label function

In [10]:
def create_installation_kfold_label(train_df, n_split):
    installations = train_df['installation_id'].unique()
    install_ser = pd.Series(installations)
    c = cycle(range(n_split)) # cycle('ABCD') --> A B C D A B C D A B C D ...

    result = (install_ser
        .sample(frac=1)
        .to_frame(name='installation_id')
        .assign(fold_label = list(map(lambda _: next(c), range(len(install_ser)))))
        .sort_index()
    )
    return result

In [11]:
def create_session_order_label(train_df):
    last_assessment_valid = (train_df
        .groupby(['game_session_y', 'installation_id'], as_index=False)['timestamp']
        .max()
        .assign(session_order_label = lambda df: df.groupby('installation_id')['timestamp'].rank(ascending=False).astype(int))
        [['game_session_y', 'session_order_label']]
    )
    return last_assessment_valid

In [12]:
# https://www.kaggle.com/khoongweihao/bayesian-opt-seed-blending-with-tuning-69

import scipy as sp

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -cohen_kappa_score(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(self.coef_['x'])) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

# Reading Data

In [13]:
col_dtype = {
    'event_id':'object', 'game_session':'object', 'installation_id':'object',
    'event_count':'int16', 'event_code':'category', 'game_time':'int32', 'title':'category', 
    'type':'category', 'world':'category'
}

In [14]:
# Train Dataset
train = dd.read_csv('./input/train.csv', 
                    parse_dates=['timestamp'], 
                    dtype=col_dtype,
                    usecols=['event_id', 'game_session', 'timestamp', 'installation_id', 
                             'event_count', 'event_code', 'game_time', 'title', 'type', 'world'])

train_event_data = dd.read_csv('./input/train.csv', 
                               converters={'event_data': json.loads},
                               usecols=['event_data'])

# Train target Column
train_labels_df = dd.read_csv('./input/train_labels.csv').compute(scheduler='threads')

In [15]:
# Test Dataset
test = dd.read_csv('./input/test.csv', 
                    parse_dates=['timestamp'], 
                    dtype=col_dtype,
                    usecols=['event_id', 'game_session', 'timestamp', 'installation_id', 
                             'event_count', 'event_code', 'game_time', 'title', 'type', 'world'])

test_event_data = dd.read_csv('./input/test.csv', 
                               converters={'event_data': json.loads},
                               usecols=['event_data'])

sample_submission = dd.read_csv('./input/sample_submission.csv').compute(scheduler='threads')

# Create Features

In [16]:
with timer("Process train_df"):
    # Raw Training Cutting Data
    train_df = preprocess_train(train, train_event_data, cutting_sec=3600*8)
    cpuStats()
    gc.collect()

Concatenating dataframes with unknown divisions.
We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.


########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
16.3
svmem(total=17179869184, available=9283420160, percent=46.0, used=7838679040, free=3851825152, active=5385895936, inactive=5425172480, wired=2452783104)
memory GB: 1.7406234741210938
########## CPU STATS ############
Process train_df - done in 210s


In [17]:
with timer("Process train_es"):
    # Featuretools EntitySet
    train_es = create_entityset(train_df)
    cpuStats()
    gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
21.0
svmem(total=17179869184, available=8905900032, percent=48.2, used=8214069248, free=3220586496, active=5674827776, inactive=5678866432, wired=2539241472)
memory GB: 2.3238601684570312
########## CPU STATS ############
Process train_es - done in 18s


In [18]:
with timer("Process train_feature"):
    # Featuretools Feature
    train_features = create_feature_matrix(train_es)
    cpuStats()
    gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
14.9
svmem(total=17179869184, available=10731401216, percent=37.5, used=6338318336, free=6854320128, active=3880165376, inactive=3704754176, wired=2458152960)
memory GB: 2.3865280151367188
########## CPU STATS ############
Process train_feature - done in 490s


In [19]:
#  1 Days local 557 seconds
#  7 Days local 731 seconds
# 30 Days local 833 seconds & test failed


In [20]:
with timer("Validation Label"):
    installation_kfold_label = create_installation_kfold_label(train_df, n_split=5)
    session_order_label = create_session_order_label(train_df)
    cpuStats()
    gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
18.4
svmem(total=17179869184, available=10560659456, percent=38.5, used=6509330432, free=6563528704, active=3951976448, inactive=3824685056, wired=2557353984)
memory GB: 2.615753173828125
########## CPU STATS ############
Validation Label - done in 2s


In [21]:
# Train Features add target column 'accuracy_group', installation_kfold_label, session_order_label
train_data_df = (train_features
    .reset_index()
    .merge(installation_kfold_label, on='installation_id', how='inner') # add kfold by installation_id
    .merge(session_order_label, on='game_session_y', how='inner') # add session order, session_order_label==1 is last assessment
    .merge(train_labels_df[['game_session', 'accuracy_group']], 
           left_on='game_session_y', 
           right_on='game_session', how='inner') # add target label
    .drop(columns=['game_session_y', 'game_session', 'installation_id'])
    .reset_index(drop=True)
)

In [22]:
# Train Column Schema
target_col = 'accuracy_group'
feature_cols = train_data_df.columns.drop([target_col]+['session_order_label', 'fold_label']).to_list()

In [23]:
train_data_df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17112 entries, 0 to 17111
Columns: 425 entries, COUNT(game_sessions) to accuracy_group
dtypes: float16(129), float32(129), float64(85), int16(9), int32(9), int64(48), object(16)
memory usage: 33.0+ MB


In [24]:
del train_features, train_df;
cpuStats()
gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
49.0
svmem(total=17179869184, available=11018080256, percent=35.9, used=6051573760, free=7316647936, active=3491491840, inactive=3528978432, wired=2560081920)
memory GB: 1.8949737548828125
########## CPU STATS ############


22

In [25]:
with timer("Create Test Features"):

    #test_features = test_feature_pipe(test, test_event_data)
    # Raw testing Cutting Data
    test_df = preprocess_test(test, test_event_data, cutting_sec=3600*8)

    # Featuretools EntitySet
    test_es = create_entityset(test_df)

    # Featuretools Feature
    test_features = create_feature_matrix(test_es)
    # 
    test_data_df = (test_features
        .drop(columns=['installation_id'])
        [feature_cols]
    )

Concatenating dataframes with unknown divisions.
We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.


Create Test Features - done in 57s


In [26]:
# ['fold_label', 'session_order_label']
test_data_df.shape, train_data_df.shape

((1000, 422), (17112, 425))