In [1]:
import pandas as pd
import dask
import dask.dataframe as dd
import featuretools as ft
import numpy as np
import json
from pandas.io.json import json_normalize

from functools import partial
import gc
import time
from contextlib import contextmanager
import shap

import sys, os, psutil

import cmath

from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# Print the path of every file found under ./input (recursively).
for folder, _subdirs, names in os.walk('./input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

./input/train_labels.csv
./input/test.csv
./input/specs.csv
./input/train.csv
./input/sample_submission.csv


In [2]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: Label printed in front of the elapsed time.

    The report is printed from a ``finally`` clause, so it is still emitted
    when the wrapped block raises; the exception then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Without try/finally an exception inside the `with` body would skip
        # the report entirely.
        print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
    
def cpuStats():
    """Print a short diagnostic report for the current process and machine.

    Emits, between two banner lines: the interpreter version string, the
    value of ``psutil.cpu_percent()``, the ``psutil.virtual_memory()`` summary
    and the resident set size of this process divided by 2**30.
    """
    banner = "########## CPU STATS ############"
    print(banner)
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    # rss is the first field of the tuple returned by memory_info().
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print('memory GB:', rss_bytes / 2. ** 30)
    print(banner)

In [3]:
# Explicit dtypes for the event columns; reused by the test-set reader as well.
col_dtype = {
    'event_id': 'object',
    'game_session': 'object',
    'installation_id': 'object',
    'event_count': 'int16',
    'event_code': 'category',
    'game_time': 'int32',
    'title': 'category',
    'type': 'category',
    'world': 'category',
}

# Lazy frame with every column except the JSON payload.
train = dd.read_csv(
    './input/train.csv',
    usecols=['event_id', 'game_session', 'timestamp', 'installation_id',
             'event_count', 'event_code', 'game_time', 'title', 'type', 'world'],
    dtype=col_dtype,
    parse_dates=['timestamp'],
)

# Separate lazy read of the same file that keeps only `event_data`, parsing
# each cell from JSON text into a Python object via json.loads.
train_event_data = dd.read_csv(
    './input/train.csv',
    usecols=['event_data'],
    converters={'event_data': json.loads},
)


In [4]:
# Read the labels lazily, then materialise them right away with the threaded scheduler.
train_labels_df = (
    dd.read_csv('./input/train_labels.csv')
    .compute(scheduler='threads')
)

In [5]:
# Lazy test-set frame: same column subset and dtypes as the train reader.
test = dd.read_csv(
    './input/test.csv',
    usecols=['event_id', 'game_session', 'timestamp',
             'installation_id', 'event_count', 'event_code',
             'game_time', 'title', 'type', 'world'],
    dtype=col_dtype,
    parse_dates=['timestamp'],
)

In [6]:
# One row per game_session: the first row kept for each session, reduced to
# the session-level columns used below. Everything here stays lazy (dask).
# NOTE(review): keep='first' depends on row order; presumably the first row of
# a session in the CSV is its earliest event — confirm.
game_sequence = (train
    .drop_duplicates(subset=['game_session'], keep='first')
    [['event_id', 'game_session', 'timestamp', 'installation_id', 'type', 'title', 'world', 'game_time']]
    .reset_index(drop=True)
    #.assign(diff_sec = lambda df: df.groupby('installation_id')['timestamp'].transform(lambda x: x.diff()).dt.total_seconds().fillna(0))
)

# Subset of sessions whose type is "Assessment"; these become the "_y" side
# of the self-join below.
game_sequence_y = (game_sequence
    .query('type == "Assessment"')
    #[['event_id', 'game_session', 'timestamp', 'installation_id', 'event_code', 'title', 'type', 'world', 'game_time']]
    .reset_index(drop=True)
    .copy()
)

# Pair every session (_x) with every Assessment session (_y) of the same
# installation_id, then keep only pairs where _x's timestamp is at or before
# _y's and less than 604800 seconds (7 days) earlier. A diff of 0 is allowed,
# so an Assessment session is also paired with itself at this stage.
# One _x session can therefore appear in several rows (one per matching _y).
game_sequence_filter = (game_sequence
    .merge(game_sequence_y, on='installation_id', how='inner', suffixes=('_x', '_y'))
    .assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())
    .query('0 <= diff < 604800')  # 604800 s = 7 days
    .reset_index(drop=True)
    [['game_session_x', 'game_session_y', 'title_y', 'world_y', 'game_time_y']]
)#.compute(scheduler='threads')

In [7]:
def flatten(record, sel_cols):
    """Project ``record`` onto the names listed in ``sel_cols``.

    Returns a new dict with one entry per name in ``sel_cols`` (in that
    order); a name that ``record.get`` does not find maps to ``None``.
    """
    return {col: record.get(col) for col in sel_cols}

# Keys extracted from each parsed event_data record by `flatten`.
event_data_cols = ['coordinates',
                   'correct', 'duration', 'dwell_time', 'misses', 'round', 'total_duration', 'version']
# Column -> dtype spec passed as `meta` when the bag is turned into a dataframe.
# - `bool` replaces `np.bool`, an alias deprecated in NumPy 1.20 and removed in
#   NumPy 1.24 (it was always the builtin bool).
# - `duration` uses float32 like `dwell_time` / `total_duration`: float16 cannot
#   represent values above 65504 (they overflow to inf) and loses integer
#   precision above 2048.
# NOTE(review): 'coordinates' is extracted above but has no entry here, and it
# does not appear in the train_df column listing shown later in the notebook —
# presumably it is dropped by to_dataframe; confirm that this is intended.
event_data_cols_meta = {
    'correct': bool,
    'duration': np.float32,
    'dwell_time': np.float32,
    'misses': np.float32,
    'round': np.float16,
    'total_duration': np.float32,
    'version': np.float16
}


with timer("Process Event Data"):

    # Pre-bind the column list so the bag can map a one-argument callable.
    flatten = partial(flatten, sel_cols=event_data_cols)

    event_data = train_event_data['event_data'].to_bag().map(flatten).to_dataframe(meta=event_data_cols_meta)
    # Column-wise concat relies on both frames having identically aligned
    # partitions/indexes (dask warns about this assumption); the result is
    # then joined to the (session, assessment) pairs.
    train = (dd.concat([train, event_data], axis=1)
        .merge(game_sequence_filter, left_on='game_session', right_on='game_session_x', how='inner')
        # Drop the rows where a session is paired with itself.
        .query('game_session_y != game_session')
        .drop(columns=['game_session_x'])
    )
    gc.collect()

Process Event Data - done in 0s


Concatenating dataframes with unknown divisions.
We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.


In [None]:
# Execute the lazy dask pipeline and hold the result as one in-memory frame.
with timer("Process train_df"):

    # compute() runs the graph with the threaded scheduler;
    # reset_index(drop=True) discards the resulting index for a fresh one.
    train_df = train.compute(scheduler='threads').reset_index(drop=True)
    print(train_df.shape)
    cpuStats()  # print CPU / memory figures right after materialising
    gc.collect()

In [13]:
# (rows, columns) of the materialised training frame.
train_df.shape

(37230923, 21)

## PreProcessing
- `correct`
    - 尚未驗證：與目標關聯度

In [33]:
# Distribution of the `correct` flag.
# NOTE(review): events whose event_data has no 'correct' key reach the bool
# cast as None (from record.get); presumably they are counted as False here,
# so False mixes "incorrect" with "not applicable" — confirm.
train_df.correct.value_counts()

False    35425335
True      1805588
Name: correct, dtype: int64

In [34]:
# Column listing of train_df.
# NOTE(review): the recorded output lists 22 names including `action_id`,
# while the shape cell above reports 21 columns — this output appears to have
# been captured after the EntitySet cell below added `action_id`, i.e. the
# cells were run out of order. Re-run top to bottom to confirm.
train_df.columns

Index(['action_id', 'event_id', 'game_session', 'timestamp', 'installation_id',
       'event_count', 'event_code', 'game_time', 'title', 'type', 'world',
       'correct', 'duration', 'dwell_time', 'misses', 'round',
       'total_duration', 'version', 'game_session_y', 'title_y', 'world_y',
       'game_time_y'],
      dtype='object')

## FeatureTools

In [None]:
# Empty featuretools EntitySet that the following cells populate.
es = ft.EntitySet(id="game_session_y_data")

In [None]:
# Register every row of train_df as the "actions" entity. make_index=True asks
# featuretools to create the `action_id` index column, since train_df has no
# column of that name yet.
# NOTE(review): the column listing recorded earlier in this notebook already
# shows `action_id` on train_df, which suggests this call adds the column to
# the passed DataFrame in place — confirm before re-running on the same frame.
es = es.entity_from_dataframe(entity_id="actions",
                              dataframe=train_df,
                              index='action_id',
                              make_index=True)

In [None]:
# Split a "game_sessions" entity out of "actions", keyed by game_session, and
# move the listed session-level columns onto it.
# NOTE(review): upstream, one game_session can be paired with several
# game_session_y values (every Assessment within the following 7 days).
# Keying on game_session alone presumably keeps a single game_session_y per
# session, while the duplicated action rows all stay attached to it — verify
# whether a composite (game_session, game_session_y) key is needed.
es = es.normalize_entity(base_entity_id="actions",
                         new_entity_id="game_sessions",
                         index="game_session",
                         additional_variables=["title", "type", "world",
                                               "game_session_y", "title_y", "world_y", "installation_id"])

In [None]:
# Split the target assessment sessions ("game_session_ys") out of
# "game_sessions", keyed by game_session_y, carrying the assessment's title,
# world and installation_id with them. This is the entity that feature
# synthesis targets later.
es = es.normalize_entity(base_entity_id="game_sessions",
                         new_entity_id="game_session_ys",
                         index="game_session_y",
                         additional_variables=["title_y", "world_y", "installation_id"])

In [None]:
# Top of the hierarchy: one "installation_ids" row per installation_id,
# split out of "game_session_ys". Must run after the previous normalisation,
# which is what places installation_id on "game_session_ys".
es = es.normalize_entity(base_entity_id="game_session_ys",
                         new_entity_id="installation_ids",
                         index="installation_id")

In [28]:
# es.plot()

In [None]:
with timer("Process Feature Matrix"):
    # Deep Feature Synthesis over the EntitySet, targeting the assessment
    # sessions; returns the feature matrix together with its feature definitions.
    dfs_result = ft.dfs(entityset=es, target_entity="game_session_ys")
    feature_matrix, feature_defs = dfs_result
    gc.collect()

In [None]:
with timer("Process Train Data DF"):
    # Only the session key and the target column are needed from the labels.
    session_labels = train_labels_df[['game_session', 'accuracy_group']]

    # Match the feature_matrix index against the labels' game_session, keep
    # only sessions present on both sides, then drop the identifier columns
    # so that features plus `accuracy_group` remain.
    train_data_df = (
        feature_matrix
        .merge(session_labels, left_index=True, right_on='game_session', how='inner')
        .drop(columns=['game_session', 'installation_id'])
    )

    gc.collect()

In [32]:
# Names of the generated features that mention `correct`.
[col for col in train_data_df.columns if 'correct' in col]

In [None]:
from scipy.stats import pearsonr

# Linear correlation between one generated feature and the target label.
xx = 'MAX(game_sessions.PERCENT_TRUE(actions.correct))'
a, b = train_data_df[xx], train_data_df['accuracy_group']
corr, _ = pearsonr(a, b)  # second element (p-value) is not used
print('Pearsons correlation: %.3f' % corr)