In [1]:
import pandas as pd
import dask
import dask.dataframe as dd
import featuretools as ft
import numpy as np
import json
import gc
import time
import shap
import sys, os, psutil
import cmath

from tqdm import tqdm
from pandas.io.json import json_normalize
from contextlib import contextmanager
from functools import partial
from itertools import count, repeat

import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

./input/train_labels.csv
./input/test.csv
./input/specs.csv
./input/train.csv
./input/sample_submission.csv


In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
    
def cpuStats():
    print("########## CPU STATS ############")
    print(sys.version)
    print(psutil.cpu_percent())
    print(psutil.virtual_memory())  # physical memory usage
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30
    print('memory GB:', memoryUse)
    print("########## CPU STATS ############")

In [3]:
def preprocess_train(dataset, dataset_event_data, cutting_sec=10000):
    game_sequence = (dataset
    .drop_duplicates(subset=['game_session'], keep='first')
        [['event_id', 'game_session', 'timestamp', 'installation_id', 'type', 'title', 'world', 'game_time']]
        .reset_index(drop=True)
    )

    game_sequence_y = (game_sequence
        .query('type == "Assessment"')
        .reset_index(drop=True)
        .copy()
    )

    game_sequence_filter = (game_sequence
        .merge(game_sequence_y, on='installation_id', how='inner', suffixes=('_x', '_y'))
        .assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())
        .query('0 <= diff < {0}'.format(cutting_sec)) # determine the cutting gap
        .reset_index(drop=True)
        [['game_session_x', 'game_session_y']]
    )
    
    dataset_df = add_event_data_info(dataset, dataset_event_data, game_sequence_filter)  
    return dataset_df

In [4]:
def preprocess_test(dataset, dataset_event_data, cutting_sec=10000):
    game_sequence_filter = (dataset
        .assign(game_session_y = lambda df: df['installation_id'])
        .merge(dataset.groupby('installation_id')['timestamp'].max().reset_index().rename(columns={'timestamp': 'timestamp_y'}), on='installation_id', how='inner')
        .rename(columns={'game_session': 'game_session_x', 'timestamp': 'timestamp_x'})
        .assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())
        .query('0 <= diff < {0}'.format(cutting_sec)) # determine the cutting gap
        [['game_session_x', 'game_session_y']]
        .drop_duplicates(subset=['game_session_x', 'game_session_y'], keep='first')
    )
    
    dataset_df = add_event_data_info(dataset, dataset_event_data, game_sequence_filter)  
    
    return dataset_df

In [5]:
def add_event_data_info(dataset, dataset_event_data, game_sequence_filter):
    
    def flatten_(record, sel_cols):
        return dict(map(lambda x: (x, record.get(x)), sel_cols))

    event_data_cols = ['coordinates',
                       'correct', 'duration', 'dwell_time', 
                       'misses', 'round', 'total_duration', 'version']
    event_data_cols_meta = {
        'correct': np.bool, 
        'duration': np.float16, 
        'dwell_time': np.float32, 
        'misses': np.float32, 
        'round': np.float16, 
        'total_duration': np.float32, 
        'version': np.float16
    }
    flatten_ = partial(flatten_, sel_cols=event_data_cols)

    event_data = dataset_event_data['event_data'].to_bag().map(flatten_).to_dataframe(meta=event_data_cols_meta)
    dataset = (dd.concat([dataset, event_data], axis=1)
        .merge(game_sequence_filter, left_on='game_session', right_on='game_session_x', how='inner')
        .query('game_session_y != game_session') # filter out game_session_y data to prevent data leakage
        .drop(columns=['game_session_x'])
    )
    
    dataset_df = dataset.compute(scheduler='threads').reset_index(drop=True)
    
    return dataset_df

In [6]:
def create_entityset(dataset_df):
    es = ft.EntitySet(id="game_session_y_data")

    es = es.entity_from_dataframe(entity_id="actions",
                                  dataframe=dataset_df,
                                  index='action_id',
                                  make_index=True)

    es = es.normalize_entity(base_entity_id="actions",
                             new_entity_id="game_sessions",
                             index="game_session",
                             additional_variables=["title", "type", "world", 
                                                   "game_session_y", "installation_id"])

    es = es.normalize_entity(base_entity_id="game_sessions",
                             new_entity_id="game_session_ys",
                             index="game_session_y",
                             additional_variables=["installation_id"])

    es = es.normalize_entity(base_entity_id="game_session_ys",
                             new_entity_id="installation_ids",
                             index="installation_id")
    
    return es

In [7]:
def create_feature_matrix(dataset_es):
    feature_matrix, feature_defs = ft.dfs(entityset=dataset_es, target_entity="game_session_ys")    #1 1/16/2020
    return feature_matrix

In [8]:
def train_feature_pipe(dataset, dataset_event_data):
    return create_feature_matrix(create_entityset(preprocess_train(dataset, dataset_event_data)))

In [9]:
def test_feature_pipe(dataset, dataset_event_data):
    return create_feature_matrix(create_entityset(preprocess_test(dataset, dataset_event_data)))

# Reading Data

In [10]:
col_dtype = {
    'event_id':'object', 'game_session':'object', 'installation_id':'object',
    'event_count':'int16', 'event_code':'category', 'game_time':'int32', 'title':'category', 
    'type':'category', 'world':'category'
}

In [11]:
# Train Dataset
train = dd.read_csv('./input/train.csv', 
                    parse_dates=['timestamp'], 
                    dtype=col_dtype,
                    usecols=['event_id', 'game_session', 'timestamp', 
                             'installation_id', 'event_count', 'event_code', 
                             'game_time', 'title', 'type', 'world'])

train_event_data = dd.read_csv('./input/train.csv', 
                               converters={'event_data': json.loads},
                               usecols=['event_data'])

# Train target Column
train_labels_df = dd.read_csv('./input/train_labels.csv').compute(scheduler='threads')

In [12]:
# Test Dataset
test = dd.read_csv('./input/test.csv', 
                    parse_dates=['timestamp'], 
                    dtype=col_dtype,
                    usecols=['event_id', 'game_session', 'timestamp', 
                             'installation_id', 'event_count', 'event_code', 
                             'game_time', 'title', 'type', 'world'])

test_event_data = dd.read_csv('./input/test.csv', 
                               converters={'event_data': json.loads},
                               usecols=['event_data'])

sample_submission = dd.read_csv('./input/sample_submission.csv').compute(scheduler='threads')

# Create Features
- Create Train Features
- train_features = train_feature_pipe(train, train_event_data)
- create_feature_matrix(create_entityset(preprocess_train(dataset, dataset_event_data)))




In [13]:
with timer("Process train_df"):
    # Raw Training Cutting Data
    train_df = preprocess_train(train, train_event_data, cutting_sec = 2592000) # Cutting Sec
    cpuStats()
    gc.collect()

Concatenating dataframes with unknown divisions.
We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.


########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
32.6
svmem(total=17179869184, available=10691715072, percent=37.8, used=5099737088, free=5583634432, active=2628849664, inactive=5091782656, wired=2470887424)
memory GB: 3.7191390991210938
########## CPU STATS ############
Process train_df - done in 294s


In [21]:
train_df.columns.tolist()

In [14]:
with timer("Process train_es"):
    # Featuretools EntitySet
    train_es = create_entityset(train_df)
    cpuStats()
    gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
26.1
svmem(total=17179869184, available=9601208320, percent=44.1, used=5922172928, free=6120763392, active=3436826624, inactive=3470835712, wired=2485346304)
memory GB: 3.3749008178710938
########## CPU STATS ############
Process train_es - done in 72s


In [20]:
# train_es.plot()

In [15]:
with timer("Featuretools Feature"):
    # Featuretools Feature
    train_features = create_feature_matrix(train_es)
    cpuStats()
    gc.collect()

########## CPU STATS ############
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
19.5
svmem(total=17179869184, available=11488231424, percent=33.1, used=4109799424, free=8383426560, active=1665593344, inactive=2944565248, wired=2444206080)
memory GB: 0.26819610595703125
########## CPU STATS ############
Featuretools Feature - done in 849s


In [16]:
# Train Features add target column 'accuracy_group'

train_data_df = (train_features
        .merge(train_labels_df[['game_session', 'accuracy_group']], 
           left_index=True, right_on='game_session', how='inner')
        .drop(columns=['game_session', 'installation_id'])
        .reset_index(drop=True))

# Train Column Schema
target_col = 'accuracy_group'
feature_cols = train_data_df.columns.drop([target_col]).to_list()

In [18]:
train_data_df.

Unnamed: 0,COUNT(game_sessions),NUM_UNIQUE(game_sessions.type),NUM_UNIQUE(game_sessions.world),NUM_UNIQUE(game_sessions.title),MODE(game_sessions.type),MODE(game_sessions.world),MODE(game_sessions.title),SUM(actions.event_count),SUM(actions.version),SUM(actions.misses),...,installation_ids.MEAN(actions.dwell_time),installation_ids.MEAN(actions.duration),installation_ids.MEAN(actions.round),installation_ids.COUNT(actions),installation_ids.PERCENT_TRUE(actions.correct),installation_ids.NUM_UNIQUE(actions.event_code),installation_ids.NUM_UNIQUE(actions.event_id),installation_ids.MODE(actions.event_code),installation_ids.MODE(actions.event_id),accuracy_group
0,18,3,3,14,Clip,TREETOPCITY,All Star Sorting,218832.0,42.0,90.0,...,,inf,2.964844,9907,0.036136,38,175,3010,0a08139c,3
1,7,4,1,7,Clip,TREETOPCITY,Air Show,201350.0,20.0,5.0,...,,inf,2.964844,9907,0.036136,38,175,3010,0a08139c,0
2,1,1,1,1,Assessment,TREETOPCITY,Bird Measurer (Assessment),15312.0,4.0,0.0,...,,inf,2.964844,9907,0.036136,38,175,3010,0a08139c,3
3,1,1,1,1,Assessment,TREETOPCITY,Mushroom Sorter (Assessment),306.0,2.0,0.0,...,,inf,2.964844,9907,0.036136,38,175,3010,0a08139c,2
4,9,4,1,7,Clip,TREETOPCITY,Crystals Rule,25573.0,5.0,3.0,...,,inf,2.964844,9907,0.036136,38,175,3010,0a08139c,3


In [17]:
from scipy.stats import pearsonr
xx = 'MAX(game_sessions.PERCENT_TRUE(actions.correct))'
a = train_data_df[xx]
b = train_data_df.accuracy_group
corr, _ = pearsonr(a, b)
print('Pearsons correlation: %.5f' % corr)

Pearsons correlation: 0.1344


In [18]:
# Create Test Features
#test_features = test_feature_pipe(test, test_event_data)
with timer("Test Features"):
    # Raw testing Cutting Data
    test_df = preprocess_test(test, test_event_data, cutting_sec=259200)

    # Featuretools EntitySet
    test_es = create_entityset(test_df)

    # Featuretools Feature
    test_features = create_feature_matrix(test_es)

    test_data_df = (test_features
        .drop(columns=['installation_id'])
        [feature_cols]
    )

Concatenating dataframes with unknown divisions.
We're assuming that the indexes of each dataframes are 
 aligned. This assumption is not generally safe.


Test Features - done in 74s


In [17]:
test_data_df

NameError: name 'test_data_df' is not defined