In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import dask
import dask.dataframe as dd
import featuretools as ft
import h2o
import numpy as np
import json
from pandas.io.json import json_normalize
from sklearn.model_selection import KFold
from functools import partial
from itertools import count, repeat, cycle



# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
def preprocess_train(dataset, dataset_event_data, cutting_sec=10000):
    """Build the training event table restricted to a look-back window per assessment.

    Every game session is paired with each "Assessment" session of the same
    installation that started between 0 (inclusive) and ``cutting_sec``
    (exclusive) seconds after it. The resulting (game_session_x,
    game_session_y) pairs are handed to ``add_event_data_info``.

    :param dataset: event rows; must provide the columns listed in ``session_cols``
    :param dataset_event_data: frame holding the ``event_data`` column, forwarded unchanged
    :param cutting_sec: width of the look-back window in seconds
    :return: whatever ``add_event_data_info`` returns for the selected pairs
    """
    session_cols = ['event_id', 'game_session', 'timestamp', 'installation_id',
                    'type', 'title', 'world', 'game_time']

    # One row per game session (the first row kept by drop_duplicates).
    first_rows = dataset.drop_duplicates(subset=['game_session'], keep='first')
    game_sequence = first_rows[session_cols].reset_index(drop=True)

    # Assessment sessions are the prediction targets (the `_y` side of the pairing).
    assessments = game_sequence.query('type == "Assessment"')
    game_sequence_y = assessments.reset_index(drop=True).copy()

    # Pair every session with every assessment of the same installation.
    paired = game_sequence.merge(game_sequence_y, on='installation_id', how='inner', suffixes=('_x', '_y'))
    paired = paired.assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())

    # Keep only sessions that started inside the window before the assessment.
    window_query = '0 <= diff < {0}'.format(cutting_sec)
    in_window = paired.query(window_query).reset_index(drop=True)
    game_sequence_filter = in_window[['game_session_x', 'game_session_y']]

    return add_event_data_info(dataset, dataset_event_data, game_sequence_filter)

In [None]:
def preprocess_test(dataset, dataset_event_data, cutting_sec=10000):
    """Build the test event table restricted to a look-back window per installation.

    For the test data the target key ``game_session_y`` is the installation id
    itself, and the window is anchored at the installation's latest timestamp:
    rows whose timestamp lies between 0 (inclusive) and ``cutting_sec``
    (exclusive) seconds before that moment are kept.

    :param dataset: event rows with ``installation_id``, ``game_session`` and ``timestamp``
    :param dataset_event_data: frame holding the ``event_data`` column, forwarded unchanged
    :param cutting_sec: width of the look-back window in seconds
    :return: whatever ``add_event_data_info`` returns for the selected pairs
    """
    pair_cols = ['game_session_x', 'game_session_y']

    # Latest timestamp observed for each installation.
    last_seen = (dataset
        .groupby('installation_id')['timestamp']
        .max()
        .reset_index()
        .rename(columns={'timestamp': 'timestamp_y'})
    )

    candidates = dataset.assign(game_session_y = lambda df: df['installation_id'])
    candidates = candidates.merge(last_seen, on='installation_id', how='inner')
    candidates = candidates.rename(columns={'game_session': 'game_session_x', 'timestamp': 'timestamp_x'})
    candidates = candidates.assign(diff = lambda df: (df['timestamp_y'] - df['timestamp_x']).dt.total_seconds())

    # Keep only rows inside the window, then reduce to unique session/target pairs.
    window_query = '0 <= diff < {0}'.format(cutting_sec)
    in_window = candidates.query(window_query)
    game_sequence_filter = in_window[pair_cols].drop_duplicates(subset=pair_cols, keep='first')

    return add_event_data_info(dataset, dataset_event_data, game_sequence_filter)

In [None]:
def add_event_data_info(dataset, dataset_event_data, game_sequence_filter):
    """Attach selected ``event_data`` fields to the events and keep only windowed sessions.

    The ``event_data`` records are flattened into typed columns, concatenated
    column-wise onto ``dataset``, inner-joined with ``game_sequence_filter`` on
    the game session, and finally materialised with ``compute``.

    :param dataset: dask DataFrame of event rows (needs ``game_session`` and ``timestamp``)
    :param dataset_event_data: dask DataFrame whose ``event_data`` column holds
        dict-like records (they are read with ``.get``)
    :param game_sequence_filter: frame with ``game_session_x`` (session to keep)
        and ``game_session_y`` (target the session belongs to)
    :return: computed DataFrame with a fresh 0..n-1 index and timezone-naive timestamps
    """
    def flatten_(record, sel_cols):
        # Absent keys come back as None, so every record exposes the same fields.
        return {col: record.get(col) for col in sel_cols}

    # NOTE(review): 'coordinates' is extracted here but has no entry in the meta
    # below - confirm whether it is meant to reach the output.
    event_data_cols = ['coordinates',
                       'correct', 'duration', 'dwell_time', 'misses', 'round', 'total_duration', 'version']
    # `np.bool` was only an alias of the builtin `bool`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    # NOTE(review): float16 cannot represent values above 65504 - verify that
    # 'duration', 'round' and 'version' always stay below that.
    # NOTE(review): casting to bool presumably turns a missing 'correct' into
    # False - confirm that this is intended.
    event_data_cols_meta = {
        'correct': bool,
        'duration': np.float16,
        'dwell_time': np.float32,
        'misses': np.float32,
        'round': np.float16,
        'total_duration': np.float32,
        'version': np.float16
    }
    flatten_ = partial(flatten_, sel_cols=event_data_cols)

    event_data = dataset_event_data['event_data'].to_bag().map(flatten_).to_dataframe(meta=event_data_cols_meta)
    dataset = (dd.concat([dataset, event_data], axis=1)
        .merge(game_sequence_filter, left_on='game_session', right_on='game_session_x', how='inner')
        .query('game_session_y != game_session') # filter out game_session_y data to prevent data leakage
        .drop(columns=['game_session_x'])
    )

    # Strip the timezone, materialise the lazy frame, and renumber the rows.
    dataset_df = dataset.assign(timestamp = lambda df: df['timestamp'].dt.tz_localize(None)).compute(scheduler='threads').reset_index(drop=True)

    return dataset_df

In [None]:
def create_entityset(dataset_df):
    """Wrap the event table in a featuretools EntitySet with a four-level hierarchy.

    Entities, from most to least granular:
    ``actions`` -> ``game_sessions`` -> ``game_session_ys`` -> ``installation_ids``.

    :param dataset_df: event rows; an ``action_id`` index is generated and
        ``timestamp`` is used as the time index
    :return: the populated EntitySet
    """
    entityset = ft.EntitySet(id="game_session_y_data")

    # Base entity: one row per event.
    entityset = entityset.entity_from_dataframe(
        entity_id="actions",
        dataframe=dataset_df,
        index='action_id',
        time_index='timestamp',
        make_index=True,
    )

    # Parent entities, split off one level at a time.
    # NOTE(review): the first step stores a single game_session_y per
    # game_session - confirm a session never belongs to several targets.
    normalization_steps = [
        dict(base_entity_id="actions",
             new_entity_id="game_sessions",
             index="game_session",
             additional_variables=["title", "type", "world", "game_session_y", "installation_id"]),
        dict(base_entity_id="game_sessions",
             new_entity_id="game_session_ys",
             index="game_session_y",
             additional_variables=["installation_id"]),
        dict(base_entity_id="game_session_ys",
             new_entity_id="installation_ids",
             index="installation_id"),
    ]
    for step in normalization_steps:
        entityset = entityset.normalize_entity(**step)

    return entityset

In [None]:
def create_feature_matrix(dataset_es):
    """Run Deep Feature Synthesis on the ``game_session_ys`` entity.

    :param dataset_es: EntitySet containing a ``game_session_ys`` entity
    :return: the feature matrix; the feature definitions returned alongside it are discarded
    """
    matrix, _definitions = ft.dfs(entityset=dataset_es, target_entity="game_session_ys")
    return matrix

In [None]:
def train_feature_pipe(dataset, dataset_event_data):
    """Training pipeline: preprocess the events, build the EntitySet, derive features."""
    windowed_events = preprocess_train(dataset, dataset_event_data)
    entityset = create_entityset(windowed_events)
    return create_feature_matrix(entityset)

In [None]:
def test_feature_pipe(dataset, dataset_event_data):
    """Test pipeline: preprocess the events, build the EntitySet, derive features."""
    windowed_events = preprocess_test(dataset, dataset_event_data)
    entityset = create_entityset(windowed_events)
    return create_feature_matrix(entityset)

### Validation label functions

In [None]:
def create_installation_kfold_label(train_df, n_split, random_state=None):
    """Assign every installation to one of ``n_split`` folds in round-robin fashion.

    The unique installation ids are shuffled and fold numbers 0..n_split-1 are
    dealt out in turn, so fold sizes differ by at most one. Splitting by
    installation keeps all rows of one installation in the same fold.

    :param train_df: frame with an ``installation_id`` column
    :param n_split: number of folds (positive integer)
    :param random_state: seed forwarded to ``Series.sample``; the default
        ``None`` keeps the shuffle unseeded, pass a value for reproducible folds
    :return: DataFrame with columns ``installation_id`` and ``fold_label``,
        ordered by the position of each id in ``unique()``
    :raises ValueError: if ``n_split`` is smaller than 1
    """
    if n_split < 1:
        raise ValueError('n_split must be a positive integer, got {0!r}'.format(n_split))

    install_ser = pd.Series(train_df['installation_id'].unique())
    shuffled = install_ser.sample(frac=1, random_state=random_state)

    # Deal fold ids 0, 1, ..., n_split-1, 0, 1, ... over the shuffled order.
    fold_ids = [position % n_split for position in range(len(shuffled))]

    result = (shuffled
        .to_frame(name='installation_id')
        .assign(fold_label = fold_ids)
        .sort_index()  # restore the original (unshuffled) row order
    )

    return result

In [None]:
def create_session_order_label(train_df):
    """Rank each target session by recency within its installation.

    The latest timestamp of every (game_session_y, installation_id) group is
    ranked in descending order per installation, so the most recent target of
    an installation receives ``session_order_label == 1``. Ranks are cast to
    ``int``, which truncates any fractional rank.

    :param train_df: frame with ``game_session_y``, ``installation_id`` and ``timestamp``
    :return: DataFrame with columns ``game_session_y`` and ``session_order_label``
    """
    group_keys = ['game_session_y', 'installation_id']
    latest_per_target = train_df.groupby(group_keys, as_index=False)['timestamp'].max()

    recency_rank = latest_per_target.groupby('installation_id')['timestamp'].rank(ascending=False)
    labelled = latest_per_target.assign(session_order_label = recency_rank.astype(int))

    return labelled[['game_session_y', 'session_order_label']]

In [None]:
# https://www.kaggle.com/khoongweihao/bayesian-opt-seed-blending-with-tuning-69

import scipy as sp

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Three cut points split continuous predictions into the classes 0..3.
    ``fit`` searches the cut points, ``predict`` applies them.
    """
    def __init__(self):
        # Placeholder until `fit` stores the optimizer result, which is
        # indexed with ['x'] by `predict` and `coefficients`.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Negative quadratic weighted kappa obtained when ``X`` is cut at ``coef``.

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: ``-QWK`` so that a minimizer maximizes the score
        """
        # Thresholds are sorted so the bin edges stay monotonic while the
        # optimizer moves them around.
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        # weights='quadratic' makes this the QWK metric the class is documented
        # to optimize; without it plain (unweighted) kappa would be used.
        return -cohen_kappa_score(y, X_p, weights='quadratic')

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        :param X: The raw predictions
        :param y: The ground truth labels
        """
        # Imported explicitly so the call does not depend on `scipy.optimize`
        # having been loaded as a side effect of another import.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X):
        """
        Make predictions with the thresholds found by ``fit``

        :param X: The raw predictions
        :return: ``X`` binned into the labels 0, 1, 2, 3
        """
        return pd.cut(X, [-np.inf] + list(np.sort(self.coef_['x'])) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

# Reading Data

In [None]:
# Column dtypes passed as `dtype=` when reading the event CSV files.
col_dtype = dict(
    event_id='object',
    game_session='object',
    installation_id='object',
    event_count='int16',
    event_code='category',
    game_time='int32',
    title='category',
    type='category',
    world='category',
)

In [None]:
# Train Dataset
train_csv_path = '/kaggle/input/data-science-bowl-2019/train.csv'
train_usecols = ['event_id', 'game_session', 'timestamp', 'installation_id', 'event_count',
                 'event_code', 'game_time', 'title', 'type', 'world']

# Typed event columns (everything except the JSON payload).
train = dd.read_csv(
    train_csv_path,
    parse_dates=['timestamp'],
    dtype=col_dtype,
    usecols=train_usecols,
)

# The `event_data` payload is read from the same file separately and decoded with json.loads.
train_event_data = dd.read_csv(
    train_csv_path,
    converters={'event_data': json.loads},
    usecols=['event_data'],
)

# Train target Column
train_labels_df = (
    dd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    .compute(scheduler='threads')
)

In [None]:
# Test Dataset
test_csv_path = '/kaggle/input/data-science-bowl-2019/test.csv'
test_usecols = ['event_id', 'game_session', 'timestamp', 'installation_id', 'event_count',
                'event_code', 'game_time', 'title', 'type', 'world']

# Typed event columns (everything except the JSON payload).
test = dd.read_csv(
    test_csv_path,
    parse_dates=['timestamp'],
    dtype=col_dtype,
    usecols=test_usecols,
)

# The `event_data` payload is read from the same file separately and decoded with json.loads.
test_event_data = dd.read_csv(
    test_csv_path,
    converters={'event_data': json.loads},
    usecols=['event_data'],
)

sample_submission = (
    dd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    .compute(scheduler='threads')
)

# Create Features

In [None]:
# Create Train Features
# The three steps below are what train_feature_pipe(train, train_event_data) does;
# they are run one by one because train_df is also needed for the validation labels.

# Raw Training Cutting Data
train_df = preprocess_train(train, train_event_data, cutting_sec=10000)

# Featuretools EntitySet
train_es = create_entityset(train_df)

# Featuretools Feature
train_features = create_feature_matrix(train_es)


# Validation Label
installation_kfold_label = create_installation_kfold_label(train_df, n_split=5)
session_order_label = create_session_order_label(train_df)

# Train Features add target column 'accuracy_group', installation_kfold_label, session_order_label
train_data_df = (train_features
    .reset_index()
    .merge(installation_kfold_label, on='installation_id', how='inner') # add kfold by installation_id
    .merge(session_order_label, on='game_session_y', how='inner') # add session order, session_order_label==1 is last assessment
    .merge(train_labels_df[['game_session', 'accuracy_group']], left_on='game_session_y', right_on='game_session', how='inner') # add target label
    .drop(columns=['game_session_y', 'game_session', 'installation_id'])
    .reset_index(drop=True)
)

# Train Column Schema
target_col = 'accuracy_group'
# 'fold_label' and 'session_order_label' are merged in above purely for
# validation splitting. They are not produced by create_feature_matrix, so they
# must not be listed as features: selecting them from another feature matrix
# would raise KeyError.
validation_label_cols = ['fold_label', 'session_order_label']
feature_cols = train_data_df.columns.drop([target_col] + validation_label_cols).to_list()

In [None]:
# Display the assembled training table (features, validation labels and target).
train_data_df

In [None]:
# Create Test Features
# The three steps below are what test_feature_pipe(test, test_event_data) does.

# Raw test events inside the look-back window
test_df = preprocess_test(test, test_event_data, cutting_sec=10000)

# Featuretools EntitySet built from those events
test_es = create_entityset(test_df)

# Featuretools feature matrix
test_features = create_feature_matrix(test_es)

# Drop the id column and keep the columns named in feature_cols, in that order.
# NOTE(review): every name in feature_cols has to exist in test_features,
# otherwise this selection raises KeyError.
test_data_df = test_features.drop(columns=['installation_id']).loc[:, feature_cols]

In [None]:
# Display the test feature table restricted to feature_cols.
test_data_df

## H2O AutoML

In [None]:
# AutoML entry point plus the scikit-learn metrics used to evaluate the predictions.
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split  # NOTE(review): not referenced in the visible cells
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# Initialise H2O with default settings; the H2OFrame and AutoML calls below depend on it.
h2o.init()

### Regression

In [None]:
train_data_hf = h2o.H2OFrame(train_data_df)

# Hold out the rows with session_order_label == 1 for validation and train on the rest.
# NOTE: `train` rebinds the name of the raw training frame loaded earlier; the
# names `train`, `valid`, `x` and `y` are kept because later cells read them.
train = train_data_hf[train_data_hf['session_order_label'] != 1]
valid = train_data_hf[train_data_hf['session_order_label'] == 1]

# Identify predictors and response
y = "accuracy_group"
fold_column = 'fold_label'
# The response, the CV fold id and the train/valid split marker are bookkeeping
# columns, not features: the split marker is constant (1) on the validation
# rows and takes other values on the training rows.
non_predictors = {y, fold_column, 'session_order_label'}
x = [col for col in train.columns if col not in non_predictors]

# The response is passed to AutoML as loaded (no conversion to a factor) for the regression setup.
aml = H2OAutoML(max_models=10, seed=1, stopping_metric='RMSE')
aml.train(x=x, y=y, training_frame=train, validation_frame=valid, fold_column=fold_column)

In [None]:
# View the AutoML Leaderboard for the `aml` run trained above
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# Fit the rounding thresholds on the training predictions ...
opr = OptimizedRounder()
train_raw_pred = aml.predict(train).as_data_frame()['predict']
train_truth = train[y].as_data_frame().values.reshape(-1)
opr.fit(train_raw_pred, train_truth)

# ... then round the validation predictions and score them with quadratic weighted kappa.
valid_raw_pred = aml.predict(valid).as_data_frame()['predict']
y_preds = opr.predict(valid_raw_pred)
y_trues = valid[y].as_data_frame()['accuracy_group']
cohen_kappa_score(y_trues, y_preds, weights='quadratic')

In [None]:
# Confusion matrix of the validation labels (y_trues, first argument) against
# the rounded predictions (y_preds, second argument).
confusion_matrix(y_trues, y_preds)

### Multiclass

In [None]:
# NOTE: disabled experiment - multiclass variant of the AutoML run, kept for
# reference only. Nothing in this cell is executed.
# train_data_hf = h2o.H2OFrame(train_data_df)

# train, valid = train_data_hf.split_frame(ratios=[0.7], seed=1)

# # Identify predictors and response
# x = train.columns
# y = "accuracy_group"
# x.remove(y)

# train[y] = train[y].asfactor()
# valid[y] = valid[y].asfactor()

# aml = H2OAutoML(max_runtime_secs=1200, seed=1, stopping_metric='RMSE')
# aml.train(x=x, y=y, training_frame=train, validation_frame=valid)

# # View the AutoML Leaderboard
# lb = aml.leaderboard
# lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

# y_preds = aml.predict(valid).as_data_frame()['predict']
# y_trues = valid[y].as_data_frame()['accuracy_group']

# cohen_kappa_score(y_trues, y_preds, weights='quadratic')