In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime, timedelta
import json
import pickle as pkl

from sklearn import tree

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the five competition CSV files into DataFrames.
# `train` / `test` are later filtered and enriched by the pre-processing cells;
# `train_labels` is merged into `train`, and `specs` is read by events_from_specs().
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
sample_submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')

# Helper Functions

In [None]:
def get_prev_assessment_accuracy(df):
    """Return the 'accuracy' value of the last 'Assessment' row in ``df``.

    ARGS:
    df -- (DataFrame) a player's history; rows are taken in their current order.

    RETURNS: the 'accuracy' of the final row whose 'type' is 'Assessment',
    or NaN when ``df`` has no such row or lacks the 'type'/'accuracy' columns
    (e.g. frames that were never merged with labels).
    """
    try:
        return df[df['type'] == 'Assessment'].tail(1)['accuracy'].item()
    except (KeyError, ValueError):
        # KeyError: a required column is missing.
        # ValueError: .item() on an empty selection (no assessment rows).
        # Anything else is a real bug and is allowed to propagate.
        return np.nan

In [None]:
def num_events_during_school_hours(df):
    """Count events whose timestamp falls between 07:45 and 15:30 (inclusive).

    ARGS:
    df -- (DataFrame) must have a datetime 'timestamp' column.

    RETURNS: (int) number of rows whose time of day lies in the window.

    The caller's frame is left untouched: the timestamp index is built on a
    copy instead of being assigned to ``df.index``.
    """
    # Only the time-of-day part is used; the calendar dates are irrelevant.
    start_time = datetime(2019, 9, 5, 7, 45).time()
    end_time = datetime(2019, 9, 6, 15, 30).time()
    by_timestamp = df.set_index('timestamp')  # returns a new frame, no mutation
    return len(by_timestamp.between_time(start_time, end_time))

In [None]:
def total_number_of_misses(df):
    """Count the rows of ``df`` whose 'num_correct' value equals zero."""
    zero_correct = df['num_correct'].eq(0)
    return zero_correct.sum()

In [None]:
def percentage_of_misses(df):
    """Fraction of rows with a recorded 'num_correct' whose value is zero.

    ARGS:
    df -- (DataFrame) must have a 'num_correct' column.

    RETURNS: misses / recorded attempts as a scalar in [0, 1], or NaN when no
    row has a non-null 'num_correct'.

    The denominator is the *count* of non-null values; dividing by the boolean
    mask itself would return a Series instead of a single number.
    """
    attempts = df['num_correct'].notnull().sum()
    if attempts == 0:
        return np.nan
    return total_number_of_misses(df) / attempts

In [None]:
def calc_exit_type_other_than_gamecompleted(df):
    """Return 1 when every recorded exit_type is 'game_completed', else 0.

    Each 'event_data' value is parsed as JSON; payloads without an
    'exit_type' key (or with a null one) are ignored. When no exit_type is
    recorded at all the result is 0.
    """
    def exit_type_of(payload):
        if 'exit_type' in payload:
            return payload['exit_type']
        return np.nan

    exit_types = df['event_data'].apply(json.loads).apply(exit_type_of).dropna()
    if exit_types.empty:
        return 0
    return 1 if (exit_types == 'game_completed').all() else 0

In [None]:
def num_events_correct_json(df):
    """Count the events whose JSON payload has a truthy 'correct' value.

    Payloads without a 'correct' key are skipped; returns 0 when no payload
    carries the key.
    """
    def correct_flag(payload):
        return int(payload['correct']) if 'correct' in payload else np.nan

    flags = df['event_data'].apply(json.loads).apply(correct_flag).dropna()
    if flags.empty:
        return 0
    return flags.sum()

In [None]:
def num_events_incorrect_json(df):
    """Count the events whose JSON payload has a falsy 'correct' value.

    Payloads without a 'correct' key are skipped; returns 0 when no payload
    carries the key.
    """
    def incorrect_flag(payload):
        return int(not payload['correct']) if 'correct' in payload else np.nan

    flags = df['event_data'].apply(json.loads).apply(incorrect_flag).dropna()
    if flags.empty:
        return 0
    return flags.sum()

In [None]:
def calc_game_play_total_time(df):
    """Total seconds spent in game sessions.

    For every 'game_session' the span between its earliest and latest
    'timestamp' is taken; the spans are summed and returned in seconds.
    """
    by_session = df.groupby('game_session')['timestamp']
    first_event = by_session.min()
    last_event = by_session.max()
    total_span = (last_event - first_event).sum()
    return total_span.total_seconds()

In [None]:
def num_wild_click_sessions(df, clicks, seconds):
    """Count wild click sessions (spurts of rapid events) in a history.

    A row is counted when its timestamp is less than ``seconds`` after the
    timestamp found ``clicks`` rows earlier. Warning: a long spurt is counted
    once per qualifying row, i.e. as several overlapping shorter ones.

    ARGS:
    df -- (DataFrame) the user's complete history
    clicks -- (int) row offset used to measure a click spurt.
    seconds -- (float) maximum duration of a click spurt, in seconds.

    RETURNS: the number of rows that close a wild click session
    """
    window = timedelta(seconds=seconds)
    elapsed = df['timestamp'].diff(periods=clicks)
    is_wild = elapsed < window
    return is_wild.sum()

In [None]:
def longest_wild_click_session(df, clicks, seconds):
    """Measure runs of consecutive wild click sessions (WCSs).

    A row is a WCS when its timestamp is less than ``seconds`` after the
    timestamp found ``clicks`` rows earlier. WCS rows at adjacent positions
    are merged into one run.

    ARGS:
    df -- (DataFrame) the user's complete history
    clicks -- (int) row offset used to measure a click spurt.
    seconds -- (float) maximum duration of a click spurt, in seconds.

    RETURNS:
    longest_run -- (int) number of WCS rows in the longest run (0 if none)
    number_of_runs -- (int) number of runs; an isolated WCS is a run of length 1

    Every run is counted, including the final one and single-row runs, and a
    run's length is the number of WCS rows it contains.
    """
    # Work positionally so duplicate index labels cannot matter.
    timestamps = df['timestamp'].reset_index(drop=True)
    is_wcs = (timestamps.diff(clicks) < timedelta(seconds=seconds)).values
    wcs_positions = np.flatnonzero(is_wcs)
    if wcs_positions.size == 0:
        return 0, 0

    # A new run starts wherever two successive WCS positions are not adjacent.
    run_breaks = np.flatnonzero(np.diff(wcs_positions) != 1) + 1
    run_edges = np.concatenate(([0], run_breaks, [wcs_positions.size]))
    run_lengths = np.diff(run_edges)
    return int(run_lengths.max()), int(run_lengths.size)

In [None]:
def length_of_current_play_session(df, break_minutes=30):
    """Duration in seconds of the most recent play session.

    A play session is active until there are no events for more than
    ``break_minutes`` minutes. The current session runs from the first row
    after the last such gap (or the first row overall) to the last row.

    ARGS:
    df -- (DataFrame) the user's history, ordered by 'timestamp'
    break_minutes -- (number) gap length that ends a play session

    RETURNS: (float) seconds; 0.0 for an empty history.
    """
    timestamps = df['timestamp'].reset_index(drop=True)
    if timestamps.empty:
        return 0.0
    starts_session = (timestamps.diff() > timedelta(minutes=break_minutes)).values
    session_starts = np.flatnonzero(starts_session)
    # No long gap at all means the whole history is one session.
    current_start = session_starts[-1] if session_starts.size else 0
    return (timestamps.iloc[-1] - timestamps.iloc[current_start]).total_seconds()

In [None]:
def num_play_sessions(df, break_minutes=30):
    """Number of play sessions in a history.

    A play session is active until there are no events for more than
    ``break_minutes`` minutes, so the count is the number of such gaps plus
    one.

    ARGS:
    df -- (DataFrame) the user's history, ordered by 'timestamp'
    break_minutes -- (number) gap length that ends a play session

    RETURNS: (int) number of sessions; 0 for an empty history.
    """
    timestamps = df['timestamp']
    if len(timestamps) == 0:
        return 0
    long_gaps = timestamps.diff() > timedelta(minutes=break_minutes)
    return int(long_gaps.sum()) + 1

In [None]:
def current_play_session_compared_to_mean(df, break_minutes=30):
    """Mean play-session duration minus the current session's duration.

    A play session is active until there are no events for more than
    ``break_minutes`` minutes; a session's duration is its last timestamp
    minus its first.

    ARGS:
    df -- (DataFrame) the user's history, ordered by 'timestamp'
    break_minutes -- (number) gap length that ends a play session

    RETURNS: (float) seconds, positive when the current (last) session is
    shorter than the player's mean; 0.0 for an empty history.
    """
    timestamps = df['timestamp'].reset_index(drop=True)
    if timestamps.empty:
        return 0.0
    # Positions of rows that follow a long gap, i.e. that open a new session.
    gap_positions = np.flatnonzero(
        (timestamps.diff() > timedelta(minutes=break_minutes)).values)
    start_positions = np.concatenate(([0], gap_positions))
    end_positions = np.concatenate((gap_positions - 1, [len(timestamps) - 1]))
    session_durations = (timestamps.iloc[end_positions].reset_index(drop=True)
                         - timestamps.iloc[start_positions].reset_index(drop=True))
    mean_duration = session_durations.mean()
    return mean_duration.total_seconds() - session_durations.iloc[-1].total_seconds()

In [None]:
def current_play_session_compared_to_median(df, break_minutes=30):
    """Median play-session duration minus the current session's duration.

    A play session is active until there are no events for more than
    ``break_minutes`` minutes; a session's duration is its last timestamp
    minus its first.

    ARGS:
    df -- (DataFrame) the user's history, ordered by 'timestamp'
    break_minutes -- (number) gap length that ends a play session

    RETURNS: (float) seconds, positive when the current (last) session is
    shorter than the player's median; 0.0 for an empty history.
    """
    timestamps = df['timestamp'].reset_index(drop=True)
    if timestamps.empty:
        return 0.0
    # Positions of rows that follow a long gap, i.e. that open a new session.
    gap_positions = np.flatnonzero(
        (timestamps.diff() > timedelta(minutes=break_minutes)).values)
    start_positions = np.concatenate(([0], gap_positions))
    end_positions = np.concatenate((gap_positions - 1, [len(timestamps) - 1]))
    session_durations = (timestamps.iloc[end_positions].reset_index(drop=True)
                         - timestamps.iloc[start_positions].reset_index(drop=True))
    median_duration = session_durations.median()
    return median_duration.total_seconds() - session_durations.iloc[-1].total_seconds()

In [None]:
def calc_global_session_lengths(df):
    """Per-title mean and median session length over all players.

    Each (installation_id, game_session) is reduced to its maximum
    'game_time' and 'event_count'; sessions whose game_time is not positive
    are ignored (the same rule calc_player_vs_global_features applies to a
    single player), then the sessions are averaged per 'title'.

    ARGS:
    df -- (DataFrame) events with 'installation_id', 'game_session', 'title',
          'game_time' and 'event_count' columns

    RETURNS:
    global_mean_session_lengths -- (DataFrame) indexed by title
    global_median_session_lengths -- (DataFrame) indexed by title
    """
    per_session = df.groupby(['installation_id', 'game_session'])[['title', 'game_time', 'event_count']].max()
    # The filter result must be kept; a bare indexing expression is a no-op.
    per_session = per_session[per_session['game_time'] > 0]
    per_title = per_session.groupby('title')[['game_time', 'event_count']]
    global_mean_session_lengths = per_title.mean()
    global_median_session_lengths = per_title.median()
    return global_mean_session_lengths, global_median_session_lengths

In [None]:
def calc_player_vs_global_features(df, global_mean_session_lengths, global_median_session_lengths):
    """Compare one player's per-title session lengths with the global ones.

    The player's sessions are reduced to their maximum 'game_time' and
    'event_count' (sessions with non-positive game_time are dropped) and
    averaged per title. For the titles present on both sides, the
    player-minus-global differences are summed.

    RETURNS: (game_time diff vs. mean, event_count diff vs. mean,
              game_time diff vs. median, event_count diff vs. median)
    """
    metrics = ['game_time', 'event_count']
    per_session = df.groupby('game_session')[['title'] + metrics].max()
    per_session = per_session.loc[per_session['game_time'] > 0]  # ignore sessions w/ no duration
    per_title = per_session.groupby('title')[metrics]

    def summed_gaps(player_stats, global_stats):
        # Inner join keeps only titles the player has actually played.
        joined = pd.merge(player_stats, global_stats,
                          on='title', how='inner', suffixes=['_player', '_global'])
        return tuple((joined[metric + '_player'] - joined[metric + '_global']).sum()
                     for metric in metrics)

    gt_vs_mean, ec_vs_mean = summed_gaps(per_title.mean(), global_mean_session_lengths)
    gt_vs_median, ec_vs_median = summed_gaps(per_title.median(), global_median_session_lengths)
    return (gt_vs_mean, ec_vs_mean,
            gt_vs_median, ec_vs_median)

In [None]:
def events_from_specs(specs_df=None):
    """Classify event ids as 'correct' / 'incorrect' from their spec text.

    ARGS:
    specs_df -- (DataFrame, optional) frame with 'event_id' and 'info'
                columns; defaults to the module-level ``specs`` frame.

    RETURNS:
    correct_events -- (set) event ids whose info contains '(Correct)' or
                      ' correct' (case-insensitive)
    incorrect_events -- (set) event ids whose info contains 'incorrect'
                        (case-insensitive)
    """
    df = specs if specs_df is None else specs_df
    info = df['info']
    # Raw string: '\(' is not a valid escape in a normal string literal.
    is_correct = (info.str.contains(r'\(Correct\)', case=False, na=False)
                  | info.str.contains(' correct', case=False, na=False))
    # A case-insensitive 'incorrect' already covers the '(Incorrect)' marker.
    is_incorrect = info.str.contains('incorrect', case=False, na=False)
    correct_events = set(df.loc[is_correct, 'event_id'])
    incorrect_events = set(df.loc[is_incorrect, 'event_id'])
    return correct_events, incorrect_events

In [None]:
def num_events_correct_eventid(df, correct_events):
    """Count rows of ``df`` whose 'event_id' is a member of ``correct_events``."""
    is_correct_event = df['event_id'].isin(correct_events)
    return is_correct_event.sum()

In [None]:
def num_events_incorrect_eventid(df, incorrect_events):
    """Count rows of ``df`` whose 'event_id' is a member of ``incorrect_events``."""
    is_incorrect_event = df['event_id'].isin(incorrect_events)
    return is_incorrect_event.sum()

In [None]:
def times_took_assess(df):
    """Count the events sharing the last row's installation_id and title.

    ARGS:
    df -- (DataFrame) non-empty history with 'installation_id', 'title' and
          'event_id' columns

    RETURNS: number of rows with a non-null 'event_id' whose
    (installation_id, title) equals that of the final row.

    NOTE(review): this counts event rows, not distinct game sessions, so it
    is not literally "times the assessment was taken" -- confirm the intent.

    Computed with a boolean mask rather than a group-wise transform over
    every column followed by an index merge, which is slow and multiplies
    rows when the index is not unique.
    """
    last_row = df.iloc[-1]
    same_group = ((df['installation_id'] == last_row['installation_id'])
                  & (df['title'] == last_row['title']))
    return df.loc[same_group, 'event_id'].count()

In [None]:
def calc_all_features(df):
    """Build the feature dict for one player history.

    ARGS:
    df -- (DataFrame) one installation's events, ordered by timestamp; the
          last row is treated as the moment of prediction.

    RETURNS: (dict) feature name -> value, including 'installation_id'.

    Besides ``df`` this function reads module-level names that the calling
    loop must set before every call: ``installation_id``,
    ``num_wild_click_sessions_grouped``, ``longest_wild_click_run``,
    ``p_v_g_gt_sum_mean``, ``p_v_g_ec_sum_mean``, ``p_v_g_gt_sum_median``,
    ``p_v_g_ec_sum_median``, plus the global ``correct_events`` and
    ``incorrect_events`` sets.
    """
    # Per-installation event-code count columns added during pre-processing.
    event_codes = (2010, 2020, 2025, 2030, 2035, 3010, 3020, 3021, 3110, 3120, 3121,
                   4020, 4025, 4030, 4035, 4040, 4070, 4080, 4090, 4100, 4110)
    feature = {'installation_id': installation_id,
               'worlds_played': max(df['world'].nunique(), 0),
               'time_as_player': max((df.iloc[-1]['timestamp'] - df.iloc[0]['timestamp']).total_seconds(), 0),
               'num_assessments': max((df['type'] == 'Assessment').sum(), 0),
               'avg_assessment_time': max(df[df['type'] == 'Assessment']['game_time'].mean(), 0),
               'tot_time_playing_game': max(df['game_time'].sum(), 0),
               'prev_assessment_accuracy': get_prev_assessment_accuracy(df),
               'num_events_during_school_hours': num_events_during_school_hours(df),
               'exit_type_other_than_gamecompleted': calc_exit_type_other_than_gamecompleted(df),
               'game_play_total_time': calc_game_play_total_time(df),
               'num_wild_click_sessions': num_wild_click_sessions(df, 5, 0.9),
               'num_wild_click_sessions_grouped': num_wild_click_sessions_grouped,
               'longest_wild_click_run': longest_wild_click_run,
               'length_of_current_play_session': length_of_current_play_session(df, 30),
               'num_play_sessions': num_play_sessions(df, 30),
               'current_play_session_compared_to_mean': current_play_session_compared_to_mean(df, 30),
               'current_play_session_compared_to_median': current_play_session_compared_to_median(df, 30),
               'p_v_g_gt_sum_mean': p_v_g_gt_sum_mean,
               'p_v_g_ec_sum_mean': p_v_g_ec_sum_mean,
               'p_v_g_gt_sum_median': p_v_g_gt_sum_median,
               'p_v_g_ec_sum_median': p_v_g_ec_sum_median,
               'num_events_correct_json': num_events_correct_json(df),
               # Must call the *incorrect* counter; calling the correct one
               # again would duplicate the previous feature.
               'num_events_incorrect_json': num_events_incorrect_json(df),
               'num_events_correct_eventid': num_events_correct_eventid(df, correct_events),
               'num_events_incorrect_eventid': num_events_incorrect_eventid(df, incorrect_events),
               'part_of_day': max(df["segment_of_day"].iloc[-1], 0),
               'assessment_taken': max(df["assessment"].iloc[-1], 0),
               'time_playing_for': max(df["game_time"].iloc[-1], 0),
               'times_took_asses': times_took_assess(df),
#                'total_number_of_misses': total_number_of_misses(df),
#                'percentage_of_misses': percentage_of_misses(df),
#                'avg_assessment_accuracy': df[df['type'] == 'Assessment']['accuracy'].mean(),
#                'total_correct': max(df['num_correct'].sum(), 0),
#                'total_incorrect': max(df['num_incorrect'].sum(), 0),
#                'playtime_vs_avg': time_compared_to_normal(df),
              }
    # One feature per event code, keyed by the code as a string, appended in
    # the same order as the tuple above.
    feature.update({str(code): max(df[code].iloc[-1], 0) for code in event_codes})
    return feature

# Train

In [None]:
# TRAIN - pre-processing
# Keep only installations that attempted an assessment and that have labels.
assessed_ids = train[train['type'] == 'Assessment']['installation_id'].unique()
train = train[train['installation_id'].isin(assessed_ids)]

labeled_ids = train_labels['installation_id'].unique()
train = train[train['installation_id'].isin(labeled_ids)]

train = (train
         .assign(timestamp=pd.to_datetime(train['timestamp']))
         .sort_values(['installation_id', 'timestamp']))

# train_labels carries its own 'title' column. Drop it before merging so that
# `train` keeps a plain 'title' column; otherwise pandas renames the pair to
# title_x / title_y and every helper that reads df['title'] raises KeyError.
train = pd.merge(train, train_labels.drop(columns='title', errors='ignore'),
                 on=['installation_id', 'game_session'], how='left')

# TRAIN - global calculations for features
global_mean_session_lengths, global_median_session_lengths = calc_global_session_lengths(train)
correct_events, incorrect_events = events_from_specs()

# Per-installation count of each event code, joined back onto every row.
# reindex() guarantees every expected column exists even if a code never occurs.
# NOTE(review): the counts cover the installation's whole history, including
# events after a given cut -- confirm this look-ahead is acceptable.
EVENT_CODES = [2010, 2020, 2025, 2030, 2035, 3010, 3020, 3021, 3110, 3120, 3121,
               4020, 4025, 4030, 4035, 4040, 4070, 4080, 4090, 4100, 4110]
event_code_counts = (train.groupby(['installation_id', 'event_code'])
                     .size()
                     .unstack(fill_value=0)
                     .reindex(columns=EVENT_CODES, fill_value=0)
                     .rename_axis(columns=None)
                     .reset_index())
train = pd.merge(train, event_code_counts, on=['installation_id'], how='left')

# Hour of day -> numeric code per day segment
# (presumably a target encoding of accuracy_group -- TODO confirm the source).
train["time_of_day"] = train["timestamp"].dt.hour
train["segment_of_day"] = np.select(
    [train["time_of_day"] < 7, train["time_of_day"] < 12, train["time_of_day"] < 18],
    [1.524239, 1.746823, 1.556186],
    default=1.502395)

# Numeric code per assessment title; any other title maps to NaN.
# The fifth entry is 'Mushroom Sorter': a second 'Bird Measurer' branch could
# never be reached, which left the 1.97 code unused.
ASSESSMENT_ENCODING = {"Bird Measurer (Assessment)": 1.14,
                       "Cart Balancer (Assessment)": 1.86,
                       "Cauldron Filler (Assessment)": 2.08,
                       "Chest Sorter (Assessment)": 0.67,
                       "Mushroom Sorter (Assessment)": 1.97}
train["assessment"] = train["title"].map(ASSESSMENT_ENCODING)

# Rows that start a labelled assessment; each one becomes a training example.
# Taken last so its index labels match the final `train` index.
train_cuts = train[(train['event_code'] == 2000) &
                   (train['type'] == 'Assessment') &
                   (train['accuracy_group'].notnull())]

In [None]:
# TRAIN - calculate features
# One training example per labelled assessment start: the player's history is
# cut at that row's timestamp and turned into a feature dict.
# NOTE(review): the cut keeps the assessment-start row itself, whose 'accuracy'
# comes from the label merge, so 'prev_assessment_accuracy' may expose the
# target for this very assessment -- confirm and exclude if so.
count = 0
X, y = [], []
total_cuts = len(train_cuts)
# Group once instead of scanning the whole frame for every cut.
history_by_installation = train.groupby('installation_id', sort=False)
for i, row in train_cuts.iterrows():
    count += 1
    installation_id, game_session = row['installation_id'], row['game_session']
    df = history_by_installation.get_group(installation_id)
    # get the timestamp of the cut row
    cut_time = row['timestamp']
    # cut the df; copy so the assignment below does not write into a slice of `train`
    df = df[df['timestamp'] <= cut_time].copy()
    if df.empty:
        continue
    df['accuracy_group'] = row['accuracy_group']

    # added player calculations, used to create features below
    # (calc_all_features reads these names from the module namespace)
    longest_wild_click_run, num_wild_click_sessions_grouped = longest_wild_click_session(df, 5, 0.9)
    p_v_g_stats = calc_player_vs_global_features(df, global_mean_session_lengths, global_median_session_lengths)
    p_v_g_gt_sum_mean, p_v_g_ec_sum_mean, p_v_g_gt_sum_median, p_v_g_ec_sum_median = p_v_g_stats

    feature = calc_all_features(df)

    X.append(feature)
    y.append(row['accuracy_group'])

    if count % 1000 == 0:
        print('progress = {}%'.format(count / total_cuts * 100))

#     if count > 2:  # note we're just making features for the first n cuts
#         break

X = pd.DataFrame(X)
X.to_csv('all_features.csv', index=False)
X.shape, len(y)

In [None]:
# TRAIN - fill missing values
# Only these two features are filled; a missing value becomes 0.
fill_vals = dict(assessment_taken=0,
                 prev_assessment_accuracy=0)
X.fillna(value=fill_vals, inplace=True)

In [None]:
# TRAIN - augment data to even out the classes
labels = pd.Series(y, name='labels')
X_y = pd.concat([X, labels], axis=1)
X_y['labels'] = X_y['labels'].astype('int64')

zeros = X_y.loc[X_y['labels'] == 0]
ones = X_y.loc[X_y['labels'] == 1]
twos = X_y.loc[X_y['labels'] == 2]

# Oversample by duplication: one extra copy of class 0, three extra copies of
# classes 1 and 2; class 3 is left as is.
X_y = pd.concat([X_y, zeros] + [ones] * 3 + [twos] * 3)
X = X_y.drop(columns=['labels'])
y = X_y['labels']

In [None]:
# Fit on the feature columns only: 'installation_id' is an identifier string,
# not a feature, and the TEST section excludes it before predicting.
feature_cols = [col for col in X.columns if col not in ['installation_id']]
# Fixed seed so Restart & Run All reproduces the same tree.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(np.array(X[feature_cols]), y)

# Test

In [None]:
# Parse timestamps, then order every installation's events chronologically.
test = test.assign(timestamp=pd.to_datetime(test['timestamp']))
test = test.sort_values(['installation_id', 'timestamp'])

In [None]:
# TEST - added global calculations, used to create features
# (the global session lengths and correct/incorrect event ids computed in the
# TRAIN section are reused here)
# global_mean_session_lengths, global_median_session_lengths = calc_global_session_lengths(pd.concat([train, test]))
# correct_events, incorrect_events = events_from_specs()

# Per-installation count of each event code, joined back onto every row.
# reindex() guarantees every expected column exists even if a code never
# occurs in the test data (plain column selection would raise KeyError).
EVENT_CODES = [2010, 2020, 2025, 2030, 2035, 3010, 3020, 3021, 3110, 3120, 3121,
               4020, 4025, 4030, 4035, 4040, 4070, 4080, 4090, 4100, 4110]
event_code_counts = (test.groupby(['installation_id', 'event_code'])
                     .size()
                     .unstack(fill_value=0)
                     .reindex(columns=EVENT_CODES, fill_value=0)
                     .rename_axis(columns=None)
                     .reset_index())
test = pd.merge(test, event_code_counts, on=['installation_id'], how='left')

# Hour of day -> numeric code per day segment (same codes as the TRAIN section).
test["time_of_day"] = test["timestamp"].dt.hour
test["segment_of_day"] = np.select(
    [test["time_of_day"] < 7, test["time_of_day"] < 12, test["time_of_day"] < 18],
    [1.524239, 1.746823, 1.556186],
    default=1.502395)

# Numeric code per assessment title; any other title maps to NaN.
# The fifth entry is 'Mushroom Sorter': a second 'Bird Measurer' branch could
# never be reached, which left the 1.97 code unused.
ASSESSMENT_ENCODING = {"Bird Measurer (Assessment)": 1.14,
                       "Cart Balancer (Assessment)": 1.86,
                       "Cauldron Filler (Assessment)": 2.08,
                       "Chest Sorter (Assessment)": 0.67,
                       "Mushroom Sorter (Assessment)": 1.97}
test["assessment"] = test["title"].map(ASSESSMENT_ENCODING)

In [None]:
# TEST - create features
# One feature row per installation, built from its full test history.
count = 0
X, y = [], []
# Group once instead of scanning the whole frame for every installation.
history_by_installation = test.groupby('installation_id', sort=False)
test_installation_ids = test['installation_id'].unique()
total_installations = len(test_installation_ids)
for installation_id in test_installation_ids:
    count += 1
    df = history_by_installation.get_group(installation_id)

    # added player calculations, used to create features below
    # (calc_all_features reads these names from the module namespace)
    longest_wild_click_run, num_wild_click_sessions_grouped = longest_wild_click_session(df, 5, 0.9)
    p_v_g_stats = calc_player_vs_global_features(df, global_mean_session_lengths, global_median_session_lengths)
    p_v_g_gt_sum_mean, p_v_g_ec_sum_mean, p_v_g_gt_sum_median, p_v_g_ec_sum_median = p_v_g_stats

    feature = calc_all_features(df)

    X.append(feature)

    if count % 100 == 0:
        print('progress = {}%'.format(count / total_installations * 100))

#     if count > 2:  # note we're just making features for the first n cuts
#         break

X = pd.DataFrame(X)
X.shape

In [None]:
# TEST - fill missing values (same two features as in the TRAIN section)
fill_vals = dict(assessment_taken=0,
                 prev_assessment_accuracy=0)
X.fillna(value=fill_vals, inplace=True)

In [None]:
# Feature columns are everything except the identifier and the prediction
# column itself, so re-running this cell does not feed a previous run's
# 'accuracy_group' back into the classifier.
cols = [col for col in X.columns if col not in ['installation_id', 'accuracy_group']]

X_1 = np.array(X[cols])
X['accuracy_group'] = clf.predict(X_1)

submission = X[['installation_id', 'accuracy_group']]
submission

In [None]:
# index=False: write only the installation_id and accuracy_group columns,
# without an extra unnamed index column.
submission.to_csv('submission.csv', index=False)