In [4]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import pickle

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score 

In [5]:
# Directory that holds the competition CSV files.  Defaults to the original
# author's location; set the DSB2019_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path = os.environ.get('DSB2019_DATA_DIR',
                      r'C:\Users\micha\Desktop\Kaggle - Data Science Bowl 2019')
# The data files are opened by bare file name further down, so the working
# directory has to point at the data folder.
os.chdir(path)

In [21]:
# Raw event log, read with pandas' default parsing (no index column, no date parsing).
train = pd.read_csv(filepath_or_buffer='train.csv')
# Label table; it is joined to the events on installation_id / game_session later on.
train_labels = pd.read_csv(filepath_or_buffer='train_labels.csv')

In [22]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched.  Integer columns are
    narrowed to the first of int8/int16/int32/int64 whose range contains the
    column's min and max.  Float columns are narrowed to float16 or float32
    when the value range fits, otherwise stored as float64.  The float check
    looks at range only, so narrowing a float column can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min or max is
                # still representable, so it must not force a wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column then keeps its current dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [23]:
# Downcast both frames, events first and labels second; each call prints its own
# memory summary and the names are rebound to the returned frames.
train, train_labels = reduce_mem_usage(train), reduce_mem_usage(train_labels)

Mem. usage decreased to 778.73 Mb (18.2% reduction)
Mem. usage decreased to  0.49 Mb (48.2% reduction)


In [24]:
# Tag every labelled assessment with a sequential 1-based id so that it survives
# the merge below and can be used as a grouping key.
train_labels['assessment_id'] = range(1,len(train_labels)+1)

# Merge train_labels info with event data info to look at accuracy groups with respect to
# different event info.  how='left' keeps every event row; events whose
# (installation_id, game_session) has no label row get NaN in the label columns.
train_order = pd.merge_ordered(left = train, right = train_labels, 
                 on = ['installation_id','game_session'], how='left')


### Fill in missing accuracy group values for all events and
### set index to variables that dataframe will be grouped by
# First set index and sort within an [installation_id, world] pair by timestamp.
# NOTE(review): timestamp is still a string here (it is not parsed on load), so
# the sort is lexicographic -- presumably chronological for a uniform ISO-8601
# format; confirm.
train_time = train_order.set_index(['installation_id','world','timestamp'])
del train_order

train_time = train_time.sort_index()

# Backfill NaN values of events around assessments, bounded by installation_id and world:
# each NaN takes the next non-NaN value that follows it inside the same group.
train_time = train_time.groupby(level=['installation_id','world']).bfill()

train_time.info()

# 'Clip' type activities don't have a game time, so we need to fill in a time.
# Assume about 2min / clip (120000 -- presumably milliseconds; confirm).
# Only the game_time column is rewritten, so only that column is scanned for zeros.
clip_rows = train_time['type'] == 'Clip'
train_time_clips = train_time.loc[clip_rows, 'game_time'].replace(0,120000)
train_time.loc[clip_rows,'game_time'] = train_time_clips

# Then fill everything that is still NaN with 0, i.e. the events that the
# backfill could not associate with an assessment.
train_time = train_time.fillna(0)

# Add game_session, accuracy_group, assessment_id and type to the index so they
# can be used as grouping levels.
train_time = train_time.set_index(['game_session','accuracy_group','assessment_id','type'],append=True)

# Sum game_time for each installation_id -> world -> assessment_id -> type ->
# accuracy_group combination to get an estimate of the time spent in each type
# of activity.  The keyword form of agg names the single output column 'game_time'.
train_time_agg = (
    train_time['game_time']
    .groupby(level=['installation_id','world','assessment_id','type','accuracy_group'])
    .agg(game_time='sum')
)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11341042 entries, (0001e90f, MAGMAPEAK, 2019-09-06T17:54:17.519Z) to (fffc0583, TREETOPCITY, 2019-10-10T15:15:12.483Z)
Data columns (total 14 columns):
event_id          object
game_session      object
event_data        object
event_count       int16
event_code        int16
game_time         int32
title_x           object
type              object
title_y           object
num_correct       float64
num_incorrect     float64
accuracy          float16
accuracy_group    float64
assessment_id     float64
dtypes: float16(1), float64(4), int16(2), int32(1), object(6)
memory usage: 1.1+ GB


In [25]:
# Pivot the 'type' index level into columns (one game_time column per activity
# type) and zero-fill the combinations that never occurred, so the running
# total below has no gaps.
train_time = train_time_agg.unstack('type').fillna(0)

# The aggregated frame is not used again in this cell.
del train_time_agg

# Running total of game time per activity type, restarted for every
# [installation_id, world] pair.
train_time = (
    train_time['game_time']
    .groupby(level=['installation_id','world'])
    .transform(pd.DataFrame.cumsum)
)

# Spread the worlds across the columns as well, again zero-filling the
# [installation_id, world] combinations that have no row.
train_time = train_time.unstack('world').fillna(0)

# Move index level 2 out of the index and into a regular column.  Position 2 is
# the right one when world has been unstacked (as above); without that unstack
# the level to move would be 3.
train_time = train_time.reset_index(level=2)

In [26]:
# Model the accuracy group from the gameplay-time features.

# Feature matrix: every column except the target and the 'Assessment' columns.
X = train_time.drop(columns=['accuracy_group','Assessment']).values

# Response vector.
y = train_time['accuracy_group'].values

# k-nearest-neighbours classifier; each prediction is a vote among the
# 4 closest training rows.
knn = KNeighborsClassifier(n_neighbors=4)

# Fit the classifier to the data (left as the last expression so the fitted
# estimator is echoed as the cell output).
knn.fit(X, y)

  new_axis = axis.drop(labels, errors=errors)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [27]:
# Accuracy of knn on X, y as currently held in memory.
train_accuracy = knn.score(X, y)
print(train_accuracy)

# Predictions for the same rows.
y_pred = knn.predict(X)

# Five-fold cross-validation of the same estimator on the same data.
cv_scores = cross_val_score(knn, X, y, cv = 5)

# Per-fold scores, then their mean.
print(cv_scores)

mean_cv_score = cv_scores.mean()
print("Average 5-Fold CV Score: {}".format(mean_cv_score))

0.7522407170294494
[0.61594414 0.61908919 0.61574276 0.63285797 0.62281723]
Average 5-Fold CV Score: 0.6212902584334759


In [29]:
# Persist the fitted classifier.  Binary mode is required because pickle writes
# bytes; the with-block makes sure the file is flushed and closed once the dump
# finishes, even if pickling raises.
with open('knn_World2_pickle_file', 'wb') as knnPickle:
    # pickle.dump(obj, file): source object first, destination file second
    pickle.dump(knn, knnPickle)