In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('../data/train.csv')
train_labels = pd.read_csv('../data/train_labels.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [3]:
train.shape

(11341042, 11)

In [2]:
feat = pd.read_csv('emily_features.csv')

In [4]:
feat.shape

(17577, 5)

### Data Cleaning

Ya don't need 'em if they didn't take an Assessment!

In [4]:
assessed_ids = train[train['type'] == 'Assessment']['installation_id'].unique()
train = train[train['installation_id'].isin(assessed_ids)]
train.shape

(8294138, 11)

Ya don't need 'em if they ain't got no labels!

In [5]:
labeled_ids = train_labels['installation_id'].unique()
train = train[train['installation_id'].isin(labeled_ids)]
train.shape

(7734558, 11)

Turn `timestamp` into a datetime (for sorting purposes, just to be safe), and sort

In [6]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
train.sort_values(['installation_id', 'timestamp'], inplace=True)

***

Notes: 
- not all assessments w/in a single `installation_id` have labels 
- in the test set, for each `installation_id` a random assessment is picked and you have to evaluate that one, so you can only use data prior to that assessment to make a prediction
- when training, make features out of all data prior to an assessment (if assessment has a label), then attach the label of that assessment to make `X` and `y`.
    - think: "what would I have guessed if the data was cut at this assessment"
- therefore, you can make multiple predictions per `installation_id`, one prediction per combination of `installation_id` and `game_session` (only if game_session has a label)

- every assessment begins with `event_code == 2000` and `type == Assessment`, so this is where we need to cut the data off

challenge: we need to evaluate the data for each `installation_id+game_session` pair for each cut
- this is going to take a long ass time
    - how can we do it quicker?

merge the `train` df w/ the `train_labels` df b/c we want to make sure we cut only where there is a label

In [7]:
train_raw = train.copy()  # just in case we need the original data

train = pd.merge(train, train_labels, on=['installation_id', 'game_session'], how='left')

filter the `train` data using a boolean mask, this is what we'll loop through (slowly)

In [8]:
train_cuts = train[(train['event_code'] == 2000) & 
                   (train['type'] == 'Assessment') & 
                   (train['accuracy_group'].notnull())]

how many loop iterations? count the combos of `installation_id` and `game_session`

In [9]:
combos = train_cuts['installation_id'] + train_cuts['game_session']
combos.nunique(), combos.count()  # make sure there are no duplicates

(17690, 17690)

build features

### original

In [10]:
def get_prev_assessment_accuracy(df):
    
    try:
        acc = df[df['type'] == 'Assessment'].tail(1)['accuracy'].item()
    except:
        acc = np.nan
    return acc

In [11]:
%%time

count = 0
X, y = [], []
for i, row in train_cuts.iterrows():
    count += 1
    installation_id, game_session = row['installation_id'], row['game_session']
    df = train[train['installation_id'] == installation_id]
    # get the timestamp of the cut row
    cut_time = df.loc[i,'timestamp']
    # cut the df
    df = df[df['timestamp'] <= cut_time]
    # cut off last row (assumes df is sorted by time)
    df = df.iloc[:-1,:]
    if df.empty:
        continue
    df['accuracy_group'] = row['accuracy_group']
    
#     feature = {'worlds_played': max(df['world'].nunique(), 0),
#                'total_correct': max(df['num_correct'].sum(), 0),
#                'time_played': max((df.iloc[-1]['timestamp'] - df.iloc[0]['timestamp']).total_seconds(), 0),
#                'total_incorrect': max(df['num_incorrect'].sum(), 0),
#                'num_assessments': max((df['type'] == 'Assessment').sum(), 0)}

    feature = {
        'avg_assessment_time': max(df[df['type'] == 'Assessment']['game_time'].mean(),0),
        'tot_time_playing_game': max(df['game_time'].sum(), 0),
        'prev_assessment_accuracy': get_prev_assessment_accuracy(df),
        'avg_assessment_accuracy': df[df['type'] == 'Assessment']['accuracy'].mean()
    }
    
    X.append(feature)
    y.append(row['accuracy_group'])
    
    if count % 1000 == 0:
        print('progress = {}%'.format(count/17690*100))
    
#     if count > 10:  # note we're just making features for the first n cuts
#         break

X = pd.DataFrame(X)
X.shape, len(y)

progress = 5.652911249293386%
progress = 11.305822498586773%
progress = 16.95873374788016%
progress = 22.611644997173546%
progress = 28.26455624646693%
progress = 33.91746749576032%
progress = 39.5703787450537%
progress = 45.22328999434709%
progress = 50.87620124364047%
progress = 56.52911249293386%
progress = 62.182023742227244%
progress = 67.83493499152064%
progress = 73.48784624081401%
progress = 79.1407574901074%
progress = 84.7936687394008%
progress = 90.44657998869418%
progress = 96.09949123798756%
Wall time: 3h 14min 2s


note: there is some loss here (users with no data) and we need to figure out how to handle these cases

In [None]:
X

In [None]:
len(X), len(y)

In [12]:
X.to_csv('emily_features.csv')

In [13]:
X

Unnamed: 0,avg_assessment_accuracy,avg_assessment_time,prev_assessment_accuracy,tot_time_playing_game
0,,,,35855793
1,1.000000,17534.645833,1.000000,71139760
2,0.355556,37020.718519,0.000000,75295894
3,0.488235,28933.117647,,196389427
4,0.490566,25943.733624,0.500000,230405691
5,,,,140477200
6,1.000000,6937.851852,,174095222
7,0.171429,35845.643357,0.000000,179080057
8,,27.333333,,82
9,,,,0


K-Fold Cross Validation

In [None]:
from sklearn import tree
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True)

X, y = np.array(X), np.array(y)
total_runs = skf.get_n_splits()
scores = []
count = 0
for train_index, test_index in skf.split(X, y):
    count += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    score = cohen_kappa_score(clf.predict(X_test), y_test, weights='quadratic')
    scores.append(score)
    print('Run {}/{} -- kappa_score: {}'.format(count, total_runs, score))
print('\nmean score: {}'.format(sum(scores)/len(scores))) 

## Feature Ideas

- mean time in an assessment (exclude outliers?)
- cumulative time playing the game?
- installation duration mean (what does this mean? same as above?)
- accuracy of game/activity directly prior to the assessment
- has it been a long gaming session?


- "exit_type":"game_completed" -- if a player has any other exit types, they are a bad player
- event_id=cdd22e43 -- this event could show a player is unskilled