In [27]:
import pandas as pd
import numpy as np
import glob

In [28]:
files = glob.glob('features/*.csv')
files

['features\\emily_features.csv',
 'features\\labels.csv',
 'features\\sheehan_features.csv']

In [29]:
dflist = []
for f in files:
    df = pd.read_csv(f)
    dflist.append(df)
    
df = pd.concat(dflist, axis=1)
df.shape

(17577, 33)

In [30]:
exclude_cols = ['Unnamed: 0', 'installation_id', 'game_session']
df = df[[col for col in df.columns if col not in exclude_cols]]
df.shape

(17577, 31)

In [31]:
# to fix class imbalance
df['labels'] = df['labels'].astype('int64')

zeros = df[df['labels'] == 0]
ones = df[df['labels'] == 1]
twos = df[df['labels'] == 2]

df = pd.concat([df, zeros, ones, ones, ones, twos, twos, twos])

In [32]:
y = df['labels']
del df['labels']

In [33]:
df.shape

(35533, 30)

In [34]:
avg_assessment_accuracy = df['avg_assessment_accuracy'].mean()
avg_assessment_time = df['avg_assessment_time'].mean()

fill_vals = {'avg_assessment_accuracy': avg_assessment_accuracy,
             'avg_assessment_time': avg_assessment_time,
             'prev_assessment_accuracy': avg_assessment_accuracy,
             'assessment_taken': 0,
             'times_took_asses': 0
             }

df.fillna(fill_vals, inplace=True)

In [35]:
X = df

In [36]:
from sklearn.preprocessing import StandardScaler

X2 = X.copy()
for col in X2.columns:
    scaler = StandardScaler()
    data = np.array(X2[col]).reshape(-1, 1)
    scaler.fit(data)
    X2[col] = scaler.transform(data)



In [37]:
from sklearn import tree
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

model = tree.DecisionTreeClassifier
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True)

X, y = np.array(X), np.array(y)
total_runs = skf.get_n_splits()
scores = []
count = 0
for train_index, test_index in skf.split(X, y):
    count += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = model()
    clf = clf.fit(X_train, y_train)
    score = cohen_kappa_score(clf.predict(X_test), y_test, weights='quadratic')
    scores.append(score)
    print('Run {}/{} -- kappa_score: {}'.format(count, total_runs, score))
print('\nmean score: {}'.format(sum(scores)/len(scores))) 

Run 1/5 -- kappa_score: 0.691865072486128
Run 2/5 -- kappa_score: 0.6946123876255179
Run 3/5 -- kappa_score: 0.6740761655879589
Run 4/5 -- kappa_score: 0.6786301698034738
Run 5/5 -- kappa_score: 0.6927399014396027

mean score: 0.6863847393885362


In [127]:
from sklearn import tree
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeClassifier

model = tree.DecisionTreeClassifier
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True)

X, y = np.array(X2), np.array(y)
total_runs = skf.get_n_splits()
scores = []
count = 0
for train_index, test_index in skf.split(X, y):
    count += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = model()
    clf = clf.fit(X_train, y_train)
    score = cohen_kappa_score(clf.predict(X_test), y_test, weights='quadratic')
    scores.append(score)
    print('Run {}/{} -- kappa_score: {}'.format(count, total_runs, score))
print('\nmean score: {}'.format(sum(scores)/len(scores))) 

Run 1/5 -- kappa_score: 0.6026190039710442
Run 2/5 -- kappa_score: 0.5758403198600048
Run 3/5 -- kappa_score: 0.5705334143075844
Run 4/5 -- kappa_score: 0.5795979639568171
Run 5/5 -- kappa_score: 0.5853871022940575

mean score: 0.5827955608779016
