In [257]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelBinarizer
from datetime import datetime

In [258]:
# using the disaggregated dataset from https://osf.io/akqt4/
filename=os.path.join('crowdstorm_disaggregated.csv') 
df = pd.read_csv(filename)
df = df.dropna()

In [259]:
df.columns

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp', 'skintone', 'allreds', 'allredsStrict',
       'refCount'],
      dtype='object')

In [260]:
data = np.array(df)

In [261]:
lb = LabelBinarizer()
club = lb.fit_transform(data[:,2])

In [262]:
lb = LabelBinarizer()
league = lb.fit_transform(data[:,3])

In [263]:
lb = LabelBinarizer()
position = lb.fit_transform(data[:,7])

In [264]:
refCountry = data[:,20] # are ordinal numbers = transform to one-hot

In [265]:
skintone = data[:,28]

In [266]:
birthday = np.array([ datetime.strptime(date, '%d.%m.%Y') for date in data[:,4]], dtype='datetime64[D]')
age =  np.array(np.datetime64(datetime.now()) - birthday, dtype="timedelta64[Y]")

In [267]:
games = data[:,8]

In [268]:
victories = data[:,9]

In [269]:
ties = data[:,10]

In [270]:
defeats = data[:,11]

In [271]:
goals = data[:,12]

In [272]:
yellowCards = data[:,13]

In [273]:
refNum = data[:,19]

In [274]:
allReds = data[:,29]

In [275]:
# Exclude playerShort, player, club (adds too many dimensions at the moment), birthday (use age instead), 
x = np.column_stack((league, position, refCountry, skintone, age, games, victories, ties, defeats, goals, yellowCards, refNum, refCountry))
y = np.array(allReds / games)

In [276]:
x.shape

(350269, 27)

In [277]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [278]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

In [279]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
prediction = reg.predict(X_test)
mean_squared_error(y_test, prediction)

0.0016856561289310193

In [280]:
#regr = RandomForestRegressor()
#regr = regr.fit(X_train, y_train)
#prediction = regr.predict(X_test)
#mean_squared_error(y_test, prediction)

In [281]:
errors1 = []
errors2 = []
rating = x[:,0]
for it in range(19):
    randomRating = np.random.permutation(rating)
    data = x.copy()
    data[:,0] = randomRating

    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.4, random_state=0)

    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    prediction = reg.predict(X_test)

    error = mean_squared_error(y_test, prediction)
    errors1.append(error)

    # Takes very long (ca. 5min)
    # regr = RandomForestRegressor()
    # regr = regr.fit(X_train, y_train)
    # prediction = regr.predict(X_test)

    error = mean_squared_error(y_test, prediction)
    errors2.append(error)

print(np.average(errors1))
print(np.average(errors2))

0.0016856726083804797
0.0016856726083804797
