In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm

cm = sns.color_palette("coolwarm", as_cmap=True)

In [22]:
cleaned_data = pd.read_csv('./cleaned.csv', index_col='count')
features = ['wr0', 'rating0', 'apm0', 'pps0', 'vs0'] # does not include the '1' version of the stats because they are of two different players
def compute_vif(df, features):    
    vif = pd.DataFrame()
    vif['Features'] = features
    vif['vif score'] = [variance_inflation_factor(df, i) for i in range(len(features))]
    return vif

compute_vif(cleaned_data[features], features)

Unnamed: 0,Features,vif score
0,wr0,17.38442
1,rating0,16.840981
2,apm0,229.071913
3,pps0,44.261417
4,vs0,265.420226


**Modify Features**
* combine wr0 and wr1 into wr delta
* combine rating0 and rating1 into rating delta
* combine apm0 and apm1 into apm delta
* combine pps0 and pps1 into pps delta
* combine vs0 and vs1 into vs delta

In [23]:

cleaned_data['wr delta'] = cleaned_data['wr0'] - cleaned_data['wr1']
cleaned_data['rating delta'] = cleaned_data['rating0'] - cleaned_data['rating1']
cleaned_data['apm delta'] = cleaned_data['apm0'] - cleaned_data['apm1']
cleaned_data['pps delta'] = cleaned_data['pps0'] - cleaned_data['pps1']
cleaned_data['vs delta'] = cleaned_data['vs0'] - cleaned_data['vs1']

features2 = ['wr delta', 'rating delta', 'apm delta', 'pps delta', 'vs delta']
compute_vif(cleaned_data[features2], features2)

Unnamed: 0,Features,vif score
0,wr delta,1.203991
1,rating delta,1.710192
2,apm delta,12.131076
3,pps delta,1.205804
4,vs delta,13.534938


* Drop apm delta

In [24]:
features3 = ['wr delta', 'rating delta', 'pps delta', 'vs delta']
compute_vif(cleaned_data[features3], features3)

Unnamed: 0,Features,vif score
0,wr delta,1.203824
1,rating delta,1.553054
2,pps delta,1.168093
3,vs delta,1.541321


In [25]:
cleaned_data[features3].corr().style.background_gradient(cmap=cm)

Unnamed: 0,wr delta,rating delta,pps delta,vs delta
wr delta,1.0,0.351434,0.111456,0.264498
rating delta,0.351434,1.0,0.256996,0.498696
pps delta,0.111456,0.256996,1.0,0.348153
vs delta,0.264498,0.498696,0.348153,1.0


**Modifying the Data**

When the data was retrieved, the winner of the match was always the first player, which is the player that has a winrate of wr0, and the loser of the match was always the second player.  

As a result, the winner was always 0 and never 1. So I'm going to flip half of the data so that the player who has a winrate of wr0 is the player that loses.

The data ranges are quite different between features, for instance the avg rating feature ranges from 100 to 25000 while the range for pps delta is around 1. I will use min max normalization to scale the data to help with gradient descent.

In [26]:
flip_data = cleaned_data[['wr delta','rating delta', 'pps delta', 'vs delta']]
# Flip data
m = flip_data.shape[0]
print(m)

flip_data['winner'] = np.zeros(m, dtype='int')
for i in tqdm(range(m)):
    if (i % 2 == 1):
        flip_data.iat[i,0] *= -1
        flip_data.iat[i, 1] *= -1
        flip_data.iat[i, 2] *= -1
        flip_data.iat[i, 3] *= -1
        flip_data.iat[i, 4] = 1
flip_data.to_csv('./flip_data.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flip_data['winner'] = np.zeros(m, dtype='int')


459899


100%|██████████| 459899/459899 [00:26<00:00, 17418.83it/s]


In [38]:
# Scale Data
def min_max (column):
    min = column.min()
    max = column.max()

    norm = (column - min) / (max - min)

    return norm

flip_data = pd.read_csv('./flip_data.csv')
n = flip_data.shape[1]
final_data = pd.DataFrame()
final_data['count'] = [i for i in range(m)]
final_data.set_index('count', inplace=True)

final_data['wr delta'] = min_max(flip_data['wr delta'])
final_data['rating delta'] = min_max(flip_data['rating delta'])
final_data['pps delta'] = min_max(flip_data['pps delta'])
final_data['vs delta'] = min_max(flip_data['vs delta'])
final_data['winner'] = flip_data['winner']

final_data.to_csv('./final_data.csv')