In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV

In [33]:
df = pd.read_csv('../data/high_diamond_ranked_10min.csv')

In [34]:
# Columns in our DataFrame
df.columns

Index(['gameId', 'blueWins', 'blueWardsPlaced', 'blueWardsDestroyed',
       'blueFirstBlood', 'blueKills', 'blueDeaths', 'blueAssists',
       'blueEliteMonsters', 'blueDragons', 'blueHeralds',
       'blueTowersDestroyed', 'blueTotalGold', 'blueAvgLevel',
       'blueTotalExperience', 'blueTotalMinionsKilled',
       'blueTotalJungleMinionsKilled', 'blueGoldDiff', 'blueExperienceDiff',
       'blueCSPerMin', 'blueGoldPerMin', 'redWardsPlaced', 'redWardsDestroyed',
       'redFirstBlood', 'redKills', 'redDeaths', 'redAssists',
       'redEliteMonsters', 'redDragons', 'redHeralds', 'redTowersDestroyed',
       'redTotalGold', 'redAvgLevel', 'redTotalExperience',
       'redTotalMinionsKilled', 'redTotalJungleMinionsKilled', 'redGoldDiff',
       'redExperienceDiff', 'redCSPerMin', 'redGoldPerMin'],
      dtype='object')

In [35]:
# This is the mean wins, it's our baseline. As expected blue wins 50% of the time
df['blueWins'].mean()

0.4990383642069035

In [36]:
# All our datatypes are as expected and ready to use
df.dtypes

gameId                            int64
blueWins                          int64
blueWardsPlaced                   int64
blueWardsDestroyed                int64
blueFirstBlood                    int64
blueKills                         int64
blueDeaths                        int64
blueAssists                       int64
blueEliteMonsters                 int64
blueDragons                       int64
blueHeralds                       int64
blueTowersDestroyed               int64
blueTotalGold                     int64
blueAvgLevel                    float64
blueTotalExperience               int64
blueTotalMinionsKilled            int64
blueTotalJungleMinionsKilled      int64
blueGoldDiff                      int64
blueExperienceDiff                int64
blueCSPerMin                    float64
blueGoldPerMin                  float64
redWardsPlaced                    int64
redWardsDestroyed                 int64
redFirstBlood                     int64
redKills                          int64


In [37]:
# No values are missing
df.isnull().mean()

gameId                          0.0
blueWins                        0.0
blueWardsPlaced                 0.0
blueWardsDestroyed              0.0
blueFirstBlood                  0.0
blueKills                       0.0
blueDeaths                      0.0
blueAssists                     0.0
blueEliteMonsters               0.0
blueDragons                     0.0
blueHeralds                     0.0
blueTowersDestroyed             0.0
blueTotalGold                   0.0
blueAvgLevel                    0.0
blueTotalExperience             0.0
blueTotalMinionsKilled          0.0
blueTotalJungleMinionsKilled    0.0
blueGoldDiff                    0.0
blueExperienceDiff              0.0
blueCSPerMin                    0.0
blueGoldPerMin                  0.0
redWardsPlaced                  0.0
redWardsDestroyed               0.0
redFirstBlood                   0.0
redKills                        0.0
redDeaths                       0.0
redAssists                      0.0
redEliteMonsters            

In [38]:
# Let's look at the correlations our features have with our target
lt.figure(figsize=(4,8))
sns.heatmap(df.corr()[['blueWins']].
            sort_values(by='blueWins', ascending=False),cmap='coolwarm', annot=True, vmin=-1);

NameError: name 'lt' is not defined

In [None]:
# Some of these are right skewed. I could log them but I'm not going to unless my r2 scores are low
df.hist(figsize=(15, 15));

In [None]:
# print our corrilation values
df_plot = df.corr()[['blueWins']].sort_values(by='blueWins', ascending=False)
df_plot['feature'] = df_plot.index
df_plot.drop(axis = 0, index = 'blueWins', inplace = True)
df_plot.drop(labels = 'gameId', inplace = True)
df_plot.head(50)

In [None]:
# Plot our correlation values
plt.figure(figsize=(20,20))
sns.barplot(y = 'feature', x = 'blueWins', data = df_plot, orient = 'h');

In [None]:
# We'll drop all features that have less than a 0.2 correlation with our target
df_plot.drop((df_plot[abs(df_plot['blueWins']) < .2]).index, axis=0, inplace = True)

In [None]:
# New plot of our correlations with features removed
plt.figure(figsize=(20,20))
sns.barplot(y = 'feature', x = 'blueWins', data = df_plot, orient = 'h');

In [None]:
# Define our features
features = df_plot.index

In [None]:
# Create a new dataframe and populate it with our features
df_out = pd.DataFrame()
df_out = df[features]

In [None]:
# Add our target feature to the dataframe
df_out['blueWins'] = df['blueWins']
df_out.columns

In [16]:
# create csv with our cleaned data
df_out.to_csv('../data/cleaned_data.csv')