In [168]:
# VarianceThreshold is imported from the public package path; the private
# `sklearn.feature_selection.variance_threshold` submodule was deprecated and
# is no longer importable in current scikit-learn releases.
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.cluster import KMeans
from pandas import read_excel

import warnings
# NOTE(review): this blanket filter also hides deprecation and
# SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Variance filter with the library default threshold spelled out
# (0.0 only removes constant features).
vt = VarianceThreshold(threshold=0.0)

In [3]:
# Load the raw dataset from the workbook that sits next to the notebook.
dataset_path = 'slo_dataset.xlsx'
slo_df = read_excel(dataset_path)

In [136]:
# Replace the index with a fresh 0..n-1 range (in place); the old index is
# discarded rather than kept as a column.
slo_df.reset_index(inplace=True, drop=True)

In [137]:
# Dtype names passed to `select_dtypes` to pick the numeric stat columns.
numerics = [
    'int16', 'int32', 'int64',        # integer widths
    'float16', 'float32', 'float64',  # floating-point widths
]

In [138]:
# Keep only the numeric columns. The explicit copy makes `stats_df` an
# independent frame, so the in-place drops and column assignments in the
# following cells never act on (or warn about) a slice of `slo_df`.
stats_df = slo_df.select_dtypes(include=numerics).copy()

### Removing identifiers

In [139]:
# Identifier columns are recognised by the substring 'Id' in their name.
id_columns = [name for name in stats_df.columns if 'Id' in name]
stats_df.drop(columns=id_columns, inplace=True)

### Removing useless numeric columns

In [140]:
# Drop numeric columns that are not used as stats.
# `errors='ignore'` keeps the cell re-runnable: once the columns are gone, a
# second execution would otherwise raise KeyError.
stats_df.drop(columns=['season', 'champLevel'], errors='ignore', inplace=True)

### Normalizing the rest of the stats by time

#### Convert game duration from seconds to minutes

In [141]:
# Divide the raw duration by 60 and store it as a new column.
stats_df['gameDuration_in_minutes'] = stats_df['gameDuration'].div(60)

### Exclude columns that aren't affected by time

In [142]:
# A column is normalised by time only if its name contains none of these
# substrings.
excluded_markers = ('_at_', 'tt', 'gameDuration')
stats_to_normalize = [
    name
    for name in stats_df.columns
    if not any(marker in name for marker in excluded_markers)
]

In [143]:
# Turn every selected stat into a per-minute rate by dividing each column,
# row by row, by the game length in minutes.
minutes_played = stats_df['gameDuration_in_minutes']
stats_normalized_df = stats_df[stats_to_normalize].div(minutes_played, axis=0)

In [144]:
# Columns whose name contains '_at_' or 'tt' are kept as-is (not divided by
# game length).
time_independent_columns = [
    name for name in stats_df.columns if ('_at_' in name) or ('tt' in name)
]
not_time_affected_stats_df = stats_df[time_independent_columns]

### Clustering playstyles by position

In [145]:
# Distinct position labels, in order of first appearance in the dataset.
positions = slo_df['position'].unique().tolist()

In [146]:
# Display the position labels found in the dataset.
positions

['TOP', 'JUNG', 'MID', 'ADC', 'SUPP']

In [147]:
# Results container filled by the clustering loop: position label -> dict
# holding that position's stats frame and the selected features per cluster.
stats_by_position = dict()

In [169]:
# For every position: cluster the per-minute stats into playstyles with
# KMeans, then, inside each cluster, keep the features that score highest on
# a chi-squared test against the `win` label.
#
# Populates `stats_by_position[p]` with:
#   'X'                          -- the position's stats plus a 'clusters' column
#   'top_10_features_by_cluster' -- one list of feature names per cluster id
for p in positions:
    # Preprocessing: select this position's rows by value (rather than by a
    # hard-coded stride of 5), drop every column that has a missing value,
    # and take a copy so the assignment below never writes into a slice of
    # `stats_normalized_df`.
    stats = stats_normalized_df[slo_df.position == p].dropna(axis=1).copy()
    feature_cols = stats.columns.tolist()

    # Clustering. A fixed seed keeps cluster ids stable across re-runs.
    km = KMeans(n_clusters=3, random_state=0)
    stats['clusters'] = km.fit_predict(X=stats[feature_cols])

    stats_by_position[p] = {'X': stats, 'top_10_features_by_cluster': []}
    for cluster_id in range(km.n_clusters):
        # Only the real features go into the selector; the 'clusters' label
        # column is constant within a cluster and is not a playstyle stat.
        members = stats.loc[stats['clusters'] == cluster_id, feature_cols]
        # `.loc` replaces the removed `.ix` indexer; rows are matched by label.
        wins = slo_df.loc[members.index, 'win']
        # Guard k so a position with fewer than 10 usable features still works.
        selector = SelectKBest(chi2, k=min(10, len(feature_cols)))
        selector.fit(X=members, y=wins)
        top_features = members.columns[selector.get_support()].tolist()
        stats_by_position[p]['top_10_features_by_cluster'].append(top_features)

In [171]:
# Display the selected feature names for each cluster of the 'SUPP' position.
stats_by_position['SUPP']['top_10_features_by_cluster']

[['damageDealtToObjectives',
  'damageDealtToTurrets',
  'damageSelfMitigated',
  'goldEarned',
  'goldSpent',
  'magicalDamageTaken',
  'physicalDamageDealt',
  'totalDamageDealt',
  'totalHeal',
  'trueDamageDealt'],
 ['damageDealtToObjectives',
  'damageDealtToTurrets',
  'goldEarned',
  'goldSpent',
  'magicDamageDealt',
  'magicDamageDealtToChampions',
  'totalDamageDealt',
  'totalDamageTaken',
  'totalHeal',
  'trueDamageDealt'],
 ['damageDealtToObjectives',
  'damageDealtToTurrets',
  'damageSelfMitigated',
  'goldEarned',
  'goldSpent',
  'magicalDamageTaken',
  'physicalDamageDealt',
  'totalDamageDealt',
  'totalDamageTaken',
  'totalHeal']]

In [86]:
# Rebind `vt` to a stricter filter: features with variance <= 0.5 are removed.
vt = VarianceThreshold(threshold=0.5)

In [89]:
# Fit the variance filter.
# NOTE(review): `top_c_2` is not defined in any cell visible here; this cell
# appears to rely on leftover kernel state and would raise NameError on a
# fresh kernel unless it is defined elsewhere -- confirm.
vt.fit(X=top_c_2)

VarianceThreshold(threshold=0.5)

In [90]:
# Column names of `top_stats` selected by the fitted filter's support mask.
# NOTE(review): `top_stats` is not defined in any visible cell, and `vt` was
# fitted on `top_c_2`; the mask only lines up if both frames share the same
# columns in the same order -- confirm.
top_stats.iloc[:, vt.get_support()].columns

Index(['damageDealtToObjectives', 'damageDealtToTurrets',
       'damageSelfMitigated', 'goldEarned', 'goldSpent',
       'largestCriticalStrike', 'longestTimeSpentLiving', 'magicDamageDealt',
       'magicDamageDealtToChampions', 'magicalDamageTaken', 'perk0Var1',
       'perk0Var2', 'perk1Var1', 'perk1Var2', 'perk2Var1', 'perk3Var1',
       'perk3Var2', 'perk4Var1', 'perk5Var1', 'perk5Var2',
       'physicalDamageDealt', 'physicalDamageDealtToChampions',
       'physicalDamageTaken', 'totalDamageDealt',
       'totalDamageDealtToChampions', 'totalDamageTaken', 'totalHeal',
       'totalMinionsKilled', 'totalTimeCrowdControlDealt', 'trueDamageDealt',
       'trueDamageDealtToChampions', 'trueDamageTaken'],
      dtype='object')

In [144]:
# Same expression as the earlier cell, but the recorded output differs, so
# `vt` and/or `top_stats` were changed in between by cells not shown here.
# NOTE(review): not reproducible from the visible notebook -- confirm lineage.
top_stats.iloc[:, vt.get_support()].columns

Index(['damageSelfMitigated', 'magicDamageDealt', 'physicalDamageDealt',
       'totalDamageDealt', 'trueDamageDealt'],
      dtype='object')

In [36]:
# Display `top_stats` with missing values replaced by each column's mean.
# The result is not assigned, so `top_stats` itself is left unchanged.
# NOTE(review): `top_stats` is not defined in any visible cell -- confirm.
top_stats.fillna(top_stats.mean())

Unnamed: 0,assists,baronKills_team,blue_trinkets_killed,blue_trinkets_placed,ccs_at_10,ccs_at_15,ccs_at_20,ccs_at_5,control_wards_killed,control_wards_placed,...,turretKills,undefined_killed,undefined_placed,unrealKills,visionScore,visionWardsBoughtInGame,wardsKilled,wardsPlaced,yellow_trinkets_killed,yellow_trinkets_placed
0,1,0,0,0,66,111,156,33,3,3,...,2,1,0,0,27,3,5,14,1,11
5,1,1,1,0,87,123,156,32,2,5,...,0,1,0,0,25,5,5,17,1,12
0,2,0,1,0,79,127,175,38,5,7,...,0,0,0,0,43,8,7,17,1,10
5,8,2,1,0,86,136,174,34,5,1,...,2,0,0,0,30,1,6,14,0,13
0,3,1,0,0,82,137,193,39,1,2,...,0,0,0,0,31,2,1,15,0,13
5,2,0,0,0,86,148,193,36,3,3,...,0,0,0,0,32,3,3,16,0,13
0,8,0,0,0,77,120,168,31,1,4,...,2,0,0,0,34,4,2,19,1,15
5,0,1,0,0,85,123,158,31,2,5,...,0,0,0,0,29,6,2,18,0,13
0,0,0,0,0,79,116,151,35,5,3,...,0,0,0,0,28,3,5,17,0,14
5,4,1,2,0,76,119,165,34,4,3,...,3,1,0,0,34,3,8,16,0,13
