In [125]:
from pandas import DataFrame, Series, concat, read_excel
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, SelectKBest, VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the first dataset from its Excel workbook.
slo_dataset_path = 'data/slo_dataset.xlsx'
slo_df = read_excel(slo_dataset_path)

In [3]:
# Remove four descriptive columns from slo_df.
# NOTE(review): presumably lck_df does not have them, since both frames report
# 151 columns afterwards — confirm.
slo_df = slo_df.drop(columns=['player_name', 'season', 'split', 'team_name'])

In [4]:
# Read the second dataset from its Excel workbook.
lck_dataset_path = 'data/lck_s8_dataset.xlsx'
lck_df = read_excel(lck_dataset_path)

In [5]:
# Compare the shapes of the two frames before they are concatenated.
lck_df.shape, slo_df.shape

((4750, 151), (2000, 151))

In [6]:
# Stack the two datasets row-wise. From here on slo_df holds the rows of both
# frames; re-running this cell alone appends lck_df a second time.
slo_df = concat(objs=[slo_df, lck_df], axis=0)

In [7]:
# Replace the index inherited from the two source frames with a fresh 0..n-1 range.
slo_df = slo_df.reset_index(drop=True)

In [116]:
# List the distinct "major.minor" game versions present in the data.
# slo_df.gameVersion is already a Series, so no Series(...) wrapper is needed,
# and the vectorised .str accessor splits each value once instead of twice.
slo_df.gameVersion.str.split('.').str[:2].str.join('.').unique()

array(['8.4', '8.10', '8.5', '8.3', '8.13', '8.12', '8.11', '8.2', '8.14',
       '8.16', '8.15', '8.1', '8.6'], dtype=object)

In [8]:
# Names of the numeric dtypes used to pick the statistic columns:
# signed integers and floats at 16, 32 and 64 bits.
numerics = [kind + str(bits) for kind in ('int', 'float') for bits in (16, 32, 64)]

In [9]:
# Keep only the columns whose dtype is one of the names listed in `numerics`.
stats_df = slo_df.select_dtypes(include=numerics)

### Removing identifiers

In [10]:
# Drop every column whose name contains 'Id'.
id_columns = [name for name in stats_df.columns if 'Id' in name]
stats_df = stats_df.drop(columns=id_columns)

### Removing useless numeric columns

In [11]:
# champLevel is not used as a feature in the rest of the analysis.
stats_df = stats_df.drop(columns=['champLevel'])

### Normalizing the rest of the stats by time

#### Transform game duration format into minutes

In [12]:
# Add the game length divided by 60.
# NOTE(review): assumes gameDuration is stored in seconds — confirm.
stats_df = stats_df.assign(gameDuration_in_minutes=stats_df['gameDuration'].div(60))

### Exclude columns that aren't affected by time

In [13]:
# Columns to divide by game length: everything except names containing
# '_at_', 'tt' or 'gameDuration'.
# NOTE(review): 'tt' is a plain substring match — confirm it only hits the
# intended columns.
excluded_markers = ('_at_', 'tt', 'gameDuration')
stats_to_normalize = [
    name for name in stats_df.columns
    if not any(marker in name for marker in excluded_markers)
]

In [75]:
# Turn the selected totals into per-minute rates: each row is divided by that
# row's game length in minutes.
minutes_played = stats_df['gameDuration_in_minutes']
stats_normalized_df = stats_df[stats_to_normalize].div(minutes_played, axis='index')

In [76]:
# Columns left out of the per-minute scaling: names containing '_at_' or 'tt'.
# The gameDuration columns match neither marker, so they are not carried over.
unscaled_columns = [
    name for name in stats_df.columns
    if any(marker in name for marker in ('_at_', 'tt'))
]
not_time_affected_stats_df = stats_df[unscaled_columns]

### Now add them again

In [77]:
# Put the unscaled columns back next to the per-minute ones (column-wise join on
# the shared index). Re-running this cell alone adds the columns a second time.
stats_normalized_df = concat(
    objs=[stats_normalized_df, not_time_affected_stats_df],
    axis='columns',
)

### Add champ names transformed to dummies

In [78]:
# One indicator column per distinct value of champ_name.
champ_dummies = slo_df['champ_name'].str.get_dummies()

In [79]:
# Append the champion indicator columns to the feature frame. Re-running this
# cell alone adds them a second time.
stats_normalized_df = concat(
    objs=[stats_normalized_df, champ_dummies],
    axis='columns',
)

### Clustering playstyles by position and feature selection for each cluster in each position

In [81]:
# Distinct position labels, in order of first appearance.
positions = slo_df['position'].drop_duplicates().tolist()

In [82]:
# Per-position results, filled in by the clustering loop: position -> result record.
stats_by_position = dict()

In [87]:
# Cluster playstyles within each position, then, inside every cluster, pick the
# features most related to the 'win' column with three different selectors.
n_clusters = 3               # playstyle clusters fitted per position
n_selected_features = 10     # features kept by SelectKBest and by RFE
importance_threshold = 0.02  # minimum ExtraTrees importance for a feature to be kept
random_state = 0             # fixed seed so cluster ids and importances are reproducible

for p in positions:
    # Preprocessing: select this position's rows by label instead of a fixed
    # stride (so the split does not depend on row order) and drop every column
    # containing a missing value. The explicit copy keeps the column assignment
    # below from writing into a slice of stats_normalized_df.
    stats = stats_normalized_df[slo_df.position == p].dropna(axis=1, how='any').copy()

    # Clustering. The label column is called 'clusters' because the later
    # cells read df.clusters.
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    stats['clusters'] = km.fit_predict(X=stats)
    # One sub-frame per cluster id (not one per row label).
    cluster_dfs = [stats[stats.clusters == c] for c in range(n_clusters)]
    stats_by_position[p] = {'X': stats, 'SelectKBest': [], 'ExtraTreesClassifier_FI': [], 'LogisticRegression_RFE': []}

    for c in cluster_dfs:
        # Features exclude the cluster label; the target is the 'win' column of
        # the same rows. Every support mask below has one entry per column of X,
        # so it is applied to X.columns rather than to a frame that still
        # carries the label column.
        X = c.drop('clusters', axis=1)
        y = slo_df.loc[c.index, 'win']

        # Univariate chi-squared ranking.
        c_new = SelectKBest(chi2, k=n_selected_features).fit(X=X, y=y)
        stats_by_position[p]['SelectKBest'].append(X.columns[c_new.get_support()])

        # Tree-ensemble feature importances above the threshold.
        model = ExtraTreesClassifier(random_state=random_state)
        model.fit(X=X, y=y)
        stats_by_position[p]['ExtraTreesClassifier_FI'].append(X.columns[model.feature_importances_ > importance_threshold])

        # Recursive feature elimination around a logistic regression.
        model2 = LogisticRegression()
        rfe = RFE(model2, n_features_to_select=n_selected_features)
        fit = rfe.fit(X, y)
        stats_by_position[p]['LogisticRegression_RFE'].append(X.columns[fit.support_])

In [93]:
# Refit the k-means model left over from the last loop iteration on that
# iteration's frame. errors='ignore' lets the cell run top-to-bottom, where the
# 'position' column is only added in a later cell.
# NOTE(review): stats still carries the cluster-label column written by the
# clustering loop, so the refit sees the previous labels as a feature — confirm
# that is intended.
fit = km.fit(stats.drop(columns='position', errors='ignore'))

In [98]:
# Transform the same frame with the fitted model. errors='ignore' keeps the cell
# runnable whether or not the 'position' column has been added yet.
tfm = fit.transform(stats.drop(columns='position', errors='ignore'))

In [88]:
# Tag every row of each per-position frame with its position label so the
# frames can be told apart once they are stacked.
for pos in positions:
    position_frame = stats_by_position[pos]['X']
    position_frame['position'] = pos

In [118]:
# Stack the per-position frames back into a single frame.
frames_by_position = [stats_by_position[role]['X'] for role in positions]
df = concat(frames_by_position, axis=0)

In [119]:
# Remove the champion indicator columns; only the statistics are compared below.
df = df.drop(columns=champ_dummies.columns)

In [120]:
# Distinct position labels and cluster ids present in the stacked frame,
# in order of first appearance.
positions = df['position'].drop_duplicates().tolist()
clusters = df['clusters'].drop_duplicates().tolist()

## Calculate the mean of every column for every cluster, keep the features whose cluster means have the highest variance, and then record, for each of those features, the clusters with the highest and second-highest mean

In [129]:
# Number of highest-variance features kept per position when ranking clusters.
n_features = 15

In [148]:
# position -> cluster id -> features on which that cluster ranks first / second.
# Every record gets its own pair of empty lists.
key_features = {}
for position_name in positions:
    slots_by_cluster = {}
    for cluster_id in clusters:
        slots_by_cluster[cluster_id] = {'first': [], 'second': []}
    key_features[position_name] = slots_by_cluster

In [149]:
# For each position: average every numeric stat per cluster, keep the n_features
# stats whose cluster means vary the most, and record which cluster has the
# highest and the second-highest mean on each of them.
for pos in positions:
    pos_df = df[df.position == pos]
    # Start from empty lists so re-running this cell does not append duplicates.
    key_features[pos] = {c: {'first': [], 'second': []} for c in clusters}

    # One row per cluster, one column per stat. numeric_only=True skips the
    # string 'position' column, which cannot be averaged.
    means_df = DataFrame({c: pos_df[pos_df.clusters == c].mean(numeric_only=True) for c in clusters}).T
    # The cluster label itself is not a feature to rank.
    means_df = means_df.drop(columns='clusters', errors='ignore')

    top_features = means_df.var().sort_values(ascending=False).head(n_features)
    for col in top_features.index:
        # Sort once and read both ranks from the same ordering.
        ranked_clusters = means_df[col].sort_values(ascending=False).index
        key_features[pos][ranked_clusters[0]]['first'].append(col)
        if len(ranked_clusters) > 1:
            key_features[pos][ranked_clusters[1]]['second'].append(col)

In [150]:
# Show, per position and cluster, the features on which the cluster ranks first / second.
key_features

{'TOP': {0: {'first': [],
   'second': ['physicalDamageDealt',
    'totalDamageDealt',
    'magicDamageDealt',
    'gold_at_20',
    'gold_at_15',
    'damageSelfMitigated',
    'physicalDamageDealtToChampions',
    'gold_at_10',
    'damageDealtToObjectives',
    'gold_at_5',
    'goldEarned']},
  1: {'first': ['magicDamageDealt',
    'damageSelfMitigated',
    'magicDamageDealtToChampions',
    'totalHeal'],
   'second': ['trueDamageDealt', 'totalDamageDealtToChampions']},
  2: {'first': ['physicalDamageDealt',
    'totalDamageDealt',
    'gold_at_20',
    'gold_at_15',
    'trueDamageDealt',
    'physicalDamageDealtToChampions',
    'gold_at_10',
    'damageDealtToObjectives',
    'totalDamageDealtToChampions',
    'gold_at_5',
    'goldEarned'],
   'second': ['magicDamageDealtToChampions', 'totalHeal']}},
 'JUNG': {0: {'first': ['physicalDamageDealt',
    'totalDamageDealt',
    'gold_at_20',
    'gold_at_15',
    'damageDealtToObjectives',
    'trueDamageDealt',
    'gold_at_10',
