In [1]:
# VarianceThreshold is imported from the public `sklearn.feature_selection`
# package; the private `variance_threshold` submodule path is not importable
# on newer scikit-learn releases.
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from pandas import read_excel, concat, DataFrame, Series
from numpy import mean, std
from sklearn.decomposition import PCA

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings('ignore')

In [2]:
slo_df = read_excel('data/slo_dataset.xlsx')

In [3]:
pro_df = read_excel('data/pro_leagues_dataset.xlsx')

In [4]:
pro_df.shape, slo_df.shape

((20900, 155), (2000, 155))

In [5]:
all_df = concat([slo_df, pro_df])

In [6]:
# all_df.drop(['season', 'split', 'team_name'], axis=1, inplace=True)

In [7]:
# The concatenated frame keeps the row labels of its two source frames;
# replace them with a fresh 0..n-1 index and discard the old labels.
all_df = all_df.reset_index(drop=True)

In [8]:
def _major_minor(version):
    """Return the first two dot-separated components of a version string, joined by '.'."""
    parts = version.split('.')
    return parts[0] + '.' + parts[1]


# Number of distinct "major.minor" prefixes present in gameVersion.
len(all_df.gameVersion.apply(_major_minor).unique())

16

In [9]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [10]:
# Keep only the numeric columns. The explicit copy makes stats_df an
# independent frame: the following cells drop and add columns on it in place,
# and (with warnings silenced) a write into a derived slice would go unnoticed.
stats_df = all_df.select_dtypes(include=numerics).copy()

### Removing identifiers

In [11]:
# Every column whose name contains 'Id' is treated as an identifier and removed.
id_columns = [name for name in stats_df.columns if 'Id' in name]
stats_df.drop(id_columns, axis=1, inplace=True)

### Removing useless numeric columns

In [12]:
stats_df.drop(['champLevel'], axis=1, inplace=True)

### Normalizing the rest of the stats by time

#### Transform game duration format into minutes

In [13]:
stats_df['gameDuration_in_minutes'] = stats_df.gameDuration / 60

### Exclude columns that aren't affected by time

In [14]:
# Columns to divide by game length: everything except names containing
# '_at_', names containing 'tt', and the gameDuration columns themselves.
_excluded_markers = ('_at_', 'tt', 'gameDuration')
stats_to_normalize = [
    name for name in stats_df.columns
    if not any(marker in name for marker in _excluded_markers)
]

In [15]:
# Turn every selected stat into a per-minute rate: each row is divided by that
# row's game length in minutes (axis=0 aligns the divisor on the row index).
stats_normalized_df = stats_df[stats_to_normalize].div(
    stats_df.gameDuration_in_minutes, axis=0
)

In [16]:
# Columns left un-normalized: names containing '_at_' or 'tt'.
time_independent_columns = [
    name for name in stats_df.columns
    if any(marker in name for marker in ('_at_', 'tt'))
]
not_time_affected_stats_df = stats_df[time_independent_columns]

### Now add them again

In [17]:
stats_normalized_df = concat([stats_normalized_df, not_time_affected_stats_df], axis=1)

### Add champ names transformed to dummies

In [18]:
champ_dummies = all_df.champ_name.str.get_dummies()

In [19]:
stats_normalized_df = concat([stats_normalized_df, champ_dummies], axis=1)

### Clustering playstyles by position and feature selection for each cluster in each position

In [20]:
positions = all_df.position.unique().tolist()

In [21]:
stats_by_position = {}

In [22]:
# Cluster the rows of each position into playstyles with K-Means.
n_clusters = 3
for p in positions:
    # Pick the rows by their actual position label. The previous `[i::5]`
    # stride was only correct while every block of five rows listed the
    # positions in one fixed order that also matched the order of `positions`.
    position_mask = all_df.position == p

    # Drop every column that has a missing value for this position so the
    # matrix handed to KMeans contains no NaN. The trailing copy makes `stats`
    # an independent frame, so adding 'cluster' below does not write into a
    # slice of stats_normalized_df.
    stats = stats_normalized_df[position_mask].dropna(axis=1).copy()

    # Fixed random_state so cluster ids are reproducible between runs.
    km = KMeans(n_clusters=n_clusters, random_state=42)
    stats['cluster'] = km.fit(X=stats).predict(X=stats)

    # 'X' holds the clustered rows of this position. The three lists are
    # placeholders for per-cluster feature-selection results and stay empty.
    stats_by_position[p] = {
        'X': stats,
        'SelectKBest': [],
        'ExtraTreesClassifier_FI': [],
        'LogisticRegression_RFE': [],
    }

In [23]:
# Label every row of each position's clustered frame with that position,
# so the frames can be told apart after they are concatenated.
for pos in positions:
    clustered = stats_by_position[pos]['X']
    clustered['position'] = pos

In [24]:
df = concat([stats_by_position[pos]['X'] for pos in positions])

In [25]:
df.drop(champ_dummies.columns, axis=1, inplace=True)

In [26]:
positions = df.position.unique().tolist()
clusters = df.cluster.unique().tolist()

## Calculate the mean of every column for every cluster, keep the features whose cluster means have the highest variance, and then rank the clusters by their mean on each of those features

In [27]:
n_features = 15

In [28]:
key_features = {p: {c: {'first': [], 'second': [], 'third': []} for c in clusters} for p in positions}

In [29]:
# For every position: average each stat per cluster, keep the n_features stats
# whose cluster means differ the most (highest variance across clusters), and
# record for each of them which cluster ranks first, second and third.
rank_names = ['first', 'second', 'third']
for pos in positions:
    pos_df = df[df.position == pos]
    # Only real stats take part in the ranking: 'position' is a string column
    # (a mean over it is meaningless and newer pandas refuses to compute it),
    # and 'cluster' is the label itself, whose per-cluster mean is just its id.
    pos_stats = pos_df.drop(['position', 'cluster'], axis=1)
    # One row per cluster id, one column per stat.
    means_df = concat([
        DataFrame(pos_stats[pos_df.cluster == c].mean(), columns=[c]).T
        for c in clusters
    ])
    top_features = means_df.var().sort_values(ascending=False).head(n_features)
    for col in top_features.keys():
        # Sort once; the index lists cluster ids from highest to lowest mean.
        ranked_clusters = means_df[col].sort_values(ascending=False).index
        for rank_name, cluster_id in zip(rank_names, ranked_clusters):
            key_features[pos][cluster_id][rank_name].append(col)

### Calculate the weights for each feature in each cluster

In [30]:
# For 3 clusters
def set_f_weights(f_lens, clusters):
    """Return the weights for the 'first', 'second' and 'third' feature lists.

    Weight is shared only among the ranks that actually contain features:

    * three populated ranks -> [.6, .3, .1]
    * two populated ranks   -> .7 for the earlier one, .3 for the later one
    * one populated rank    -> 1 for that rank
    * no populated rank     -> [0, 0, 0]

    Parameters
    ----------
    f_lens : sequence of int
        Number of features in the 'first', 'second' and 'third' lists, in that
        order. Only the first three entries are considered.
    clusters : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    list
        Three weights, positionally matching ``f_lens``.
    """
    # Test each length directly. The earlier "length + position" set trick
    # collided whenever a length plus its position equalled another position
    # (e.g. [1, 1, 0] lost its second rank) and raised IndexError when every
    # list was empty.
    populated = [idx for idx, length in enumerate(f_lens[:3]) if length > 0]

    f_weights = [0, 0, 0]
    if len(populated) == 3:
        f_weights = [.6, .3, .1]
    elif len(populated) == 2:
        f_weights[populated[0]] = .7
        f_weights[populated[1]] = .3
    elif len(populated) == 1:
        f_weights[populated[0]] = 1

    return f_weights

In [31]:
# Store, for every (position, cluster) entry, the rank weights derived from how
# many features landed in its 'first', 'second' and 'third' lists.
for position_entry in key_features.values():
    for cluster_entry in position_entry.values():
        rank_lengths = [
            len(cluster_entry[rank_name])
            for rank_name in ('first', 'second', 'third')
        ]
        cluster_entry['f_weights'] = set_f_weights(rank_lengths, clusters)

In [32]:
# Compute one weighted score per rank ('first', 'second', 'third') for every
# (position, cluster) entry and store them under 'scores'.
for p in positions:
    # Rows of this position; filtered once instead of once per cluster and rank.
    # NOTE(review): the rows are not restricted to cluster `c`, so each score is
    # a position-wide average of the listed features - confirm this is intended.
    position_rows = df[df.position == p]
    for c in clusters:
        entry = key_features[p][c]
        entry['scores'] = []
        for i, r in enumerate(['first', 'second', 'third']):
            features = entry[r]
            weight = entry['f_weights'][i]
            if features:
                # MinMaxScaler needs a 2-D input and rescales each column
                # independently, so scale the whole feature block in one call
                # rather than applying it to one 1-D column at a time. The
                # result goes into a new frame, so nothing is written back into
                # a filtered slice of df.
                scaled = DataFrame(
                    MinMaxScaler().fit_transform(position_rows[features]),
                    columns=features,
                )
                # Mean over rows per feature, then mean over features.
                score = scaled.mean().mean() * weight
                entry['scores'].append(score)
            else:
                # No features in this rank: it contributes nothing.
                entry['scores'].append(0)

In [33]:
key_features

{'ADC': {0: {'f_weights': [0, 0.7, 0.3],
   'first': [],
   'scores': [0, 0.2106990669018149, 0.039958257907983734],
   'second': ['physicalDamageDealt',
    'totalDamageDealt',
    'physicalDamageDealtToChampions',
    'gold_at_15',
    'damageDealtToObjectives',
    'totalDamageDealtToChampions',
    'gold_at_10',
    'damageDealtToTurrets',
    'gold_at_5',
    'perk1Var2'],
   'third': ['magicDamageDealt',
    'magicDamageDealtToChampions',
    'trueDamageDealt',
    'totalHeal',
    'totalDamageTaken']},
  1: {'f_weights': [0.7, 0.3, 0],
   'first': ['physicalDamageDealt',
    'totalDamageDealt',
    'physicalDamageDealtToChampions',
    'gold_at_15',
    'damageDealtToObjectives',
    'trueDamageDealt',
    'totalDamageDealtToChampions',
    'gold_at_10',
    'damageDealtToTurrets',
    'gold_at_5',
    'perk1Var2'],
   'scores': [0.19400211655269206, 0.04705147098553716, 0],
   'second': ['magicDamageDealt',
    'magicDamageDealtToChampions',
    'totalHeal',
    'totalDamageTak

## Analyse clusters

In [34]:
# Convert the selected stats of all_df into per-minute rates, treating missing
# values as 0 first.
# NOTE: this overwrites the columns in all_df, so running the cell a second
# time divides by the game length again.
game_minutes = all_df.gameDuration / 60
all_df[stats_to_normalize] = (
    all_df[stats_to_normalize].fillna(0).div(game_minutes, axis=0)
)

In [35]:
mms = MinMaxScaler()

In [36]:
# Rescale every selected stat to [0, 1]. MinMaxScaler needs a 2-D input and
# scales each column independently, so pass the whole block at once instead of
# applying it to one 1-D column at a time.
all_df[stats_to_normalize] = mms.fit_transform(all_df[stats_to_normalize])

In [37]:
# Average every numeric column per player. all_df also holds text columns
# (e.g. position, champ_name); numeric_only=True leaves them out explicitly
# instead of relying on pandas to drop them silently, which newer versions
# no longer do.
player_means_df = all_df.groupby('player_name').mean(numeric_only=True)

In [38]:
player_positions = all_df.groupby('player_name').first()['position']

In [39]:
player_means_df['position'] = player_positions

In [40]:
# df.cluster is labelled by game row while player_means_df is labelled by
# player_name, so assigning it directly aligns on non-matching labels and
# leaves the column empty. Aggregate per player first: take the cluster id
# each player was assigned most often.
# NOTE(review): cluster ids are produced separately for each position, so for a
# player who appears in several positions the ids are not comparable - confirm
# the intended aggregation rule.
player_means_df['cluster'] = (
    df.cluster
    .groupby(all_df.player_name)
    .agg(lambda cluster_ids: cluster_ids.mode().iloc[0])
)

In [41]:
# Print the 'first' and 'second' feature lists of every (position, cluster).
for p, by_cluster in key_features.items():
    for c, entry in by_cluster.items():
        for r, feature_names in entry.items():
            if r in ('first', 'second'):
                print(p, c, feature_names)

TOP 0 ['damageSelfMitigated', 'totalDamageTaken']
TOP 0 ['physicalDamageDealt', 'magicDamageDealt', 'physicalDamageDealtToChampions', 'magicDamageDealtToChampions', 'totalHeal']
TOP 1 ['physicalDamageDealt', 'gold_at_15', 'physicalDamageDealtToChampions', 'damageDealtToObjectives', 'gold_at_10', 'totalDamageDealtToChampions', 'damageDealtToTurrets', 'goldEarned']
TOP 1 ['totalDamageDealt', 'trueDamageDealt']
TOP 2 ['totalDamageDealt', 'trueDamageDealt', 'magicDamageDealt', 'magicDamageDealtToChampions', 'totalHeal']
TOP 2 ['damageSelfMitigated', 'gold_at_15', 'damageDealtToObjectives', 'gold_at_10', 'totalDamageDealtToChampions', 'damageDealtToTurrets', 'totalDamageTaken', 'goldEarned']
JUNG 0 ['physicalDamageDealt', 'totalDamageDealt', 'gold_at_15', 'damageDealtToObjectives', 'trueDamageDealt', 'gold_at_10', 'physicalDamageDealtToChampions', 'totalDamageDealtToChampions', 'gold_at_5', 'damageDealtToTurrets']
JUNG 0 ['totalHeal']
JUNG 1 ['damageSelfMitigated']
JUNG 1 ['physicalDamageDe

In [42]:
all_df['cluster'] = df.cluster

In [60]:
# Ten most frequent champions among TOP rows assigned to cluster 2.
top_cluster_rows = all_df[(all_df.cluster == 2) & (all_df.position == 'TOP')]
champion_counts = top_cluster_rows.champ_name.value_counts().head(10)
for key, value in champion_counts.items():
    print('{} ({})'.format(key, value))

Ornn (95)
Vladimir (81)
Sion (37)
Singed (25)
Ryze (21)
Cho'Gath (21)
Shen (20)
Dr. Mundo (13)
Maokai (13)
Swain (12)


In [45]:
pca = PCA(n_components=2, random_state=42)

In [46]:
stats_normalized_df.fillna(0, inplace=True)

In [47]:
df_pca = DataFrame(pca.fit_transform(stats_normalized_df.drop([col for col in stats_normalized_df.columns if 'tt' in col], axis=1)))

In [48]:
df_pca['cluster'] = all_df.cluster

In [49]:
df_pca['position'] = all_df.position

In [50]:
df_pca.to_excel('slo_pro_leagues_clusters.xlsx')

In [51]:
all_df.shape

(22900, 156)