In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from bidict import bidict
from copy import copy
from itertools import combinations, permutations
from collections import Counter
from math import log2
from tqdm import tqdm

In [2]:
train = pd.read_csv('data/train_features.csv', index_col='match_id_hash')
target = pd.read_csv('data/train_targets.csv', index_col='match_id_hash')['radiant_win']

In [3]:
train['game_mode'] = (train['game_mode'] == 22)

In [4]:
X_train, X1, y_train, y1 = train_test_split(train, target, test_size=0.4, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X1, y1, test_size=0.5, random_state=42)

In [5]:
attributes = ["kills", "deaths", "assists", "lh", "denies", "gold", "xp", "level", "health", "max_health", "max_mana"]
radiant_attributes = [f"r{i}_{j}" for i in range(1,6) for j in attributes]
dire_attributes = [f"d{i}_{j}" for i in range(1,6) for j in attributes]
game_attributes = ['game_time', 'objectives_len', 'chat_len']

In [6]:
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_valid_scaled = X_valid.copy()

In [7]:
scale = StandardScaler()

In [8]:
X_train_scaled[
    game_attributes + radiant_attributes + dire_attributes
] = scale.fit_transform(X_train[game_attributes + radiant_attributes + dire_attributes])
X_test_scaled[
    game_attributes + radiant_attributes + dire_attributes
] = scale.transform(X_test[game_attributes + radiant_attributes + dire_attributes])
X_valid_scaled[
    game_attributes + radiant_attributes + dire_attributes
] = scale.transform(X_valid[game_attributes + radiant_attributes + dire_attributes])

In [9]:
def get_hero_dummies(heroes):
    n = heroes.shape[1]
    return sum(pd.get_dummies(heroes.iloc[:,i]) for i in range(n))
    
    
class HeroBinarizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        heroes_binarized_r = sum(pd.get_dummies(X[f"r{i}_hero_id"]) for i in range(1,6))
        heroes_binarized_d = sum(pd.get_dummies(X[f"d{i}_hero_id"]) for i in range(1,6))
        X1 = pd.concat([X, heroes_binarized_r, heroes_binarized_d], axis = 1)
        X1 = X1.drop([f"r{i}_hero_id" for i in range(1,6)] + [f"d{i}_hero_id" for i in range(1,6)], axis=1)
        return X1


class DummiesGetter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_dummies = pd.concat([X] + [pd.get_dummies(X[c]) for c in self.columns], axis=1)
        return X_dummies.drop(self.columns, axis=1)


class HeroRemapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = [f"r{i}_hero_id" for i in range(1,6)] + [f"d{i}_hero_id" for i in range(1,6)]

    def fit(self, X, y=None):
        hero_idx = np.sort(X[self.columns[0]].unique())
        self.hero_idx_map = bidict(zip(hero_idx, range(len(hero_idx))))
        return self

    def transform(self, X, y=None):
        X_mapped = copy(X)
        for c in self.columns:
            X_mapped[c] = X_mapped[c].map(self.hero_idx_map)
        return X_mapped

In [10]:
def triplets_to_adjacency(edges):
    rows, cols, vals = edges.T
    adj = np.zeros([int(np.max(rows))+1, int(np.max(cols))+1])
    adj[rows.astype(int), cols.astype(int)] = vals
    return adj


def get_adjacency_matrix(df):
    pair_list = []
    for i, row in df.iterrows():
        pair_list.extend(permutations(row, 2))
    counter = Counter(pair_list)
    edges = np.array([(*key, value) for key, value in counter.items()])
    return triplets_to_adjacency(edges)


def context_matrix(data):
    def pmi(i, j, n_ij):
        n_i = n_counts[i]
        n_j = n_counts[j]
        return log2((n*n_ij)/(n_i*n_j))

    n = data.shape[0]
    n_counts = get_hero_dummies(data).sum().to_dict()
    
    mate_pair_list = []
    enemy_pair_list = []
    for i, row in data.iterrows():
        radiant, dire = row[:5], row[5:]
        # add all pairs of teammates
        mate_pair_list.extend(permutations(radiant, 2))
        mate_pair_list.extend(permutations(dire, 2))
        # add all pairs of enemies
        for h in radiant:
            enemy_pair_list.extend([(h, e) for e in dire])
        for h in dire:
            enemy_pair_list.extend([(h, e) for e in radiant])

    mate_counter = Counter(mate_pair_list)
    enemy_counter = Counter(enemy_pair_list)
    
    mate_edges = np.array([(*key, pmi(*key, value)) for key, value in mate_counter.items()])
    mate_adj = triplets_to_adjacency(mate_edges)
    enemy_edges = np.array([(*key, pmi(*key, value*4/5)) for key, value in enemy_counter.items()])
    enemy_adj = triplets_to_adjacency(enemy_edges)

    return np.hstack((mate_adj, enemy_adj))

    
class HeroFactorizer(BaseEstimator, TransformerMixin):
    def __init__(self, n_factors=10):
        self.n_factors = n_factors
        self.columns = [f"r{i}_hero_id" for i in range(1,6)] + [f"d{i}_hero_id" for i in range(1,6)]

    def fit(self, X, y=None):
        heroes = X[self.columns]
        self.context = context_matrix(heroes)
        u, sigma, vt = np.linalg.svd(self.context)

        self.embeddings = StandardScaler().fit_transform(u[:, :self.n_factors])
        return self

    def transform(self, X, y=None):
        radiant = np.zeros((X.shape[0], self.n_factors))
        for radiant_hero in self.columns[:5]:
            radiant += self.embeddings[X[radiant_hero], :]
            
        dire = np.zeros((X.shape[0], self.n_factors))
        for dire_hero in self.columns[:5]:
            dire += self.embeddings[X[dire_hero], :]
        
        radiant_df = pd.DataFrame(radiant, columns=[f'r_feat_{i}' for i in range(self.n_factors)], index=X.index)
        dire_df = pd.DataFrame(dire, columns=[f'd_feat_{i}' for i in range(self.n_factors)], index=X.index)
        return pd.concat((X, radiant_df, dire_df), axis=1).drop(self.columns, axis=1)

In [11]:
def shuffle_df(data, vars, n_shuffles=10):
    data_shuffle = [data.copy() for i in range(n_shuffles)]
    perm = list(permutations(range(1,6)))
    perm_idx = np.random.choice(len(perm), size=n_shuffles).tolist()
    for s, idx in enumerate((perm[i] for i in perm_idx)):
        for var in vars:
            data_shuffle[s][[f'r{i}_{var}' for i in range(1,6)]] = data_shuffle[s][[f'r{i}_{var}' for i in idx]]
            data_shuffle[s][[f'd{i}_{var}' for i in range(1,6)]] = data_shuffle[s][[f'd{i}_{var}' for i in idx]]
    
    return pd.concat([data] + data_shuffle, axis=0)

In [12]:
vars = ['hero_id', 'kills', 'deaths', 'assists', 'denies',
       'gold', 'lh', 'xp', 'health', 'max_health',
       'max_mana', 'level', 'x', 'y', 'stuns',
       'creeps_stacked', 'camps_stacked', 'rune_pickups',
       'firstblood_claimed', 'teamfight_participation',
       'towers_killed', 'roshans_killed', 'obs_placed',
       'sen_placed']

In [13]:
default_ppl = Pipeline([
    ("dum", DummiesGetter(['lobby_type'])),
    ("bin", HeroBinarizer())
])

X_train_default = default_ppl.fit_transform(X_train_scaled)
X_train_default_shuffle = default_ppl.fit_transform(shuffle_df(X_train_scaled, vars=vars, n_shuffles=10))
X_valid_default = default_ppl.transform(X_valid_scaled)
X_test_default = default_ppl.transform(X_test_scaled)

In [14]:
factor_ppl = Pipeline([
    ("map", HeroRemapper()),
    ("dum", DummiesGetter(['lobby_type'])),
    ("fact", HeroFactorizer(n_factors=20))
])

X_train_factor = factor_ppl.fit_transform(X_train_scaled)
X_train_factor_shuffle = factor_ppl.fit_transform(shuffle_df(X_train_scaled, vars=vars, n_shuffles=10))
X_valid_factor = factor_ppl.transform(X_valid_scaled)
X_test_factor = factor_ppl.transform(X_test_scaled)

In [33]:
def stack_y(y, n_shuffles=10):
    return pd.concat([y for _ in range(n_shuffles+1)])

# Models

### Random Forest

In [16]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train_default, y_train)
y_valid_predicted = clf.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7794928077535013


In [17]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train_factor, y_train)
y_valid_predicted = clf.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7714842242767577


In [36]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train_default_shuffle, stack_y(y_train, 10))
y_valid_predicted = clf.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7855574573634092


In [39]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train_factor_shuffle, stack_y(y_train, 10))
y_valid_predicted = clf.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7760828292007322


### Gradient Boosting

In [40]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=0)
gbc.fit(X_train_default, y_train)
y_valid_predicted = gbc.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7841882894196972




In [41]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=0)
gbc.fit(X_train_factor, y_train)
y_valid_predicted = gbc.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7789410273735534




In [42]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=0)
gbc.fit(X_train_default_shuffle, stack_y(y_train, 10))
y_valid_predicted = gbc.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7879724356856861


In [43]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=0)
gbc.fit(X_train_factor_shuffle, stack_y(y_train, 10))
y_valid_predicted = gbc.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7833206329187106




### Logistic Regression

In [45]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train_default, y_train)
y_valid_predicted = lr.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))

0.7984161496299009


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train_default_shuffle, stack_y(y_train, 10))
y_valid_predicted = lr.predict_proba(X_valid_default)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.802056524825278


In [47]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train_factor, y_train)
y_valid_predicted = lr.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))



0.7918955139605657


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train_factor_shuffle, stack_y(y_train, 10))
y_valid_predicted = lr.predict_proba(X_valid_factor)
print(roc_auc_score(y_valid, y_valid_predicted[:, 1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7920784432280006
