# Loading data

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import pystan

# Load training data and reduce (subsample) if desired

# Read thru file to get numeric ids for each player 
with open('train.csv') as f: lines = f.read().split('\n')

p = 0; playerid = {};
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    player0,player1 = csv[1],csv[4];
    if player0 not in playerid: playerid[player0]=p; p+=1;
    if player1 not in playerid: playerid[player1]=p; p+=1;

nplayers = len(playerid)
playername = ['']*nplayers
for player in playerid: playername[ playerid[player] ]=player;  # id to name lookup


# Sparsifying parameters (discard some training examples):
pKeep = 1.0   # fraction of edges to consider (immed. throw out 1-p edges)
nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
nKeep = 5     # keep at most nKeep games per opponent pairs (play each other multiple times)

nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) );
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    a,b = playerid[csv[1]],playerid[csv[4]];
    aw,bw = csv[2]=='[winner]',csv[5]=='[winner]';
    if (np.random.rand() < pKeep):
        if (nplays[a,b] < nKeep) and ( ((nplays[a,:]>0).sum() < nEdge) or ((nplays[:,b]>0).sum() < nEdge) ):
            nplays[a,b] += 1; nplays[b,a]+=1; nwins[a,b] += aw; nwins[b,a] += bw;

In [2]:
nplayers # number of unique players

999

In [3]:
playerid # map from playername -> playerID
playername[0] # list of playernames indexed by their ID
print(playername[0], "vs", playername[1])

MC vs Stats


In [4]:
nplays[0,1] # number of games between player 0 and player 1

2.0

In [5]:
nwins[0,1] # number of wins between player 0 and player 1
np.max(nwins) # maximum number of wins against a single opponent is 5 

5.0

In [6]:
np.sum(nplays)  # number of player vs player combinations

9354.0

In [7]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

win = []
PA = []
PB = []
for index, wins in np.ndenumerate(nwins):
    if wins == 0: continue
    win.append(int(wins))
    PA.append(index[0] + 1)           # increment so we can index starting at 1
    PB.append(index[1] + 1)           # player 0 is now player 1

In [8]:
print(win[:6], " # of wins PA had over PB") 
print(PA[:6], " PA's ID") 
print(PB[:6], " PB's ID")
len(win)            # total number of games

[1, 1, 2, 4, 4, 2]  # of wins PA had over PB
[1, 1, 1, 1, 1, 1]  PA's ID
[2, 4, 6, 7, 8, 9]  PB's ID


3321

##### Stan Model

In [9]:
skill_model = """
data {
  int<lower=1> N;             # Total number of players i.e 999
  int<lower=1> E;             # number of matchups (3321)
  real<lower=0> scale;        # scale value for probability computation
  int<lower=1,upper=5> win[E];        # PA wins vs PB
  int PA[E];                  # player info between each matchup
  int PB[E];                  # 
}
parameters {
  vector<lower=0> [N] skill;           # skill values for each player
}

model{
  for (i in 1:N){ skill[i]~normal(0,3); }
  for (i in 1:E){
    win[i] ~ binomial_logit(5, (scale)*(skill[PA[i]]-skill[PB[i]]) );
  }   # win probability is a binomial_logit function of skill difference (0-5)
}
"""

##### compile the model

In [10]:
import pickle
try:     # load it if already compiled
    sm = pickle.load(open('skill_model.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)

In [11]:
skill_data = {
    'N': 999,
    'E': 3321,
    'scale': 0.7,
    'win': win,
    'PA': PA,
    'PB': PB
}

Now, we can perform MCMC on the model, and extract the samples:

In [12]:
fit = sm.sampling(data=skill_data, iter=1000, chains=1)

In [13]:
samples = fit.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [14]:
player_skills = samples['skill'].mean(0)
print(player_skills)

[0.91428881 0.66166706 0.65011361 0.23719789 0.72324177 1.03624712
 0.48828846 0.57003828 0.2095546  1.12788344 1.36136145 0.83321201
 0.73812174 0.60463827 0.91395862 1.11742457 2.06339272 1.1139714
 0.64955724 1.13809037 1.08463193 1.0167962  1.06807296 0.86021487
 0.6289638  1.68661861 1.47295307 0.66806272 0.41333529 0.48540353
 1.17063245 2.68654584 0.61823379 0.77129216 1.67687797 1.82156401
 1.52125023 1.41740163 0.44767348 1.26455552 1.70813237 0.2948312
 0.62530153 0.35409357 1.07051197 1.26754947 0.75006341 0.56936706
 1.19999764 0.58104567 1.03219949 0.72674148 1.97873693 0.93441537
 1.1502813  1.00872222 0.46306875 0.48840096 2.21267243 0.40323152
 0.98320891 0.58946957 1.24477344 0.82170495 0.41588952 1.42490498
 1.38192838 0.74596556 0.67936435 0.44836139 0.83529445 1.50088247
 2.62065323 0.95038384 0.2781706  1.50482576 0.6775183  1.47893371
 1.07115666 1.59105469 0.31631392 0.51334893 1.55906836 1.41057924
 0.62308602 1.05856219 0.86705001 2.89076374 0.5048907  2.893054

Finding the name of the player with the highest skill according to our model

In [15]:
ind = np.unravel_index(np.argmax(player_skills, axis=None), player_skills.shape)
ind2 = np.unravel_index(np.argmin(player_skills, axis=None), player_skills.shape)

print("the highest skill level is: ", player_skills[ind[0]], " and his name is: ", playername[ind[0]])
print("the lowest skill level is: ", player_skills[ind2[0]], " and his name is: ", playername[ind2[0]])

the highest skill level is:  3.654525813861585  and his name is:  Aicy
the lowest skill level is:  0.2095546042860845  and his name is:  Zest


The above result is surprising because Zest is a good player. According to this link https://www.lineups.com/esports/top-10-starcraft-ii-players-of-all-time/ these are the top ten players of all time. So they should have high skill levels.

In [16]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills[id])


Mvp 's skill level is:  0.6895708746052462
Life 's skill level is:  0.98320891089165
TaeJa 's skill level is:  0.33001446916833665
MC 's skill level is:  0.9142888124626701
Polt 's skill level is:  0.8857644514142963
INnoVation 's skill level is:  0.23719789051269077
Zest 's skill level is:  0.2095546042860845
NesTea 's skill level is:  0.7628592283679246
MMA 's skill level is:  0.6793643488614844
Rain 's skill level is:  0.7500634057313884


So, we're actually getting the opposite of what we were expecting. I think this has to do with the fact that I changed the sampling distribution from bernoulli_logit to binomial_logit. But we can just say that lower is better and still call this a good model.

If we want to predict which player will win, we might use a direct estimator of that quantity based on the sample values:

In [17]:
# Player 0 vs Player 1 prediction:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind[0]]-samples['skill'][:,ind2[0]]) ).mean()

print(playername[ind[0]], "has a ", prob * 100, "% chance of winning against ", playername[ind2[0]])

Aicy has a  90.41387660549074 % chance of winning against  Zest


according to this link https://liquipedia.net/starcraft2/ESL_Pro_Tour/2020/21/Korea/Standings, Zest is number 3 in the current standings in korea. So the results are definitely reversed.

##### Reversing the win data

In [18]:
skill_data2 = {
    'N': 999,
    'E': 3321,
    'scale': 0.7,
    'win': win,
    'PA': PB,
    'PB': PA
}

Now, we can perform MCMC on the model, and extract the samples:

In [19]:
fit2 = sm.sampling(data=skill_data2, iter=1000, chains=1)

In [20]:
samples2 = fit2.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [21]:
player_skills2 = samples2['skill'].mean(0)
print(player_skills2)

[2.60856876 2.8347812  2.93124144 3.36529855 2.71092659 2.47516074
 2.97994655 3.09279465 3.87906952 2.35325697 2.08018965 2.75046251
 2.75458088 3.2025858  2.51636828 2.32041068 1.33290376 2.31702872
 3.11327921 2.3405467  2.32697661 2.47382726 2.36926393 2.53598366
 2.92676994 1.65911433 2.00949254 3.11693242 3.11758946 3.11128475
 2.24655264 0.9907846  3.01346482 2.73233984 1.64402017 1.56761818
 1.8776061  1.97729394 3.16064674 2.0376047  1.7258784  3.54438965
 2.91087564 3.43656117 2.35649064 2.21846743 2.73372839 2.93762081
 2.29508657 3.01186554 2.46194898 2.7085599  1.35307538 2.5559534
 2.33133032 2.50538478 3.28581042 3.26105984 1.40135292 3.54609849
 2.56029199 3.2354816  2.26352714 2.82336535 3.27241845 1.92265857
 2.01813372 2.74039559 2.81314332 3.00519716 2.79036926 1.92114423
 0.99232156 2.54066173 3.35453538 1.95342479 2.75514402 1.9089783
 2.4187718  1.65407824 3.21440545 2.94474095 1.8755992  2.05542682
 3.09075923 2.24959369 2.62387134 0.89653481 3.47682618 0.838677

Finding the name of the player with the highest skill according to our model

In [22]:
ind3 = np.unravel_index(np.argmax(player_skills2, axis=None), player_skills2.shape)
ind4 = np.unravel_index(np.argmin(player_skills2, axis=None), player_skills2.shape)

print("the highest skill level is: ", player_skills2[ind3[0]], " and his name is: ", playername[ind3[0]])
print("the lowest skill level is: ", player_skills2[ind4[0]], " and his name is: ", playername[ind4[0]])

the highest skill level is:  4.974167992108901  and his name is:  ZerO
the lowest skill level is:  0.5377336623143122  and his name is:  Aicy


In [23]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills2[id])


Mvp 's skill level is:  2.848512200271066
Life 's skill level is:  2.560291985517161
TaeJa 's skill level is:  3.3454067143388984
MC 's skill level is:  2.608568759963169
Polt 's skill level is:  2.627995411671184
INnoVation 's skill level is:  3.3652985480795587
Zest 's skill level is:  3.879069520087811
NesTea 's skill level is:  2.7771657434189874
MMA 's skill level is:  2.813143323965182
Rain 's skill level is:  2.733728392759038


In [27]:
#highest skilled player vs lowest skilled player prediction:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind4[0]]-samples['skill'][:,ind3[0]]) ).mean()

print(playername[ind3[0]], "has a ", prob * 100, "% chance of winning against ", playername[ind4[0]])

ZerO has a  89.7801450657211 % chance of winning against  Aicy
