# Loading data

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import pystan

# Load training data and reduce (subsample) if desired

# Read thru file to get numeric ids for each player 
with open('train.csv') as f: lines = f.read().split('\n')

p = 0; playerid = {};
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    player0,player1 = csv[1],csv[4];
    if player0 not in playerid: playerid[player0]=p; p+=1;
    if player1 not in playerid: playerid[player1]=p; p+=1;

nplayers = len(playerid)
playername = ['']*nplayers
for player in playerid: playername[ playerid[player] ]=player;  # id to name lookup


# Sparsifying parameters (discard some training examples):
pKeep = 1.0   # fraction of edges to consider (immed. throw out 1-p edges)
nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
nKeep = 5     # keep at most nKeep games per opponent pairs (play each other multiple times)

nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) );
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    a,b = playerid[csv[1]],playerid[csv[4]];
    aw,bw = csv[2]=='[winner]',csv[5]=='[winner]';
    if (np.random.rand() < pKeep):
        if (nplays[a,b] < nKeep) and ( ((nplays[a,:]>0).sum() < nEdge) or ((nplays[:,b]>0).sum() < nEdge) ):
            nplays[a,b] += 1; nplays[b,a]+=1; nwins[a,b] += aw; nwins[b,a] += bw;

In [2]:
nplayers # number of unique players

999

In [3]:
playerid # map from playername -> playerID
playername[0] # list of playernames indexed by their ID
print(playername[0], "vs", playername[1])

MC vs Stats


In [4]:
nplays[0,1] # number of games between player 0 and player 1

2.0

In [5]:
nwins[0,1] # number of wins between player 0 and player 1
np.max(nwins) # maximum number of wins against a single opponent is 5 

5.0

In [6]:
np.sum(nplays)  # number of player vs player combinations

9354.0

In [7]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

win = []
PA = []
PB = []
for index, wins in np.ndenumerate(nwins):
    if wins == 0: continue
    win.append(int(wins))
    PA.append(index[0] + 1)           # increment so we can index starting at 1
    PB.append(index[1] + 1)           # player 0 is now player 1

In [8]:
print(win[:6], " # of wins PA had over PB") 
print(PA[:6], " PA's ID") 
print(PB[:6], " PB's ID")
len(win)            # total number of games

[1, 1, 2, 4, 4, 2]  # of wins PA had over PB
[1, 1, 1, 1, 1, 1]  PA's ID
[2, 4, 6, 7, 8, 9]  PB's ID


3321

##### Stan Model

In [9]:
skill_model = """
data {
  int<lower=1> N;             # Total number of players i.e 999
  int<lower=1> E;             # number of matchups (3321)
  real<lower=0> scale;        # scale value for probability computation
  int<lower=1,upper=5> win[E];        # PA wins vs PB
  int PA[E];                  # player info between each matchup
  int PB[E];                  # 
}
parameters {
  vector<lower=0> [N] skill;           # skill values for each player
}

model{
  for (i in 1:N){ skill[i]~normal(0,3); }
  for (i in 1:E){
    win[i] ~ binomial_logit(5, (scale)*(skill[PA[i]]-skill[PB[i]]) );
  }   # win probability is a binomial_logit function of skill difference (0-5)
}
"""

##### compile the model

In [10]:
import pickle
try:     # load it if already compiled
    sm = pickle.load(open('skill_model.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)

In [11]:
skill_data = {
    'N': 999,
    'E': 3321,
    'scale': 0.7,
    'win': win,
    'PA': PA,
    'PB': PB
}

Now, we can perform MCMC on the model, and extract the samples:

In [12]:
fit = sm.sampling(data=skill_data, iter=1000, chains=1)

In [13]:
samples = fit.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [14]:
player_skills = samples['skill'].mean(0)
print(player_skills)

[0.91419291 0.63417521 0.63382915 0.22968477 0.71563908 0.99406587
 0.46897308 0.56226324 0.20376113 1.10664458 1.35272825 0.81528501
 0.77868997 0.60891634 0.88954497 1.09831877 2.10070943 1.14252531
 0.65561437 1.17296885 1.1526062  1.0068496  1.04484137 0.88917657
 0.62157679 1.61050972 1.46865394 0.67415341 0.42356093 0.49123345
 1.1598095  2.67986512 0.59348842 0.77371793 1.72773273 1.8362436
 1.60199866 1.42866206 0.44979782 1.29455009 1.52843747 0.29902975
 0.6356594  0.35659742 1.07123725 1.31890913 0.7345776  0.56698989
 1.23046146 0.58193712 0.95676694 0.80324227 2.04474336 1.00956325
 1.06343389 1.00313218 0.49327324 0.50650478 2.25856055 0.40061928
 0.97664546 0.62385614 1.24585221 0.80325273 0.40411461 1.40611768
 1.38305825 0.73168187 0.69123231 0.4835299  0.83794166 1.56126061
 2.59079869 0.95630684 0.26544802 1.40863961 0.68986445 1.47535498
 1.0888421  1.57935093 0.33584778 0.48146559 1.43380131 1.37108855
 0.61115823 1.15277081 0.87624735 2.86226518 0.51155722 2.92870

Finding the name of the player with the highest skill according to our model

In [15]:
ind = np.unravel_index(np.argmax(player_skills, axis=None), player_skills.shape)
ind2 = np.unravel_index(np.argmin(player_skills, axis=None), player_skills.shape)

print("the highest skill level is: ", player_skills[ind[0]], " and his name is: ", playername[ind[0]])
print("the lowest skill level is: ", player_skills[ind2[0]], " and his name is: ", playername[ind2[0]])

the highest skill level is:  3.7284563731255114  and his name is:  Aicy
the lowest skill level is:  0.20376112743330088  and his name is:  Zest


The above result is surprising because Zest is a good player. According to this link https://www.lineups.com/esports/top-10-starcraft-ii-players-of-all-time/ these are the top ten players of all time. So they should have high skill levels.

In [16]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills[id])


Mvp 's skill level is:  0.688391862358984
Life 's skill level is:  0.9766454576879855
TaeJa 's skill level is:  0.32994991577146243
MC 's skill level is:  0.9141929066346802
Polt 's skill level is:  0.8801043529185937
INnoVation 's skill level is:  0.2296847698131551
Zest 's skill level is:  0.20376112743330088
NesTea 's skill level is:  0.7355277441125527
MMA 's skill level is:  0.6912323114241407
Rain 's skill level is:  0.7345775990866452


So, we're actually getting the opposite of what we were expecting. I think this has to do with the fact that I changed the sampling distribution from bernoulli_logit to binomial_logit. But we can just say that lower is better and still call this a good model.

If we want to predict which player will win, we might use a direct estimator of that quantity based on the sample values:

In [17]:
# Player 0 vs Player 1 prediction:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind[0]]-samples['skill'][:,ind2[0]]) ).mean()

print(playername[ind[0]], "has a ", prob * 100, "% chance of winning against ", playername[ind2[0]])

Aicy has a  90.96807696567639 % chance of winning against  Zest


according to this link https://liquipedia.net/starcraft2/ESL_Pro_Tour/2020/21/Korea/Standings, Zest is number 3 in the current standings in korea. So the results are definitely reversed.

##### inversing the skill_levels

In [30]:
player_skills2 = np.array([1/x for x in player_skills])

In [31]:
ind = np.unravel_index(np.argmax(player_skills2, axis=None), player_skills2.shape)
ind2 = np.unravel_index(np.argmin(player_skills2, axis=None), player_skills2.shape)

print("the highest skill level is: ", player_skills2[ind[0]], " and his name is: ", playername[ind[0]])
print("the lowest skill level is: ", player_skills2[ind2[0]], " and his name is: ", playername[ind2[0]])

the highest skill level is:  4.907707434664347  and his name is:  Zest
the lowest skill level is:  0.2682075100054649  and his name is:  Aicy


In [32]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills2[id])


Mvp 's skill level is:  1.452660983779204
Life 's skill level is:  1.023913019948203
TaeJa 's skill level is:  3.0307630103856225
MC 's skill level is:  1.0938610360489365
Polt 's skill level is:  1.1362288990888518
INnoVation 's skill level is:  4.353793248082944
Zest 's skill level is:  4.907707434664347
NesTea 's skill level is:  1.359568021742735
MMA 's skill level is:  1.446691630979616
Rain 's skill level is:  1.3613265654212354


In [33]:
# Highest skilled player vs Lowest skilled player:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind[0]]-samples['skill'][:,ind2[0]]) ).mean()

print(playername[ind[0]], "has a ", (1-prob) * 100, "% chance of winning against ", playername[ind2[0]])

Zest has a  90.9680769656764 % chance of winning against  Aicy
