# Loading data

In [267]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import pystan

# Load training data and reduce (subsample) if desired

# First pass over the raw text: give every distinct player name a dense integer id.
with open('train.csv') as train_file:
    lines = train_file.read().split('\n')

playerid = {}   # player name -> integer id, numbered in order of first appearance
for line in lines:
    fields = line.split(',')
    if len(fields) != 10:
        continue   # blank line, or a row that did not split into exactly 10 fields
    # Fields 1 and 4 hold the two player names; setdefault leaves already-known names untouched.
    for name in (fields[1], fields[4]):
        playerid.setdefault(name, len(playerid))

nplayers = len(playerid)
# Ids run 0..nplayers-1 without gaps, so ordering the names by id yields the id -> name lookup.
playername = sorted(playerid, key=playerid.get)


# Sparsifying parameters (discard some training examples):
pKeep = 1.0   # fraction of games to consider (each game is dropped up front with probability 1-pKeep)
nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
nKeep = 5     # keep at most nKeep games per pair of opponents (pairs may meet multiple times)

# Dedicated, seeded generator: the random subsample is identical on every run and does not
# depend on whatever has already consumed numpy's global RNG. With pKeep = 1.0 every game
# passes the random test, so the seed only matters once pKeep is lowered.
SUBSAMPLE_SEED = 0
subsample_rng = np.random.RandomState(SUBSAMPLE_SEED)

# nplays[a,b]: number of kept games between a and b (both [a,b] and [b,a] are incremented per game)
# nwins[a,b]:  number of those kept games in which a was marked '[winner]' against b
nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) )
for line in lines:
    fields = line.split(',')
    if len(fields) != 10:
        continue   # parse error or blank line
    a, b = playerid[fields[1]], playerid[fields[4]]
    aw, bw = fields[2] == '[winner]', fields[5] == '[winner]'
    if subsample_rng.rand() >= pKeep:
        continue   # game discarded by the random subsample
    under_game_cap = nplays[a,b] < nKeep
    # Accept the game only while at least one side still has fewer than nEdge distinct opponents.
    wants_opponent = ((nplays[a,:] > 0).sum() < nEdge) or ((nplays[:,b] > 0).sum() < nEdge)
    if under_game_cap and wants_opponent:
        nplays[a,b] += 1; nplays[b,a] += 1
        nwins[a,b] += aw; nwins[b,a] += bw

In [268]:
nplayers # number of unique player names found while scanning train.csv

999

In [269]:
# Neither bare expression below is echoed: a cell only displays the value of its last
# expression, and the last statement here is the print call.
playerid # map from playername -> playerID
playername[0] # list of playernames indexed by their ID
print(playername[0], "vs", playername[1])   # names of the players holding ids 0 and 1

MC vs Stats


In [270]:
nplays[0,1] # number of kept games between player 0 and player 1 (after the nKeep / nEdge filtering)

2.0

In [271]:
nwins[0,1] # wins of player 0 over player 1 (not echoed: only the last expression of a cell is displayed)
np.max(nwins) # largest win count of any player over a single opponent; bounded by nKeep = 5

5.0

In [272]:
np.sum(nplays)  # twice the number of kept games: each kept game increments both nplays[a,b] and nplays[b,a]

9354.0

In [273]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

# Flatten the win matrix into three parallel lists, one entry per ordered pair (a, b)
# whose win count nwins[a, b] is nonzero. Pairs with zero wins are left out.
win, PA, PB = [], [], []
for a, b in zip(*np.nonzero(nwins)):      # row-major order, same traversal as np.ndenumerate
    win.append(int(nwins[a, b]))
    PA.append(int(a) + 1)                 # shift ids to start at 1: player 0 becomes player 1
    PB.append(int(b) + 1)

In [274]:
# Peek at the first six records of the three parallel lists built in the previous cell.
print(win[:6], " # of wins PA had over PB") 
print(PA[:6], " PA's ID") 
print(PB[:6], " PB's ID")
len(win)            # number of (PA, PB) records, i.e. ordered pairs where PA won at least once -- not the number of games

[1, 1, 2, 4, 4, 2]  # of wins PA had over PB
[1, 1, 1, 1, 1, 1]  PA's ID
[2, 4, 6, 7, 8, 9]  PB's ID


3321

##### Stan Model

In [275]:
skill_model = """
data {
  int<lower=1> N;                     // total number of players (999 in this data set)
  int<lower=1> E;                     // number of (PA, PB) records (3321 in this data set)
  real<lower=0> scale;                // multiplier applied to the skill difference
  int<lower=1,upper=5> win[E];        // wins of PA over PB, between 1 and 5
  int PA[E];                          // 1-based id of the first player in each record
  int PB[E];                          // 1-based id of the second player in each record
}
parameters {
  vector [N] skill;                   // skill value for each player
}

model{
  skill ~ normal(0,5);                // independent normal(0,5) prior on every player's skill
  for (i in 1:E){
    // Win count out of 5 trials whose success log-odds is the scaled skill difference.
    // The (trials, log-odds) form is binomial_logit; the Bernoulli variant takes a single
    // argument and only models 0/1 outcomes, so it cannot be used for counts up to 5.
    // NOTE(review): the trial count is fixed at 5 even for pairs that played fewer games
    // (e.g. 2 wins out of 2 games is modelled as 2 out of 5) -- confirm this is intended;
    // passing the per-pair game count as data would be more faithful.
    win[i] ~ binomial_logit(5, (scale)*(skill[PA[i]]-skill[PB[i]]) );
  }
}
"""

##### compile the model

In [276]:
import pickle
# Compiling the Stan model is slow, so reuse a pickled build when one is on disk.
# NOTE: the cache is keyed only by file name -- it is NOT invalidated when skill_model
# changes, so delete skill_model.pkl after editing the Stan code.
# NOTE(security): pickle.load can execute code from the file; only load a cache you created yourself.
try:     # load it if already compiled
    with open('skill_model.pkl', 'rb') as f:   # context manager closes the handle even if unpickling fails
        sm = pickle.load(f)
except Exception:  # missing/unreadable cache (but not Ctrl-C): compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)

In [277]:
# Inputs for the Stan data block. N and E are derived from the arrays actually built above,
# so they stay correct if the sparsifying parameters (or the training file) change.
skill_data = {
    'N': nplayers,      # number of players
    'E': len(win),      # number of (PA, PB) records
    'scale': 0.1,       # multiplier applied to the skill difference inside the model
    'win': win,
    'PA': PA,
    'PB': PB
}

Now, we can perform MCMC on the model, and extract the samples:

In [278]:
# Fixed seed so the posterior draws (and every number derived from them below) are reproducible.
fit = sm.sampling(data=skill_data, iter=1000, chains=2, seed=0)

In [279]:
samples = fit.extract()   # posterior draws keyed by parameter name; samples['skill'] is read below

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [280]:
# Posterior-mean skill per player: average the draws along the first axis.
skill_draws = samples['skill']
player_skills = skill_draws.mean(axis=0)
print(player_skills)


[0.50332678 0.73230844 1.0644105  0.19198068 0.35219029 2.01000608
 0.30597953 1.31795097 0.47922447 1.15018521 2.54122717 2.04680167
 1.39876339 1.83875431 1.65901077 0.821774   2.844181   2.28370498
 1.64128003 1.61818507 2.27469163 1.67491987 0.52268555 0.35581906
 0.38440722 2.78075236 0.73082449 1.55409992 0.2307017  0.82132964
 1.83349028 3.68042626 1.29675467 1.01321477 2.96246133 2.89739406
 2.43988487 2.57616151 0.35509254 2.58563537 2.59593557 0.60280802
 1.057093   0.84421652 2.2630846  2.01180351 0.85262577 0.64213773
 1.87766629 1.37849551 2.12831478 1.30689973 2.87902817 2.02879487
 2.18434828 1.41864851 1.06855203 1.439259   2.5299713  1.38621129
 1.09792547 1.78083963 0.43730163 2.07473279 0.69781062 2.561981
 2.69392629 0.28735266 0.22468761 0.36605283 1.99617834 2.15565099
 3.62381045 0.84189158 0.31195897 2.64411366 0.78009672 2.58985167
 0.68207157 2.61677195 0.3684654  0.25107256 2.21245529 1.3164459
 1.42517219 2.35191672 0.47624004 3.79557223 1.36109018 3.7304633

Finding the name of the player with the highest skill according to our model

In [281]:
# Locate the largest and smallest entries of the posterior-mean skill array.
# ind / ind2 are index tuples (one entry per array dimension); later cells read ind[0] and ind2[0].
ind = np.unravel_index(player_skills.argmax(), player_skills.shape)
ind2 = np.unravel_index(player_skills.argmin(), player_skills.shape)

top_id, bottom_id = ind[0], ind2[0]
print("the highest skill level is: ", player_skills[top_id], " and his name is: ", playername[top_id])
print("the lowest skill level is: ", player_skills[bottom_id], " and his name is: ", playername[bottom_id])

the highest skill level is:  4.230775324829369  and his name is:  Sen
the lowest skill level is:  0.19198067695712345  and his name is:  INnoVation


The above result is surprising because INnoVation is a good player. According to this link https://www.lineups.com/esports/top-10-starcraft-ii-players-of-all-time/ these are the top ten players of all time, so they should have high skill levels.

In [282]:
# Players from the linked "top 10 of all time" list, looked up by name.
top10_names = ["Mvp", "Life", "TaeJa", "MC", "Polt", "INnoVation", "Zest", "NesTea", "MMA", "Rain"]
top10 = np.array([playerid[name] for name in top10_names])   # KeyError if a name never appears in playerid

# Loop variable is deliberately not called `id`: that would shadow the builtin id()
# for every cell executed afterwards in this kernel.
for player_id in top10:
    print(playername[player_id], "'s skill level is: ", player_skills[player_id])


Mvp 's skill level is:  0.41083508825870557
Life 's skill level is:  1.0979254677077046
TaeJa 's skill level is:  0.3958064677725714
MC 's skill level is:  0.5033267762289416
Polt 's skill level is:  0.2542150593402713
INnoVation 's skill level is:  0.19198067695712345
Zest 's skill level is:  0.47922447073044566
NesTea 's skill level is:  1.101697638422049
MMA 's skill level is:  0.2246876148035381
Rain 's skill level is:  0.8526257663685175


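So, we're actually getting the opposite of what we were expecting. I think this has to do with the fact that I changed the sampling distribution from bernoulli_logit to binomial_logit: the model always assumes 5 games per matchup, so a player who won, say, 2 of only 2 games played is treated as if he had won 2 of 5, which likely pushes frequent winners downward. But we can just say that lower is better and still call this a good model.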

If we want to predict which player will win, we might use a direct estimator of that quantity based on the sample values:

In [283]:
# Highest-skill vs lowest-skill player prediction (ids taken from ind and ind2):
def logit(z):
    """Logistic sigmoid 1 / (1 + exp(-z)): maps a log-odds value (scalar or array) to a probability.

    Despite its name this is the inverse of the logit function.
    """
    denominator = 1. + np.exp(-z)
    return 1. / denominator

# Apply the model's win-probability function (logistic of the scaled skill difference)
# to every posterior draw, then average the per-draw probabilities.
first_id, second_id = ind[0], ind2[0]
skill_gap = samples['skill'][:, first_id] - samples['skill'][:, second_id]
prob = logit(skill_data['scale'] * skill_gap).mean()

print(playername[first_id], "has a ", prob * 100, "% chance of winning against ", playername[second_id])

Sen has a  59.90946367675115 % chance of winning against  INnoVation


Based on online rankings, INnoVation should be the winner, not Sen, so we need to invert the skill levels.

##### Inverting the skill levels

In [284]:
# Reflect every skill about 5 so that low raw values become high ones.
# NOTE(review): |5 - x| only reverses the ordering while all raw skills are <= 5 -- confirm that holds.
player_skills2 = np.abs(5 - player_skills)

In [285]:
# Same extreme-value lookup as before, now on the inverted skills.
# ind / ind2 are rebound here; later cells read ind[0] and ind2[0].
ind = np.unravel_index(player_skills2.argmax(), player_skills2.shape)
ind2 = np.unravel_index(player_skills2.argmin(), player_skills2.shape)

top_id, bottom_id = ind[0], ind2[0]
print("the highest skill level is: ", player_skills2[top_id], " and his name is: ", playername[top_id])
print("the lowest skill level is: ", player_skills2[bottom_id], " and his name is: ", playername[bottom_id])

the highest skill level is:  4.808019323042877  and his name is:  INnoVation
the lowest skill level is:  0.7692246751706309  and his name is:  Sen


In [286]:
# Players from the linked "top 10 of all time" list, looked up by name.
top10_names = ["Mvp", "Life", "TaeJa", "MC", "Polt", "INnoVation", "Zest", "NesTea", "MMA", "Rain"]
top10 = np.array([playerid[name] for name in top10_names])   # KeyError if a name never appears in playerid

# Loop variable is deliberately not called `id`: that would shadow the builtin id()
# for every cell executed afterwards in this kernel.
for player_id in top10:
    print(playername[player_id], "'s skill level is: ", player_skills2[player_id])


Mvp 's skill level is:  4.589164911741294
Life 's skill level is:  3.9020745322922954
TaeJa 's skill level is:  4.604193532227429
MC 's skill level is:  4.496673223771058
Polt 's skill level is:  4.745784940659728
INnoVation 's skill level is:  4.808019323042877
Zest 's skill level is:  4.520775529269554
NesTea 's skill level is:  3.898302361577951
MMA 's skill level is:  4.775312385196462
Rain 's skill level is:  4.147374233631482


In [287]:
# Highest skilled player vs Lowest skilled player:
def logit(z):
    """Logistic sigmoid 1 / (1 + exp(-z)): maps a log-odds value (scalar or array) to a probability.

    Despite its name this is the inverse of the logit function.
    """
    denominator = 1. + np.exp(-z)
    return 1. / denominator

# Apply the model's win-probability function (logistic of the scaled skill difference)
# to every posterior draw, then average the per-draw probabilities.
# prob is computed from the raw (un-inverted) draws while ind / ind2 were chosen on the
# inverted skills, which is why the complement 1 - prob is the value reported.
first_id, second_id = ind[0], ind2[0]
skill_gap = samples['skill'][:, first_id] - samples['skill'][:, second_id]
prob = logit(skill_data['scale'] * skill_gap).mean()

print(playername[first_id], "has a ", (1-prob) * 100, "% chance of winning against ", playername[second_id])

INnoVation has a  59.90946367675114 % chance of winning against  Sen


This looks more correct. Note how, despite INnoVation having an incredibly high skill number, he still has a relatively low chance of winning against Sen. This is due to how we scaled our data: the skill difference is multiplied by only 0.1.

### Below we start using validation

In [288]:
import csv
# Read the whole validation file into memory as a list of rows (each row a list of strings).
with open("valid.csv", newline='') as valid_file:
    valid_data = list(csv.reader(valid_file))

In [289]:
print(ind2[0])   # debug peek: ind2 was last assigned from the argmin of player_skills2

304


In [290]:
# Score two predictors on the validation rows. Fields of each row that are used here:
#   row[1] = player 1 name, row[2] = player 1 result ('[winner]' / '[loser]'), row[4] = player 2 name
#   (row[0] is the date, row[5] is player 2's result; the meaning of row[3] is unknown)
number_correct_skill = 0   # hits when predicting from the inverted mean skills (player_skills2)
number_correct_prob = 0    # hits when predicting from the posterior win probability
winner = 0                 # predicted winner for the current row: 1 or 2
for row in valid_data:
    p1name, p1win, p2name = row[1], row[2], row[4]
    id1, id2 = playerid[p1name], playerid[p2name]   # KeyError for a player never seen in training
    prob = logit(skill_data['scale'] * (samples['skill'][:, id1] - samples['skill'][:, id2])).mean()

    # Predictor 1: posterior probability. The complement is used because the raw skills
    # are treated as "lower is better".
    winner = 1 if ((1 - prob) * 100) > 50 else 2
    if (p1win == '[winner]' and winner == 1) or (p1win == '[loser]' and winner == 2):
        number_correct_prob += 1

    # Predictor 2: direct comparison of the inverted mean skills.
    winner = 1 if player_skills2[id1] > player_skills2[id2] else 2
    if (p1win == '[winner]' and winner == 1) or (p1win == '[loser]' and winner == 2):
        number_correct_skill += 1

In [291]:
# Accuracy of the mean-skill predictor; the denominator counts every validation row.
fraction_correct = number_correct_skill / len(valid_data)
prct_correct = fraction_correct * 100
print("For this graphical model, using only the calculate skill, it gets",prct_correct,"% correct")

For this graphical model, using only the calculate skill, it gets 60.17424234365526 % correct


In [292]:
# Accuracy of the posterior-probability predictor; the denominator counts every validation row.
fraction_correct = number_correct_prob / len(valid_data)
prct_correct = fraction_correct * 100
print("For this graphical model, using only the calculated probability, it gets",prct_correct,"% correct")

For this graphical model, using only the calculated probability, it gets 60.17530609422702 % correct
