# Loading data

In [131]:
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
import pystan

# Load training data and reduce (subsample) if desired

# Read thru file to get numeric ids for each player 
with open('train.csv') as f: lines = f.read().split('\n')

p = 0; playerid = {};
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    player0,player1 = csv[1],csv[4];
    if player0 not in playerid: playerid[player0]=p; p+=1;
    if player1 not in playerid: playerid[player1]=p; p+=1;

nplayers = len(playerid)
playername = ['']*nplayers
for player in playerid: playername[ playerid[player] ]=player;  # id to name lookup


# Sparsifying parameters (discard some training examples):
pKeep = 1.0   # fraction of edges to consider (immed. throw out 1-p edges)
nEdge = 3     # try to keep nEdge opponents per player (may be more; asymmetric)
nKeep = 5     # keep at most nKeep games per opponent pairs (play each other multiple times)

nplays, nwins = np.zeros( (nplayers,nplayers) ), np.zeros( (nplayers,nplayers) );
for i in range(len(lines)):
    csv = lines[i].split(',');
    if len(csv) != 10: continue;   # parse error or blank line
    a,b = playerid[csv[1]],playerid[csv[4]];
    aw,bw = csv[2]=='[winner]',csv[5]=='[winner]';
    if (np.random.rand() < pKeep):
        if (nplays[a,b] < nKeep) and ( ((nplays[a,:]>0).sum() < nEdge) or ((nplays[:,b]>0).sum() < nEdge) ):
            nplays[a,b] += 1; nplays[b,a]+=1; nwins[a,b] += aw; nwins[b,a] += bw;

In [132]:
nplayers # number of unique players

999

In [133]:
playerid # map from playername -> playerID
playername[0] # list of playernames indexed by their ID
print(playername[0], "vs", playername[1])

MC vs Stats


In [134]:
nplays[0,1] # number of games between player 0 and player 1

2.0

In [135]:
nwins[0,1] # number of wins between player 0 and player 1
np.max(nwins) # maximum number of wins against a single opponent is 5 

5.0

In [136]:
np.sum(nplays)  # number of player vs player combinations

9354.0

In [137]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
%matplotlib inline

win = []
PA = []
PB = []
for index, wins in np.ndenumerate(nwins):
    if wins == 0: continue
    win.append(int(wins))
    PA.append(index[0] + 1)           # increment so we can index starting at 1
    PB.append(index[1] + 1)           # player 0 is now player 1

In [138]:
print(win[:6], " # of wins PA had over PB") 
print(PA[:6], " PA's ID") 
print(PB[:6], " PB's ID")
len(win)            # total number of games

[1, 1, 2, 4, 4, 2]  # of wins PA had over PB
[1, 1, 1, 1, 1, 1]  PA's ID
[2, 4, 6, 7, 8, 9]  PB's ID


3321

##### Stan Model

In [139]:
skill_model = """
data {
  int<lower=1> N;            
  int<lower=1> E;           
  real<lower=0> scale;       
  int<lower=1,upper=5> win[E];      
  int PA[E];                  
  int PB[E];                  
}
parameters {
  vector [N] skill;       
}

model{
  for (i in 1:N){ skill[i]~normal(0,5); }
  for (i in 1:E){
    win[i] ~ bernoulli_logit(5, (scale)*(skill[PA[i]]-skill[PB[i]]) );
  }   // win probability is a binomial_logit function of skill difference (0-5)
}
"""

##### compile the model

In [140]:
import pickle
try:     # load it if already compiled
    sm = pickle.load(open('skill_model.pkl', 'rb'))
except:  # ow, compile and save compiled model
    sm = pystan.StanModel(model_code = skill_model)
    with open('skill_model.pkl', 'wb') as f: pickle.dump(sm, f)

ValueError: Failed to parse Stan model 'anon_model_c6545b8bafeb02a55161f2b887f1299a'. Error message:
SYNTAX ERROR, MESSAGE(S) FROM PARSER:
No matches for: 

  int ~ bernoulli_logit(int, real)

Available argument signatures for bernoulli_logit:

  int ~ bernoulli_logit(real)
  int ~ bernoulli_logit(real[ ])
  int ~ bernoulli_logit(vector)
  int ~ bernoulli_logit(row_vector)
  int[ ] ~ bernoulli_logit(real)
  int[ ] ~ bernoulli_logit(real[ ])
  int[ ] ~ bernoulli_logit(vector)
  int[ ] ~ bernoulli_logit(row_vector)

Real return type required for probability function.
 error in 'unknown file name' at line 17, column 70
  -------------------------------------------------
    15:   for (i in 1:N){ skill[i]~normal(0,5); }
    16:   for (i in 1:E){
    17:     win[i] ~ bernoulli_logit(5, (scale)*(skill[PA[i]]-skill[PB[i]]) );
                                                                             ^
    18:   }   // win probability is a binomial_logit function of skill difference (0-5)
  -------------------------------------------------



In [None]:
skill_data = {
    'N': 999,
    'E': 3321,
    'scale': 0.1,
    'win': win,
    'PA': PA,
    'PB': PB
}

Now, we can perform MCMC on the model, and extract the samples:

In [None]:
fit = sm.sampling(data=skill_data, iter=1000, chains=2)

In [None]:
samples = fit.extract()

If we just want the mean estimate for each player's skill level, just take the empirical average over the samples:

In [None]:
player_skills = samples['skill'].mean(0)
print(player_skills)


Finding the name of the player with the highest skill according to our model

In [None]:
ind = np.unravel_index(np.argmax(player_skills, axis=None), player_skills.shape)
ind2 = np.unravel_index(np.argmin(player_skills, axis=None), player_skills.shape)

print("the highest skill level is: ", player_skills[ind[0]], " and his name is: ", playername[ind[0]])
print("the lowest skill level is: ", player_skills[ind2[0]], " and his name is: ", playername[ind2[0]])

The above result is surprising because Zest is a good player. According to this link https://www.lineups.com/esports/top-10-starcraft-ii-players-of-all-time/ these are the top ten players of all time. So they should have high skill levels.

In [None]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills[id])


So, we're actually getting the opposite of what we were expecting. I think this has to do with the fact that I changed the sampling distribution from bernoulli_logit to binomial_logit. But we can just say that lower is better and still call this a good model.

If we want to predict which player will win, we might use a direct estimator of that quantity based on the sample values:

In [None]:
# Player 0 vs Player 1 prediction:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind[0]]-samples['skill'][:,ind2[0]]) ).mean()

print(playername[ind[0]], "has a ", prob * 100, "% chance of winning against ", playername[ind2[0]])

Based on online rankings, innovation should be the winner, not Sen, thus inversing them is a must.

##### inversing the skill_levels

In [None]:
player_skills2 = np.array([abs(5-x) for x in player_skills])

In [None]:
ind = np.unravel_index(np.argmax(player_skills2, axis=None), player_skills2.shape)
ind2 = np.unravel_index(np.argmin(player_skills2, axis=None), player_skills2.shape)

print("the highest skill level is: ", player_skills2[ind[0]], " and his name is: ", playername[ind[0]])
print("the lowest skill level is: ", player_skills2[ind2[0]], " and his name is: ", playername[ind2[0]])

In [None]:
top10 = np.array([
    playerid["Mvp"],
    playerid["Life"],
    playerid["TaeJa"],
    playerid["MC"],
    playerid["Polt"],
    playerid["INnoVation"],
    playerid["Zest"],
    playerid["NesTea"],
    playerid["MMA"],
    playerid["Rain"]
])

for id in top10:
    print(playername[id], "'s skill level is: ", player_skills2[id])


In [None]:
# Highest skilled player vs Lowest skilled player:
def logit(z): return 1./(1.+np.exp(-z))

# Use our model's win probability function (logistic of scaled difference)
#  using the predicted skill difference for each sample:
prob = logit( skill_data['scale']*(samples['skill'][:,ind[0]]-samples['skill'][:,ind2[0]]) ).mean()

print(playername[ind[0]], "has a ", (1-prob) * 100, "% chance of winning against ", playername[ind2[0]])

This looks more correct. Not how, despite innovation having an incredibly high skill number, still has a relatively low chance of winning against Sen. This is due to how we scaled our data, the skill difference being scaled to only .1

### Below we start using validation

In [None]:
import csv
with open("valid.csv", newline='') as i:
    reader = csv.reader(i)
    valid_data= list(reader)

In [None]:
#0=date
#1=p1name
#2=p1win?
#3=idk
#4=p2name
#5=p2win?
number_correct_skill=0
number_correct_prob=0
winner=0
for row in valid_data:
#     ind = np.unravel_index(np.argmax(player_skills, axis=None), player_skills.shape)
#     ind2 = np.unravel_index(np.argmin(player_skills, axis=None), player_skills.shape)

    p1name=row[1]
    p1win=row[2]
    p2name=row[4]
    prob = logit( skill_data['scale']*(samples['skill'][:,playerid[p1name]]-samples['skill'][:,playerid[p2name]])).mean()
    
    
    #using prob. as the factor in correctness
    if ((1-prob) * 100) > 50:
        winner=1
    else:
        winner=2
    #print(1-prob)
    
    #check to see if graphical model is correct
    if p1win=='[winner]' and winner==1:
        number_correct_prob+=1
    elif p1win=='[loser]' and winner==2:
        number_correct_prob+=1
    
    
    #using calculated skill as the factor in correctness
    if player_skills2[playerid[p1name]] > player_skills2[playerid[p2name]]:
        winner=1
    else:
        winner=2
    
    if p1win=='[winner]' and winner==1:
        number_correct_skill+=1
    elif p1win=='[loser]' and winner==2:
        number_correct_skill+=1      

In [None]:
prct_correct=(number_correct_skill/len(valid_data)) *100
print("For this graphical model, using only the calculate skill, it gets",prct_correct,"% correct")

In [None]:
prct_correct=(number_correct_prob/len(valid_data)) *100
print("For this graphical model, using only the calculated probability, it gets",prct_correct,"% correct")