## Preparing data

In [1]:
#import dataset we will be working with and reset index
import pandas as pd
players_df = pd.read_pickle("players_df.pkl")
players_df = players_df.reset_index()

In [2]:
#look for na in our dataset by column
pd.set_option('display.max_columns', None)
for col in players_df.columns:
    numNa = players_df[col].isnull().value_counts()
    try:
        if numNa.values[1] > 100:
            print(numNa)
    except:
        None

False    1409802
True      183348
Name: stats.neutralMinionsKilledTeamJungle, dtype: int64
False    1409802
True      183348
Name: stats.neutralMinionsKilledEnemyJungle, dtype: int64
False    1409766
True      183384
Name: stats.wardsPlaced, dtype: int64
False    1409766
True      183384
Name: stats.wardsKilled, dtype: int64
False    1589762
True        3388
Name: stats.firstBloodKill, dtype: int64
False    1589762
True        3388
Name: stats.firstBloodAssist, dtype: int64
False    1573922
True       19228
Name: stats.firstTowerKill, dtype: int64
False    1573922
True       19228
Name: stats.firstTowerAssist, dtype: int64
False    1208562
True      384588
Name: stats.firstInhibitorKill, dtype: int64
False    1208562
True      384588
Name: stats.firstInhibitorAssist, dtype: int64
False    1592869
True         281
Name: stats.perkSubStyle, dtype: int64
False    1591566
True        1584
Name: stats.statPerk0, dtype: int64
False    1591566
True        1584
Name: stats.statPerk1, dtype: in

We are interested in columns "stats.wardsPlaced" and "stats.wardsKilled" and they have NaN values, therefore we are going to drop rows to get these clean

In [2]:
players_df_gg = players_df.dropna(subset=["stats.wardsPlaced","stats.wardsKilled"])

In [3]:
#create subsets to work with, select columns we want
x = players_df_gg[["stats.kills","stats.deaths","stats.assists","stats.totalDamageDealt","stats.magicDamageDealt","stats.physicalDamageDealt","stats.trueDamageDealt","stats.totalDamageDealtToChampions","stats.magicDamageDealtToChampions","stats.physicalDamageDealtToChampions","stats.trueDamageDealtToChampions","stats.totalHeal","stats.damageSelfMitigated","stats.damageDealtToObjectives","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.magicalDamageTaken","stats.physicalDamageTaken","stats.trueDamageTaken","stats.goldEarned","stats.goldSpent","stats.turretKills","stats.inhibitorKills","stats.totalMinionsKilled","stats.neutralMinionsKilled","stats.champLevel","stats.wardsPlaced","stats.wardsKilled"]]
y = players_df_gg[["championId","spell1Id","spell2Id","timeline.role","timeline.lane"]]

In [5]:
#look for na in our dataset by column
pd.set_option('display.max_columns', None)
for col in x.columns:
    numNa = x[col].isnull().value_counts()
    try:
        if numNa.values[1] > 100:
            print(numNa)
    except:
        None

In [6]:
#look for na in our dataset by column
pd.set_option('display.max_columns', None)
for col in y.columns:
    numNa = y[col].isnull().value_counts()
    try:
        if numNa.values[1] > 100:
            print(numNa)
    except:
        None

We haven't got NaN values in the biggest subset of the dataset that we will try, we are good. All test will be reducing the amount of columns already selected in x and y

## Testing different models
In order to make testing in a more controlled manner we created classes and a function, (if you want to see the working process of creating this classes you can find the notebooks in the appendix folder)

KNN , PCA+KNN  and CORRELATION

In [35]:
from sklearn.model_selection import train_test_split
from sklearn import neighbors, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import warnings
from sklearn.exceptions import DataConversionWarning

class knn_predictor():
    def __init__(self,x,y,num_neighb = 1):
        self.values_in = x.columns.values
        self.values_out = y.columns.values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.25, random_state=42) #Keep 25% of the data as the test set
        self.n_neighbours = num_neighb # Number of neigbours we will use for the classification by default
        self.clf = neighbors.KNeighborsClassifier(self.n_neighbours, weights='distance', algorithm = 'kd_tree')
        self.clf.fit(self.X_train, np.ravel(self.y_train))
        self.clf.get_params()

    def predict(self,input):
        print(self.clf.predict([input]))
    
    def accuracy(self):
        #print("Train Accuracy : " + str(self.clf.score(self.X_train, self.y_train)))
        print("Test Accuracy : " + str(self.clf.score(self.X_test, self.y_test)))
        None


class pca_and_knn():
    def __init__(self,n_components,x,y,neighbors=1):
        #first we standarize all our data to prepare it for PCA
        self.scaler = StandardScaler()
        self.n_components = n_components
        self.Xcentred = self.scaler.fit_transform(x)
        self.y = y

        #we want to transform our data create PCA object
        self.pca = PCA(self.n_components)

        #calculate
        self.pca.fit(self.Xcentred)

        #new dataset 
        self.Xnew = self.pca.transform(self.Xcentred) #here it is a np array we want a pd dataframe
        self.Xnew = pd.DataFrame(self.Xnew)
        self.reduced_full = pd.merge(self.Xnew, self.y, left_index=True, right_index=True)

        self.knn_mod = knn_predictor(self.Xnew,self.y.to_frame(),neighbors)
    def variance_explained(self):
        print("\nThe components we have found, explain the following percentage of variance: \n", self.pca.explained_variance_ratio_) # explained_variance_ratio_ stores the percentage of variance each of our components explains
    
    def predict_knn(self,entry):
        #self.entry = self.pca.transform(self.scaler.fit_transform(entry))
        self.entry = self.scaler.transform(entry.to_frame().T)
        self.entry = self.pca.transform(self.entry)
        #we reuse the class we created before
        self.knn_mod.predict(self.entry[0])
        
    def knn_accuracy(self):
        self.knn_mod.accuracy()


def correlation_most_sim(x,y,input,methd="pearson",topN=1):
    similarity = x.corrwith(input, axis = "columns", method = methd) # This function calculates everything based on common items (e.g. mean of user is calculated based only on the common items with that other user we correlate with)
    similarity.sort_values(ascending = False, inplace = True)
    return [y.iloc[similarity[:topN].index],x.iloc[similarity[:topN].index],similarity[:topN],]


class kmeans_mod():
    def __init__(self,n_clusterss,x):
        #normalitzar dades
        self.scaler = StandardScaler()
        self.Xcentred = self.scaler.fit_transform(x)
        
        self.kmeans = KMeans(n_clusters=n_clusterss, random_state=0).fit(self.Xcentred)
        warnings.filterwarnings(action='ignore', category=UserWarning) #we filter 

    def labels(self):
        return self.kmeans.labels_
    
    def predict(self,entryes):
        self.entry = self.scaler.transform(entryes.to_frame().T)
        return self.kmeans.predict(self.entry)



### KNN----------------------------------

We will start with a knn classifier to predict the most suitable champion.

In [8]:
knn_pred = knn_predictor(x,pd.DataFrame(y["championId"]),1) #one neighbour
knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Test Accuracy : 0.7254470239074798


In [9]:
#We can also predict entries
knn_pred.predict(x.iloc[33])

[142]




We see that we can calculate its accuracy but this is only with one neighbour, lets try up to 5 for testing

In [10]:
for n in range(1,5):
    knn_pred = knn_predictor(x,pd.DataFrame(y["championId"]),n) #one neighbour
    print("Neighbours: "+format(n))
    knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Neighbours: 1
Test Accuracy : 0.7254470239074798
Neighbours: 2
Test Accuracy : 0.725444186561193
Neighbours: 3
Test Accuracy : 0.7284177254697227
Neighbours: 4
Test Accuracy : 0.7335107620544657


More neighbours are giving more test accuracy! let's see how far it goes

In [11]:
for n in range(15,30):
    knn_pred = knn_predictor(x,pd.DataFrame(y["championId"]),n) #one neighbour
    print("Neighbours: "+format(n))
    knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Neighbours: 15
Test Accuracy : 0.7556732739003864
Neighbours: 16
Test Accuracy : 0.7563003274297615
Neighbours: 17
Test Accuracy : 0.7565500139029968
Neighbours: 18
Test Accuracy : 0.7565386645178498
Neighbours: 19
Test Accuracy : 0.756498941669835
Neighbours: 20
Test Accuracy : 0.7563797731257909
Neighbours: 21
Test Accuracy : 0.7563372129314895
Neighbours: 22
Test Accuracy : 0.7564705682069673
Neighbours: 23
Test Accuracy : 0.7567571401819306
Neighbours: 24
Test Accuracy : 0.7565188030938423
Neighbours: 25
Test Accuracy : 0.7566408089841733
Neighbours: 26
Test Accuracy : 0.7565443392104233
Neighbours: 27
Test Accuracy : 0.7564336827052395
Neighbours: 28
Test Accuracy : 0.7564053092423718
Neighbours: 29
Test Accuracy : 0.7563343755852027


We see that

### PCA+KNN ------------------------

Now, using the same dataset lets try doing PCA and then classifying using knn again

In [6]:
pca_mod = pca_and_knn(15,x,y["championId"]) #we are reducing to 4 components
pca_mod.variance_explained()


The components we have found, explain the following percentage of variance: 
 [0.37250628 0.12018041 0.08157546 0.07175328 0.06426078 0.03873198
 0.03598934 0.02961064 0.02758447 0.02397564 0.02173817 0.01925794
 0.01624248 0.01433543 0.01382627]


In [13]:
##see that we can also make predictions
pca_mod.predict_knn(x.iloc[33])

[142]


In [14]:
#and we can also calculate the knn accuracy
pca_mod.knn_accuracy()

Test Accuracy : 0.5672280829186079


Looking for the best combination of columns for PCA

In [15]:
#we create this functions to recursively generate all combinations of columns
import numpy as np
def all_except(array):
    combinations = []
    for idx in range(len(array)):
        current_combination = np.delete(array,idx)
        combinations.append(list(current_combination))
        if len(current_combination) > 2:
            sub_comb = all_except(current_combination)
            for combi in sub_comb:
                if not (combi in combinations):
                    combinations.append(list(combi))
    return combinations

In [16]:
#for testing purposes:
all_except([1,2,3,4])

[[2, 3, 4],
 [3, 4],
 [2, 4],
 [2, 3],
 [1, 3, 4],
 [1, 4],
 [1, 3],
 [1, 2, 4],
 [1, 2],
 [1, 2, 3]]

Change of input data, we reduce the numer of columns!

In [17]:
#28 columns is an inmense amount of combinations that is why we are only going to use the combinations of the following selected columns
x2 = x[["stats.kills","stats.assists","stats.magicDamageDealt","stats.physicalDamageDealt","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.turretKills","stats.totalMinionsKilled","stats.totalHeal"]]
col_comb = all_except(list(x2.columns))

In [18]:
#Now we will do pca with 5 components for each combination
results = []
cnt = 0
lenght = len(col_comb)
for cols in col_comb:
    cnt +=1
    if len(cols) > 5:
        print(format(cnt)+"/"+format(lenght))
        pca_mod = pca_and_knn(4,x2[cols],y["championId"]) #we are reducing to 4 components
        results.append([cols,pca_mod.knn_mod.clf.score(pca_mod.knn_mod.X_test, pca_mod.knn_mod.y_test),pca_mod.pca.explained_variance_ratio_])

1/501
2/501
3/501
60/501
91/501
107/501
115/501
119/501
121/501
122/501
123/501
154/501
170/501
178/501
182/501
184/501
185/501
186/501
202/501
210/501
214/501
216/501
217/501
218/501
226/501
230/501
232/501
233/501
234/501
238/501
240/501
241/501
242/501
244/501
245/501
246/501
247/501
248/501
249/501
250/501
281/501
297/501
305/501
309/501
311/501
312/501
313/501
329/501
337/501
341/501
343/501
344/501
345/501
353/501
357/501
359/501
360/501
361/501
365/501
367/501
368/501
369/501
371/501
372/501
373/501
374/501
375/501
376/501
377/501
393/501
401/501
405/501
407/501
408/501
409/501
417/501
421/501
423/501
424/501
425/501
429/501
431/501
432/501
433/501
435/501
436/501
437/501
438/501
439/501
440/501
441/501
449/501
453/501
455/501
456/501
457/501
461/501
463/501
464/501
465/501
467/501
468/501
469/501
470/501
471/501
472/501
473/501
477/501
479/501
480/501
481/501
483/501
484/501
485/501
486/501
487/501
488/501
489/501
491/501
492/501
493/501
494/501
495/501
496/501
497/501
498/501


In [19]:
results_df = pd.DataFrame(results, columns = ['columns', 'precision_test',"variance_prop"])
results_df

Unnamed: 0,columns,precision_test,variance_prop
0,"[stats.assists, stats.magicDamageDealt, stats....",0.555822,"[0.35028980493212897, 0.24558752761669123, 0.1..."
1,"[stats.magicDamageDealt, stats.physicalDamageD...",0.584760,"[0.39064344181600347, 0.25899823954028484, 0.1..."
2,"[stats.physicalDamageDealt, stats.damageDealtT...",0.560262,"[0.45321768292304876, 0.24983101999913476, 0.1..."
3,"[stats.magicDamageDealt, stats.damageDealtToTu...",0.571524,"[0.3909182849380097, 0.28598142131697907, 0.13..."
4,"[stats.magicDamageDealt, stats.physicalDamageD...",0.603399,"[0.3771158712436026, 0.28249466414423874, 0.14..."
...,...,...,...
124,"[stats.kills, stats.assists, stats.magicDamage...",0.526529,"[0.3836344350157578, 0.2396916917465311, 0.144..."
125,"[stats.kills, stats.assists, stats.magicDamage...",0.542265,"[0.3784915635844839, 0.21724566489516542, 0.14..."
126,"[stats.kills, stats.assists, stats.magicDamage...",0.532811,"[0.3727184219514579, 0.23070123029435444, 0.12..."
127,"[stats.kills, stats.assists, stats.magicDamage...",0.523556,"[0.39481241601880546, 0.21623809417174816, 0.1..."


In [20]:
#asking for precision over 0.6 is showing only 2 combinations
results_df[results_df["precision_test"]>0.6]

Unnamed: 0,columns,precision_test,variance_prop
4,"[stats.magicDamageDealt, stats.physicalDamageD...",0.603399,"[0.3771158712436026, 0.28249466414423874, 0.14..."
53,"[stats.kills, stats.magicDamageDealt, stats.ph...",0.601526,"[0.41285686851522896, 0.267998486413778, 0.145..."


### CORRELATION_MOST_SIMILAR------------------------

In [39]:
#we will take as an input random entry from x
input = x.iloc[3333]
x_without_input = x.drop(index=3333)
correlation_most_sim(x.iloc[:300],y,input,methd="pearson",topN=1)[0] #using only 300 of x for quick testing

Unnamed: 0,championId,spell1Id,spell2Id,timeline.role,timeline.lane,name,partype,tag
233,64,11,4,DUO_SUPPORT,NONE,Lee Sin,Energy,Fighter


In [22]:
def testing_precision_corr_rec(x,y,Nevals,y_columns_targeted,method="pearson"):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42) #Keep 25% of the data as the test set
    precision = 0

    eval_subset = X_test.iloc[:Nevals]
    eval_subset = eval_subset.reset_index()
    eval_subset_y = y_test.iloc[:Nevals]
    eval_subset_y = eval_subset_y.reset_index()
    for idx in range(Nevals):
        print(format(idx)+"/"+format(Nevals))
        entry = eval_subset.iloc[idx].drop("index")
        entry_y = eval_subset_y.iloc[idx]
        try:
            recommendation = correlation_most_sim(X_train,y_train,entry,methd=method,topN=1)[0]
            if(np.ravel(recommendation[y_columns_targeted])[0] == entry_y["championId"]):
                precision += 1
        except:
            None
            
    return precision/Nevals

### CORRELATION_RECOMMENDER------------------------

Let's think of the situation where given certain values like:
["stats.kills","stats.assists","stats.magicDamageDealt","stats.physicalDamageDealt","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.turretKills","stats.totalMinionsKilled","stats.totalHeal"] we want to predict one of them.
For example, how many turrets is going to kill someone with the other characteristics?
My intuition says that probably someone with a lot of physical damage will have more.

We will reuse the correlation_recommend function 

In [41]:
#grap top 100, 100 arbitrarely, could be larger and compute the expected value
input = x.iloc[3333]
x_without_input = x.drop(index=3333)
input = input.drop("stats.turretKills")
output_most_sim = correlation_most_sim(x.iloc[:300],y,input,methd="pearson",topN=100)
#similar_boyszz = output_most_sim[0] #using only 300 of x for quick testing

IndexError: positional indexers are out-of-bounds

In [32]:
similar_boyszz.head()

Unnamed: 0,championId,spell1Id,spell2Id,timeline.role,timeline.lane,name,partype,tag
233,64,11,4,DUO_SUPPORT,NONE,Lee Sin,Energy,Fighter
188,89,14,4,DUO_SUPPORT,NONE,Leona,Mana,Tank
85,56,4,12,NONE,JUNGLE,Nocturne,Mana,Assassin
169,246,4,11,NONE,JUNGLE,Qiyana,Mana,Assassin
195,163,4,11,NONE,JUNGLE,Taliyah,Mana,Mage


We could use this formula for predicting, but in our case it doesn't make sense to use the distance of the desired value rb,i with the mean of all b. This is due to that each column is its own metric with its own range of values. Not apples to apples

$ pred(a, i) = \hat{r_a} + \frac{\sum_{b \in N} sim(a, b) (r_{b,i} - \hat{r_b})}{\sum_{b \in N} sim(a,b)} $

This is why we will do the following:

$ pred(a, i) = \frac{\sum_{b \in N} sim(a, b) (r_{b,i})}{\sum_{b \in N} sim(a,b)} $

In [37]:
print(output_most_sim[2][0])

IndexError: list index out of range

In [None]:
nom = den = 0
for i in range(100):
    nom = output_most_sim[1][0] * similar_boyszz

### KMEANS--------------------------------------

We can also try and look for groups using kmeans, if we find that it is classifying some label in the y dataset we can use it directly as a classifier

In [11]:
#for testing purposes, lets try
kmeanssz = kmeans_mod(6,x)

In [13]:
#it is classifying as type 3
kmeanssz.predict(x.iloc[34994])

array([0], dtype=int32)

Lets see if it can classify something as we have labeled 

In [14]:
def kmeans_precision_testing(x,y,labels,Nevals=0):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42) #Keep 25% of the data as the test set
    precision = 0
    
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    y_test = y_test.reset_index()[:Nevals]
    
    nfeatures = len(y[labels].value_counts().index)
    kmean_modelu = kmeans_mod(nfeatures,X_train)
    
    if Nevals == 0:
        Nevals = len(X_test)

    predictions = []
    for idx in range(len(y_test)):  
        #print(format(idx)+"/"+format(Nevals)) 
        try:
            predictions.append(int(kmean_modelu.predict(X_test.iloc[idx])))
        except:
            None
    y_test["classified_as"] = predictions
    
    results_cols = []
    for indexito in y[labels].value_counts().index:
        results_cols.append(indexito[0])

    results = pd.DataFrame(columns=results_cols)

    valcounts = []
    for label_value in y[labels].value_counts().index:
        value_counts = y_test[y_test[labels[0]]==label_value[0]]["classified_as"].value_counts()
        kmean_groups = value_counts.index.values
        
        for grp_val in kmean_groups:
            results.at[grp_val,label_value[0]] = value_counts[grp_val]

    results = results.sort_index()
    return results

As a test let's start with "timeline.lane"

In [15]:
kmeans_precision_testing(x,y,["timeline.lane"],50)

Unnamed: 0,NONE,BOTTOM,MIDDLE,JUNGLE,TOP
0,,6.0,1,,
1,2.0,3.0,3,1.0,1.0
2,,,1,3.0,2.0
3,,4.0,1,2.0,2.0
4,15.0,2.0,1,,


We see that group 2 is related to "lane" marked as NONE and BOTTOM, but those two seem to be equally marked as group 3. "lane" is not how this kmeans is grouping

Let's try a bunch of collumns and more iterations of testing

In [27]:
#haven't used championId or spells because it takes too much time to compute
columns = ["timeline.role","timeline.lane"]
for colu in columns:
    print(colu)
    df = kmeans_precision_testing(x,y,[colu],1000)
    print(df)
    for col in df.columns:
        print("------")
        print(str(col))
        print("------")
        print("max: "+format(df[col].max()))
        print("median: "+format(df[col].median()))
        print("mean: "+format(df[col].mean()))
        print("std: "+format(df[col].std()))
        print("min: "+format(df[col].min()))
        print("")
    
    print("") #as en enter, for aesthetics ;)

timeline.role
  DUO_SUPPORT SOLO NONE DUO_CARRY DUO
0          88   47   21        23  26
1          71   58   37        18  17
2          74   52   28        27  22
3          79   49   29        27  15
4          67   44   29        31  21
------
DUO_SUPPORT
------
max: 88
median: 74.0
mean: 75.8
std: 8.105553651663778
min: 67

------
SOLO
------
max: 58
median: 49.0
mean: 50.0
std: 5.338539126015656
min: 44

------
NONE
------
max: 37
median: 29.0
mean: 28.8
std: 5.674504383644443
min: 21

------
DUO_CARRY
------
max: 31
median: 27.0
mean: 25.2
std: 4.919349550499537
min: 18

------
DUO
------
max: 26
median: 21.0
mean: 20.2
std: 4.324349662087931
min: 15


timeline.lane
  NONE BOTTOM MIDDLE JUNGLE TOP
0   63     60     34     21  27
1   56     40     39     37  29
2   63     52     20     28  40
3   57     52     33     29  28
4   45     62     31     29  25
------
NONE
------
max: 63
median: 57.0
mean: 56.8
std: 7.362064927722384
min: 45

------
BOTTOM
------
max: 62
median: 52.0


We'll try with other data that we can get about our champions

In [19]:
#We add a new column to y with the type of champion for each entry
import json 
df = pd.read_json('https://ddragon.leagueoflegends.com/cdn/12.8.1/data/en_US/champion.json')

#this is not working
#pd.DataFrame.from_dict(df.iloc[0]["data"])

#i will do it manually
champ_info_df = pd.DataFrame(columns = df["data"].iloc[0].keys())
for idx in range(len(df)):
    aux = []
    for col in champ_info_df.columns:
        aux.append(df.iloc[idx]["data"][col])
        #print(df.iloc[idx]["data"][col])
    champ_info_df.loc[idx] =  aux

#for each entry in y we will add the champ name and the type
for idx in range(len(champ_info_df)):
    indexes = y[y["championId"] == int(champ_info_df.iloc[idx]["key"])].index #positions of the dataframe where the selected champid of champ_info_df lies
    y.loc[indexes,["name"]] = champ_info_df.iloc[idx]["name"]
    y.loc[indexes,["partype"]] = champ_info_df.iloc[idx]["partype"]
    y.loc[indexes,["tag"]] = champ_info_df.iloc[idx]["tags"][0] #we get only the first tag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[indexes,["name"]] = champ_info_df.iloc[idx]["name"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[indexes,["partype"]] = champ_info_df.iloc[idx]["partype"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[indexes,["tag"]] = champ_info_df.iloc[idx]["tags"][0] #we get only the first 

lets try again

In [49]:
y.head()

Unnamed: 0,championId,spell1Id,spell2Id,timeline.role,timeline.lane,name,partype,tag
0,39,12,4,SOLO,TOP,Irelia,Mana,Fighter
1,68,12,4,SOLO,MIDDLE,Rumble,Heat,Fighter
2,350,7,14,DUO_SUPPORT,BOTTOM,Yuumi,Mana,Support
3,81,12,4,DUO_CARRY,BOTTOM,Ezreal,Mana,Marksman
4,113,4,11,NONE,JUNGLE,Sejuani,Mana,Tank


In [21]:
#haven't used championId or spells because it takes too much time to compute
columns = ["partype","tag"]
for colu in columns:
    print(colu)
    df = kmeans_precision_testing(x,y,[colu],1000)
    print(df)
    print("")

partype
   Mana Energy None Grit Fury Rage Blood Well Flow Shield Heat Crimson Rush  \
0   112    NaN  NaN    4  NaN  NaN        NaN  NaN    NaN    1          NaN   
1   145      9    3  NaN    6    3        NaN    1      2    2          NaN   
2    38      1  NaN  NaN    1    1        NaN    2    NaN  NaN          NaN   
3    53    NaN    7    3    3  NaN        NaN    4    NaN  NaN          NaN   
4    50     15    2    2  NaN   10        NaN  NaN    NaN  NaN          NaN   
5    43     12    2    1    1    2        NaN  NaN    NaN  NaN          NaN   
6    23    NaN    1  NaN  NaN  NaN        NaN  NaN      1    1            2   
7   147      5    2    3    4    1          6    6      2    3            4   
8    82      1    1    5    7  NaN          6    4    NaN  NaN          NaN   
9    60      6    4  NaN  NaN  NaN        NaN  NaN      2   10            3   
10   11      1    4  NaN    4  NaN          2  NaN    NaN  NaN          NaN   
11   16    NaN    3    8  NaN  NaN        Na

CONCLUSION: we would hope to see some big values on only one column, big std but it is not the case. We haven't found any relation of the kmeans generated clusters and any class

# ----------------------------------
# OBJECTIVE: Predict ChampionID
# ----------------------------------

###  Lets see what model is the best for this pourpose: knn neighbours, or pca and then knn

In [31]:
#To start we will make sure we are using the same input data
x2 = x[["stats.kills","stats.assists","stats.magicDamageDealt","stats.physicalDamageDealt","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.turretKills","stats.totalMinionsKilled","stats.totalHeal"]]
y2 = y["championId"]

In [32]:
knn_pred = knn_predictor(x2,pd.DataFrame(y["championId"]),6) #as we saw it had more accuracy
knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Test Accuracy : 0.661186805204828


In [33]:
pca_mod = pca_and_knn(4,x2,y["championId"]) #we are reducing to 4 components
pca_mod.knn_accuracy()

Test Accuracy : 0.5455791307505916


Here, we see that knn has better precision.

To compare it to using correlation of entries we have to reduce the dataset size as this is computationally intensive, therefore 

In [34]:
x3 = x2[:300000]
y3 = pd.DataFrame(y2[:300000])

In [35]:
knn_pred = knn_predictor(x3,pd.DataFrame(y3["championId"]),6) #as we saw it had more accuracy
knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Test Accuracy : 0.41329333333333335


In [36]:
pca_mod = pca_and_knn(4,x3,y3["championId"]) #we are reducing to 4 components
pca_mod.knn_accuracy()

Test Accuracy : 0.15861333333333333


In [37]:
testing_precision_corr_rec(x3,y3,50,["championId"])

0/50
1/50
2/50
3/50
4/50
5/50
6/50
7/50
8/50
9/50
10/50
11/50
12/50
13/50
14/50
15/50
16/50
17/50
18/50
19/50
20/50
21/50
22/50
23/50
24/50
25/50
26/50
27/50
28/50
29/50
30/50
31/50
32/50
33/50
34/50
35/50
36/50
37/50
38/50
39/50
40/50
41/50
42/50
43/50
44/50
45/50
46/50
47/50
48/50
49/50


0.02

# ----------------------------------
# OBJECTIVE: Predict spells
# ----------------------------------

### SPELL1--------------------------------------

In [38]:
#To start we will make sure we are using the same input data
x4 = x[["stats.kills","stats.assists","stats.magicDamageDealt","stats.physicalDamageDealt","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.turretKills","stats.totalMinionsKilled","stats.totalHeal"]]
y4 = y["spell1Id"]

In [39]:
knn_pred = knn_predictor(x4,pd.DataFrame(y["spell1Id"]),6) #as we saw it had more accuracy
knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Test Accuracy : 0.676020451592035


In [40]:
pca_mod = pca_and_knn(4,x4,y["spell1Id"]) #we are reducing to 4 components
pca_mod.knn_accuracy()

Test Accuracy : 0.641189188575709


we'll use a smaller dataset for correlation

In [41]:
testing_precision_corr_rec(x4[:200000],y4[:200000],100,["spell1Id"])

0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100


0.0

### SPELL2--------------------------------------

In [42]:
#To start we will make sure we are using the same input data
x5 = x[["stats.kills","stats.assists","stats.magicDamageDealt","stats.physicalDamageDealt","stats.damageDealtToTurrets","stats.totalDamageTaken","stats.turretKills","stats.totalMinionsKilled","stats.totalHeal"]]
y5 = y["spell2Id"]

In [43]:
knn_pred = knn_predictor(x5,pd.DataFrame(y["spell2Id"]),6) #as we saw it had more accuracy
knn_pred.accuracy() #the class already does does train/test splitting of the input dataset

Test Accuracy : 0.7887765930280727


In [44]:
pca_mod = pca_and_knn(4,x5,y["spell1Id"]) #we are reducing to 4 components
pca_mod.knn_accuracy()

Test Accuracy : 0.641189188575709


we'll use a smaller dataset for correlation

In [45]:
testing_precision_corr_rec(x5[:200000],y[:200000],100,["spell2Id"])

0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100


0.0

# ----------------------------------
# OBJECTIVE: Just for fun, predict other classifications and see precision
# ----------------------------------

We'll make a for loop beacuse we don't want this notebook to be any longer

In [46]:
#To start we will make sure we are using the same input data
columns = ["timeline.role","timeline.lane","partype","tag"]
for col in columns:
    print("---------")
    print(col)
    print("---------")
    print("KNN")
    knn_pred = knn_predictor(x5,pd.DataFrame(y[col]),6) #as we saw it had more accuracy
    knn_pred.accuracy() #the class already does does train/test splitting of the input dataset
    print("")
    print("PCA+KNN")
    pca_mod = pca_and_knn(4,x5,y[col]) #we are reducing to 4 components
    pca_mod.knn_accuracy()
    print("")
    print("CORRELATION (on smaller dataset size=5000, test=100)")
    print("Test Accuracy : "+format(testing_precision_corr_rec(x5[:5000],y[:5000],100,[col])))
    print("")

---------
timeline.role
---------
KNN
Test Accuracy : 0.8352267890887011

PCA+KNN
Test Accuracy : 0.7776627076228145

CORRELATION (on smaller dataset size=5000, test=100)
0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100
Test Accuracy : 0.0

---------
timeline.lane
---------
KNN
Test Accuracy : 0.8252166313889945

PCA+KNN
Test Accuracy : 0.7604258289307176

