In [None]:
# Load necessary packages

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import secrets                              # imports secure module.

# from google.colab import files

%matplotlib inline

In [None]:
# uploaded = files.upload()

In [557]:
# Read the training and validation sets from CSV files in the working
# directory, then preview the first training rows.

train = pd.read_csv('trainingData.csv')
valid = pd.read_csv('validationData.csv')
train.head()

Unnamed: 0,deck,nofGames,nOfPlayers,winRate
0,archers;arrows;baby-dragon;balloon;bats;fireba...,44,2,0.58511
1,archers;arrows;baby-dragon;balloon;bomber;free...,143,3,0.53767
2,archers;arrows;baby-dragon;balloon;bomber;gian...,61,3,0.34375
3,archers;arrows;baby-dragon;balloon;cannon;free...,162,1,0.49394
4,archers;arrows;baby-dragon;balloon;electro-wiz...,57,1,0.55833


In [None]:
# Preview the first rows of the validation set.
valid.head()

In [558]:
# Helper functions to preprocess data to bag-of-cards format

def unnest(df, col):
    """Expand a list-valued column so that each list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` holds list-like values.
    col : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the original ``col``, joined on the index with the
        expanded values (stored under the same name ``col``). A source row
        is repeated once per element of its list.
    """
    # One column per list position, one row per original row.
    expanded = df.apply(lambda record: pd.Series(record[col]), axis=1)
    # Collapse the positional columns into a long Series and discard the
    # positional index level so only the original row label remains.
    long_values = expanded.stack().reset_index(level=1, drop=True)
    long_values.name = col
    remaining = df.drop(col, axis=1)
    return remaining.join(long_values)

def to_bag_of_cards(df):
    """Append a 0/1 bag-of-cards encoding of the ``deck`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``deck`` column of semicolon-separated card names.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` re-indexed by ``ind`` (1-based row number) with one
        additional integer column per distinct card, in sorted card order,
        holding 1 when the card is in the row's deck and 0 otherwise.
    """
    # Work on a copy: adding the index and encoding the deck must not leak
    # an ``ind`` column or a list-valued ``deck`` into the caller's frame.
    indexed = df.copy()
    indexed.index = pd.Index(np.arange(df.shape[0]) + 1, name='ind')
    # One indicator column per distinct card name found in any deck.
    bag = indexed['deck'].str.get_dummies(sep=';').astype('int')
    return pd.concat([indexed, bag], axis=1)

In [None]:
# Convert both data sets to the bag-of-cards representation and preview the
# first training rows only (displaying the whole frame floods the output).
# NOTE: this cell rebinds `train` / `valid`, so it is meant to be run once
# per kernel session, right after loading the raw data.
train = to_bag_of_cards(train)
valid = to_bag_of_cards(valid)
train.head()

In [None]:
# Preview the first rows of the converted validation set.
valid.head()

In [None]:
# Sort data by number of games played (most-played decks first).
# NOTE(review): train_s / valid_s are not referenced again in the visible
# cells; the model-fitting cells slice the unsorted `train` instead —
# confirm whether the sorted frames were meant to be used there.

train_s = train.sort_values('nofGames', ascending=False)
valid_s = valid.sort_values('nofGames', ascending=False)

In [None]:
# Specify example model fitting function and R squared metric

from sklearn.svm import SVR

def R2(x, y):
    """Coefficient of determination of predictions ``x`` against targets ``y``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    differences between ``x`` and ``y`` and ``SS_tot`` is the sum of squared
    deviations of ``y`` from its mean.
    """
    residual_ss = np.sum(np.square(x - y))
    total_ss = np.sum(np.square(y - np.mean(y)))
    return 1 - residual_ss / total_ss

def fit_svm(data):
    """Fit an RBF-kernel SVR that predicts ``winRate``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``deck``, ``nofGames``, ``nOfPlayers`` and ``winRate``;
        every other column is passed to the model as a feature.

    Returns
    -------
    sklearn.svm.SVR
        The fitted estimator (gamma=1/90, C=1.0, epsilon=0.02, no shrinking).
    """
    non_feature_cols = ['deck', 'nofGames', 'nOfPlayers', 'winRate']
    features = data.drop(non_feature_cols, axis=1)
    target = data['winRate']
    model = SVR(kernel='rbf', gamma=1.0/90, C=1.0, epsilon=0.02, shrinking=False)
    model.fit(features, target)
    return model

# Training-set sizes to evaluate: 600, 700, ..., 1500.
sizes = np.arange(600, 1600, 100)

In [None]:
# Fit one model per training-set size (first `size` rows of `train`) and
# predict the validation set with each fitted model.

fit_list = [fit_svm(train.iloc[:size]) for size in sizes]
pred_list = [
    fit.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    for fit in fit_list
]

In [None]:
from sklearn.svm import NuSVR

# Fit a NuSVR on the full training set, weighting each deck by its relative
# number of games, and report validation R squared and the number of support
# vectors.
nuSVM = NuSVR(kernel='rbf', gamma=1.0/90, C=1.0, nu=0.01, shrinking=False)
nuSVM.fit(train.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), train['winRate'], sample_weight=train['nofGames']/np.max(train['nofGames']))
preds = nuSVM.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
print(R2(preds, valid['winRate']))
print(len(nuSVM.support_))

# Refit using only the rows that were support vectors of the first model.
# `support_` holds positional (0-based) indices into the fitted data, whereas
# `train` is labelled by `ind` starting at 1, so the rows must be selected by
# position (`iloc`); label-based `loc` would pick the wrong rows and fail on 0.
train2 = train.iloc[nuSVM.support_]
nuSVM2 = NuSVR(kernel='rbf', gamma=1.0/90, C=1.0, nu=0.01, shrinking=False)
nuSVM2.fit(train2.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1), train2['winRate'], sample_weight=train2['nofGames']/np.max(train2['nofGames']))
preds2 = nuSVM2.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
print(R2(preds2, valid['winRate']))
print(len(nuSVM2.support_))

In [None]:
# Sanity check: largest normalised sample weight (nofGames / max nofGames).
np.max(train['nofGames']/np.max(train['nofGames']))


In [None]:
# Validation R squared for the predictions of each fitted model

r2 = [R2(p, valid['winRate']) for p in pred_list]
r2

In [None]:
# Plot validation R squared against training-set size; assigning the result
# to `_` keeps the returned line objects out of the cell output.
_ = plt.plot(sizes, r2)

In [None]:
# Mean validation R squared across all training-set sizes.
np.mean(r2)

In [None]:
# Save hyperparameters and selected indices in submission format: one line
# per training-set size, three ';'-separated literals followed by the
# comma-separated row labels. The literals match the epsilon, C and gamma
# values used in fit_svm.
# The file is opened in 'w' mode so that re-running the cell rewrites it
# instead of appending a duplicate set of lines.

with open('example_sub_python.txt', 'w') as f:
    for size in sizes:
        ind_text = ','.join(map(str, train.index.values[:size]))
        text = ';'.join(['0.02', '1.0', str(1.0 / 90), ind_text])
        f.write(text + '\n')

In [None]:
def sim(x, y):
    """Similarity of two Series: 16 minus the number of differing positions."""
    mismatches = x.ne(y).sum()
    return 16 - mismatches

In [None]:
def differ(x, y):
    """Return the number of positions at which the two Series differ."""
    mismatch_mask = x.ne(y)
    return mismatch_mask.sum()

def distance(data, n_row, n_rand):
    """Tabulate difference counts between each row and randomly sampled rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to compare; all columns take part in the comparison.
    n_row : int
        Only the first ``n_row`` rows are scored and sampled from.
    n_rand : int
        Number of rows drawn (without replacement, from the first ``n_row``
        rows) for every scored row.

    Returns
    -------
    pandas.DataFrame
        Indexed like the first ``n_row`` rows of ``data`` with columns
        ``0 .. n_rand-1``; each cell is ``differ(row, sampled_row)``.

    Notes
    -----
    Sampling uses ``secrets.SystemRandom``, so results are not reproducible
    between runs. A row may be drawn as its own comparison partner.
    """
    head_index = data.iloc[:n_row].index
    dist = pd.DataFrame(index=head_index, columns=np.arange(n_rand))
    rng = secrets.SystemRandom()
    candidates = head_index.tolist()

    for row_pos in range(len(candidates)):
        sampled_labels = rng.sample(candidates, n_rand)
        for col_pos, label in enumerate(sampled_labels):
            dist.iloc[row_pos, col_pos] = differ(data.iloc[row_pos, :], data.loc[label])

    return dist

In [None]:

def distance2(data, n_row, n_rand):
    """Score each row by its combined dissimilarity to randomly sampled rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``winRate`` column; all other columns are compared
        with ``differ``.
    n_row : int
        Only the first ``n_row`` rows are scored and sampled from.
    n_rand : int
        Number of rows drawn (without replacement, from the first ``n_row``
        rows) for every scored row.

    Returns
    -------
    pandas.DataFrame
        Indexed like the first ``n_row`` rows of ``data`` with a single
        column ``"sum"`` holding
        ``(100 * sum|winRate diff| + sum card diffs) / (n_rand + 1)``.

    Notes
    -----
    Sampling uses ``secrets.SystemRandom``, so results are not reproducible
    between runs.
    """
    head_index = data.iloc[:n_row].index
    dist = pd.DataFrame(index=head_index, columns=["sum"])  # result table
    rng = secrets.SystemRandom()
    candidates = head_index.tolist()  # labels present in the first n_row rows
    win = data["winRate"]  # winRate column
    cards = data.drop(["winRate"], axis=1)

    for row_pos in range(len(candidates)):
        # Draw n_rand rows to measure this row against.
        sampled_labels = rng.sample(candidates, n_rand)
        card_gap = 0
        win_gap = 0
        for label in sampled_labels:
            # Difference in cards between the two decks.
            card_gap += differ(cards.iloc[row_pos, :], cards.loc[label])
            # Difference in winRate.
            win_gap += abs(win.iloc[row_pos] - win.loc[label])
        # Combined average "dissimilarity" from the random decks.
        dist.iloc[row_pos, 0] = (win_gap * 100 + card_gap) / (n_rand + 1)

    return dist

In [None]:
def similar(data, n_row, n_rand):
    """Score each row by its combined similarity to randomly sampled rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``winRate`` column; all other columns are compared
        with ``sim``.
    n_row : int
        Only the first ``n_row`` rows are scored and sampled from.
    n_rand : int
        Number of rows drawn (without replacement, from the first ``n_row``
        rows) for every scored row.

    Returns
    -------
    pandas.DataFrame
        Indexed like the first ``n_row`` rows of ``data`` with a single
        column ``"simi"`` holding
        ``(10 / sum|winRate diff| + sum of sim values) / (n_rand + 1)``.

    Notes
    -----
    Sampling uses ``secrets.SystemRandom``, so results are not reproducible
    between runs.
    NOTE(review): the score divides by the accumulated winRate difference,
    which is zero when every sampled row has the same winRate as the scored
    row — confirm whether that case needs handling.
    """
    head_index = data.iloc[:n_row].index
    blis = pd.DataFrame(index=head_index, columns=["simi"])  # result table
    rng = secrets.SystemRandom()
    candidates = head_index.tolist()  # labels present in the first n_row rows
    win = data["winRate"]  # winRate column
    cards = data.drop(["winRate"], axis=1)

    for row_pos in range(len(candidates)):
        # Draw n_rand rows to measure this row against.
        sampled_labels = rng.sample(candidates, n_rand)
        closeness = 0
        win_gap = 0
        for label in sampled_labels:
            # Card-level similarity between the two decks.
            closeness += sim(cards.iloc[row_pos, :], cards.loc[label])
            # Difference in winRate.
            win_gap += abs(win.iloc[row_pos] - win.loc[label])
        # Combined average "similarity" to the random decks.
        blis.iloc[row_pos, 0] = ((1 / win_gap) * 10 + closeness) / (n_rand + 1)

    return blis
    

In [562]:
# Order decks by games played (most first) and drop the deck string and the
# game/player counts.
sort_games = (
    train
    .sort_values('nofGames', ascending=False)
    .drop(columns=['deck', 'nofGames', 'nOfPlayers'])
)


In [563]:
# Similarity score for the 5000 most-played decks, each measured against 100
# decks sampled at random from those same 5000.
sims=similar(sort_games,5000,100)

In [564]:
# Rank decks from highest to lowest similarity score.
sims_sort=sims.sort_values("simi",ascending=False)

In [565]:
# Row labels of the 4000 decks with the highest similarity score.
podobne_ind=sims_sort.iloc[:4000].index.tolist()

In [566]:
# Dissimilarity score for the first 3000 of the selected decks (in similarity
# order), each measured against 100 decks sampled from those same 3000.
different=distance2(sort_games.loc[podobne_ind],3000,100)

In [None]:
# Attach the dissimilarity score to `train`, aligned on the row label; decks
# that were not scored get NaN.
# NOTE(review): this adds a column to `train` in place, so cells that build
# features from `train` will see `sum` if they are run after this one.
train["sum"]=different["sum"]

In [None]:
# Order decks by dissimilarity score (highest first; unscored rows sort last)
# and remove the helper column. `axis=1` is required: without it `drop`
# looks for a row labelled 'sum' and raises KeyError.
posort=train.sort_values('sum', ascending=False).drop(['sum'], axis=1)

In [None]:

# Fit one model per training-set size on the re-ordered decks and predict
# the validation set with each fitted model.
fit_list = [fit_svm(posort.iloc[:size]) for size in sizes]
pred_list = [
    fit.predict(valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1))
    for fit in fit_list
]

In [None]:
# Validation R squared for the predictions of each fitted model
r2 = [R2(p, valid['winRate']) for p in pred_list]
r2

In [None]:
# Plot validation R squared against training-set size for the re-ordered
# decks; assigning to `_` keeps the line objects out of the cell output.
_ = plt.plot(sizes, r2)

In [None]:
# Mean validation R squared across all training-set sizes.
np.mean(r2)