# Valorant Victory Prediction

## Data Pre-processing

In [5]:
# Import libraries
import numpy as np
import pandas as pd
#For plotting
import matplotlib.pyplot as plt
import seaborn as sns
# For clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

## Pre-process match stats

### Dataset overview

The match dataset contains the map name, team ids and names, score, wins and losses during attack and defense, players, and team-composition

In [8]:
s_results = pd.read_csv('./Data files/val-series.csv')
m_results = pd.read_csv('./Data files/val-matches.csv')

In [9]:
m_results['Winner'] = np.where(m_results['Team 1 Score'] > m_results['Team 2 Score'], 1, 0)
# Sort rows by match id
m_results = m_results.sort_values(by=['Series Id'], ascending=True)
m_results = m_results.drop([
    'Match Id',
    'Series Id',
    'Event Name',
    'Win Condition',
    'Date',
    'Total Count',
    'Team 1 Id',
    'Team 2 Id',
    #'Team 1 Player Ids',
    #'Team 2 Player Ids',
    'Team 1 Score',
    'Team 2 Score',
    'Team 1 Score At Half',
    'Team 2 Score At Half', 
    "Team 1 Pistol Wins",
    "Team 1 Attacking Pistol Wins",
    "Team 1 Defending Pistol Wins",
    "Team 1 Attacking Wins",
    "Team 1 Attacking Losses",
    "Team 1 Defending Wins",
    "Team 1 Defending Losses",
    "Team 2 Pistol Wins",
    "Team 2 Attacking Pistol Wins",
    "Team 2 Defending Pistol Wins",
    "Team 2 Attacking Wins",
    "Team 2 Attacking Losses",
    "Team 2 Defending Wins",
    "Team 2 Defending Losses"
    ], axis=1)

#export df to csv
m_results.to_csv('./Data files/val-matches-cleaned.csv', index=False)

m_results.head(10)

Unnamed: 0,Map Name,Team 1 Name,Team 2 Name,Attacking First Team Number,Team 1 Player Ids,Team 2 Player Ids,Team 1 Agents,Team 2 Agents,Winner
1543,Breeze,Team Liquid,Gambit Esports,2,"[1403,17788,1690,2056,10211]","[1964,1081,3135,2528,274]","Sova,Viper,Skye,KAY/O,Chamber","Sova,Viper,Jett,Skye,Chamber",0
1544,Icebox,Team Liquid,Gambit Esports,2,"[2056,17788,1403,1690,10211]","[1964,274,1081,3135,2528]","Sova,Viper,Sage,Reyna,Chamber","Sova,Viper,Sage,Jett,KAY/O",0
1540,Split,G2 Esports (Inactive),FNATIC,1,"[1826,2377,2594,2769,200]","[2937,154,2290,76,2011]","Raze,Cypher,Sage,Omen,Jett","Raze,Viper,Sage,Astra,KAY/O",0
1542,Icebox,G2 Esports (Inactive),FNATIC,2,"[1826,2377,2769,2594,200]","[2290,154,76,2011,2937]","Sova,Killjoy,Viper,Sage,Jett","Sova,Killjoy,Viper,Sage,Jett",0
1541,Ascent,G2 Esports (Inactive),FNATIC,1,"[1826,2377,200,2769,2594]","[2290,154,2937,76,2011]","Sova,Killjoy,Jett,Astra,KAY/O","Sova,Killjoy,Jett,Astra,KAY/O",1
1533,Split,FunPlus Phoenix,BIG,1,"[1256,2225,3043,2335,2359]","[1538,3142,2326,360,4956]","Raze,Cypher,Sova,Viper,Astra","Cypher,Sage,Jett,Skye,Astra",1
1535,Bind,FunPlus Phoenix,BIG,2,"[1256,3043,2335,2225,2359]","[1538,3142,2326,360,4956]","Raze,Sova,Viper,Skye,Astra","Raze,Sova,Viper,Skye,Astra",0
1534,Haven,FunPlus Phoenix,BIG,1,"[2359,2225,3043,2335,1256]","[3142,360,2326,4956,1538]","Sova,Killjoy,Jett,Astra,KAY/O","Sova,Killjoy,Jett,Astra,KAY/O",1
1532,Ascent,SuperMassive,Acend,1,"[9642,2969,3725,9640,3233]","[1334,3208,732,691,2391]","Sova,Killjoy,Sage,Jett,Astra","Sova,Killjoy,Jett,Astra,KAY/O",0
1530,Bind,SuperMassive,Acend,2,"[9640,9642,2969,3725,3233]","[2391,3208,1334,732,691]","Raze,Sova,Viper,Skye,Astra","Raze,Viper,Brimstone,Sage,Skye",0


In [50]:
  


def agents_to_list(results, team_number):
    list_of_agent_lists = []
    for index, row in results.iterrows():
        agents = row['Team ' + team_number + ' Agents'].split(',')
        for x in range(len(agents)):
            agents[x] = agents[x] + '_' + team_number
        list_of_agent_lists.append(agents)
    return list_of_agent_lists



In [51]:
cat_encoder = OneHotEncoder()# instantiate a class
# code the feature called default which takes yes, no value
results_encoded = pd.get_dummies(m_results, columns=[
    'Map Name'
])

In [53]:

#m_results['Team 1 Player Ids'] = player_id_list(m_results, '1')
#m_results['Team 2 Player Ids'] = player_id_list(m_results, '2')
results_encoded['Team 1 Agents'] = agents_to_list(m_results,'1')
results_encoded['Team 2 Agents'] = agents_to_list(m_results,'2')
print(results_encoded.columns.tolist())

['Team 1 Name', 'Team 2 Name', 'Attacking First Team Number', 'Team 1 Agents', 'Team 2 Agents', 'Winner', 'Map Name_Ascent', 'Map Name_Bind', 'Map Name_Breeze', 'Map Name_Fracture', 'Map Name_Haven', 'Map Name_Icebox', 'Map Name_Pearl', 'Map Name_Split']


In [54]:
mlb = MultiLabelBinarizer()
results_encoded = results_encoded.join(pd.DataFrame(mlb.fit_transform(results_encoded.pop('Team 1 Agents')),
                          columns=mlb.classes_,
                          index=m_results.index))
results_encoded = results_encoded.join(pd.DataFrame(mlb.fit_transform(results_encoded.pop('Team 2 Agents')),
                          columns=mlb.classes_,
                          index=m_results.index))

In [55]:
# Number of records having team 1 as the victor
team1 = results_encoded[results_encoded.Winner==1].shape[0]
# Number of records having team 2 as the victor
team2 = results_encoded[results_encoded.Winner==0].shape[0]

print(f'There is {team1} examples with team 1 winning, which is {round(team1/results_encoded.shape[0],2)*100}%')
print(f'There is {team2} examples with team 2 winning, which is {round(team2/results_encoded.shape[0],2)*100}%')

There is 847 examples with team 1 winning, which is 53.0%
There is 741 examples with team 2 winning, which is 47.0%


### Pre-process player stats

### Pre-process team stats

In [56]:
team_attack_speed = pd.read_csv('./Data files/team-attack-speed.csv')
team_comp = pd.read_csv('./Data files/team-comp.csv')
team_map_performance = pd.read_csv('./Data files/team-map-performance.csv')
team_post_plants = pd.read_csv('./Data files/team-post-plants.csv')
team_round_performance = pd.read_csv('./Data files/team-round-performance.csv')
team_win_conditions = pd.read_csv('./Data files/team-win-conditions.csv')
team_xvy_performance = pd.read_csv('./Data files/team-xvy-performance.csv')

#df_team = pd.merge(team_map_performance, team_comp, on='Team')
df_team = pd.merge(team_round_performance, team_post_plants, on='Team')
#df_team = pd.merge(df_team, team_round_performance, on='Team')
#df_team = pd.merge(df_team, team_win_conditions, on='Team')
#df_team = pd.merge(df_team, team_xvy_performance, on='Team')




In [57]:
# Create a dictionary of teams and their elo
team_elo_dict = {}
for index, row in df_team.iterrows():
    if row['Team'] not in team_elo_dict:
        team_elo_dict[row['Team']] = 1000
    
# Modify elo based on win rate kd and acs
for index, row in results_encoded.iterrows():
    if row['Winner'] == 1:
        team_elo_dict[row['Team 1 Name']] += 100
        team_elo_dict[row['Team 2 Name']] -= 100
    else:
        team_elo_dict[row['Team 1 Name']] -= 100
        team_elo_dict[row['Team 2 Name']] += 100 

## Prepare features for testing: previous match data ONLY

In [59]:
Y = results_encoded['Winner']
X = results_encoded.drop("Winner", axis=1)
X.head(3)

Unnamed: 0,Team 1 Name,Team 2 Name,Attacking First Team Number,Map Name_Ascent,Map Name_Bind,Map Name_Breeze,Map Name_Fracture,Map Name_Haven,Map Name_Icebox,Map Name_Pearl,...,Neon_2,Omen_2,Phoenix_2,Raze_2,Reyna_2,Sage_2,Skye_2,Sova_2,Viper_2,Yoru_2
1543,Team Liquid,Gambit Esports,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1544,Team Liquid,Gambit Esports,2,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,1,0
1540,G2 Esports (Inactive),FNATIC,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0


In [60]:
#Import necessary library
from sklearn import preprocessing
#Normalize the data
normalized_X= preprocessing.normalize(X)
#Check normalization
print(normalized_X)

ValueError: could not convert string to float: 'Team Liquid'

In [61]:
from sklearn.model_selection import train_test_split
#Split testing into 20%
#Set shuffle to false to make sure past data is not used for testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1904703, shuffle=False)

In [62]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

#Create an instance of RandomForestClassifier
rf=RandomForestClassifier(random_state=1904703, n_estimators=100)
gb= GradientBoostingClassifier(random_state=1904703, n_estimators=100)
dt= DecisionTreeClassifier(random_state=1904703)
et= ExtraTreeClassifier(random_state=1904703)
lr= LogisticRegression(random_state=1904703)
svc= SVC(random_state=1904703)

#Fit model with training data
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
dt.fit(X_train, y_train)
et.fit(X_train, y_train)
lr.fit(X_train, y_train)
svc.fit(X_train, y_train)

ValueError: could not convert string to float: 'Team Liquid'

In [63]:
# Prediction
rf_predictions = rf.predict(X_test)
gb_predictions = gb.predict(X_test)
dt_predictions = dt.predict(X_test)
et_predictions = et.predict(X_test)
lr_predictions = lr.predict(X_test)
svc_predictions = svc.predict(X_test)
#Results
accuracy_rf = accuracy_score(y_test, rf_predictions)
accuracy_gb = accuracy_score(y_test, gb_predictions)
accuracy_dt = accuracy_score(y_test, dt_predictions)
accuracy_et = accuracy_score(y_test, et_predictions)
accuracy_lr = accuracy_score(y_test, lr_predictions)
accuracy_svc = accuracy_score(y_test, svc_predictions)
print(f'The overall accuracy of RF is {np.round(accuracy_rf*100,2)}%')
print(f'The overall accuracy of GB is {np.round(accuracy_gb*100,2)}%')
print(f'The overall accuracy of DT is {np.round(accuracy_dt*100,2)}%')
print(f'The overall accuracy of ET is {np.round(accuracy_et*100,2)}%')
print(f'The overall accuracy of LR is {np.round(accuracy_lr*100,2)}%')
print(f'The overall accuracy of SVC is {np.round(accuracy_svc*100,2)}%')

ValueError: could not convert string to float: 'LOUD'

## Prepare features for testing: With Team ELO

In [74]:
for index, row in results_encoded.iterrows():
    #replace team names with elo
    # if value is a string, replace it with the elo
    if isinstance(row['Team 1 Name'], str):
        results_encoded.at[index, 'Team 1 Name'] = team_elo_dict[row['Team 1 Name']]
        print(results_encoded.loc[index, 'Team 1 Name'])
        results_encoded.at[index, 'Team 2 Name'] = team_elo_dict[row['Team 2 Name']]



In [75]:
Y = results_encoded['Winner']
X = results_encoded.drop("Winner", axis=1)
X.head(3)

Unnamed: 0,Team 1 Name,Team 2 Name,Attacking First Team Number,Map Name_Ascent,Map Name_Bind,Map Name_Breeze,Map Name_Fracture,Map Name_Haven,Map Name_Icebox,Map Name_Pearl,...,Neon_2,Omen_2,Phoenix_2,Raze_2,Reyna_2,Sage_2,Skye_2,Sova_2,Viper_2,Yoru_2
1543,1900,1100,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
1544,1900,1100,2,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,1,0
1540,1500,2500,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0


In [76]:
normalized_X= preprocessing.normalize(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1904703, shuffle=False)

#Fit model with training data
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
dt.fit(X_train, y_train)
et.fit(X_train, y_train)
lr.fit(X_train, y_train)
svc.fit(X_train, y_train)


# Prediction
rf_predictions = rf.predict(X_test)
gb_predictions = gb.predict(X_test)
dt_predictions = dt.predict(X_test)
et_predictions = et.predict(X_test)
lr_predictions = lr.predict(X_test)
svc_predictions = svc.predict(X_test)
#Results
accuracy_rf = accuracy_score(y_test, rf_predictions)
accuracy_gb = accuracy_score(y_test, gb_predictions)
accuracy_dt = accuracy_score(y_test, dt_predictions)
accuracy_et = accuracy_score(y_test, et_predictions)
accuracy_lr = accuracy_score(y_test, lr_predictions)
accuracy_svc = accuracy_score(y_test, svc_predictions)
print(f'The overall accuracy of RF is {np.round(accuracy_rf*100,2)}%')
print(f'The overall accuracy of GB is {np.round(accuracy_gb*100,2)}%')
print(f'The overall accuracy of DT is {np.round(accuracy_dt*100,2)}%')
print(f'The overall accuracy of ET is {np.round(accuracy_et*100,2)}%')
print(f'The overall accuracy of LR is {np.round(accuracy_lr*100,2)}%')
print(f'The overall accuracy of SVC is {np.round(accuracy_svc*100,2)}%')

The overall accuracy of RF is 58.81%
The overall accuracy of GB is 65.41%
The overall accuracy of DT is 58.18%
The overall accuracy of ET is 52.52%
The overall accuracy of LR is 66.35%
The overall accuracy of SVC is 66.98%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
