# SVM model including Type 2 in the Training:
Uses the exact same code as the other file, but includes Type 2 information. Thus this data inputs to the SVM input data of length 86, while the other model takes input data of length 50. Clearly this data, while including more information, is also higher dimensional. Since SVM models are good at handling higher dimensional data, we may be able to find higher accuracy including all of the data. 

The highest accuracy we have from the model excluding type 2 is Accuracy: 0.9128 Weighted F1 Score:  0.9127919425258147. We will try to beat this accuracy now by including type 2 and generation information.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

pokemon = pd.read_csv('pokemon.csv', encoding='latin1')	# Ensures special characters are read correctly
combat = pd.read_csv('combats.csv', encoding='latin1')

# Normalize numeric values
names = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
# Equivalent to using MinMaxScaler()
for col in names:
    pokemon[col] = (pokemon[col] - pokemon[col].min()) / (pokemon[col].max() - pokemon[col].min()) 

# Performs one hot encoding on the 'Type 1' column
pokemon = pd.get_dummies(pokemon, columns=['Type 1', ], dtype=int)
pokemon = pd.get_dummies(pokemon, columns=['Type 2',], dtype=int) # Will treat NaN in type 2 as 0 across all type 2 columns

#Dropping Name because it is not related to the winner
pokemon = pokemon.drop('Name', axis=1)

# To convert 'Legendary' column from boolean to integer type
pokemon['Legendary'] = pokemon['Legendary'].astype(int)

# Sets label
y = combat['Winner']
combat=combat.drop('Winner', axis=1)

In [2]:
# Encode y as true or false. True if the first pokemon wins and false if the second pokemon wins.
for i in range(len(combat)):
    if y[i] == combat['First_pokemon'][i]:
        y[i]=1
    else:
        y[i]=0

In [3]:
# Merge combat data with pokemon data for First_pokemon
cols = pokemon.drop('#', axis=1).columns
merged1 = combat.merge(pokemon, left_on='First_pokemon', right_on='#', how='left')
merged1 = merged1.drop(columns=['#'])
merged1.columns = ['First_pokemon', 'Second_pokemon'] + [col + '1' for col in cols]

# Merge with pokemon data for Second_pokemon
merged2 = merged1.merge(pokemon, left_on='Second_pokemon', right_on='#', how='left')
merged2 = merged2.drop(columns=['#', 'First_pokemon', 'Second_pokemon'])
merged2.columns = merged1.columns.tolist()[2:] + [col + '2' for col in cols]

# Final DataFrame where each row represents a single match between two Pokémon
df = merged2
df.head()

Unnamed: 0,HP1,Attack1,Defense1,Sp. Atk1,Sp. Def1,Speed1,Generation1,Legendary1,Type 1_Bug1,Type 1_Dark1,...,Type 2_Ghost2,Type 2_Grass2,Type 2_Ground2,Type 2_Ice2,Type 2_Normal2,Type 2_Poison2,Type 2_Psychic2,Type 2_Rock2,Type 2_Steel2,Type 2_Water2
0,0.192913,0.318919,0.2,0.190217,0.142857,0.205714,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.354331,0.459459,0.297778,0.434783,0.519048,0.588571,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.212598,0.189189,0.355556,0.380435,0.404762,0.2,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.153543,0.189189,0.155556,0.326087,0.095238,0.085714,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.271654,0.297297,0.533333,0.570652,0.238095,0.285714,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Implementing the SVC Model

### Creating models of each function before hyperparameter tuning to see which kernels should be investigated

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Creating training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Will track the models
models = []
# Model 1: Linear Model
model1 = SVC(kernel='linear')  # You can choose different kernels like 'linear', 'rbf', 'poly', or 'sigmoid'
model1.fit(X_train, y_train)
models.append(model1)

y_pred = model1.predict(X_test)

# Calculating and printing accuracy scores
accuracy_scores = []
accuracy = accuracy_score(y_test, y_pred)
accuracy_scores.append(accuracy)
print(f"Linear Accuracy: {accuracy}")

# Model 2: RBF Model
model2 = SVC(kernel='rbf')
model2.fit(X_train, y_train)
models.append(model2)

y_pred = model2.predict(X_test)

# Calculating and printing accuracy scores
accuracy = accuracy_score(y_test, y_pred)
accuracy_scores.append(accuracy)
print(f"RBF Accuracy: {accuracy}")

# Model 3: Poly Model
model3 = SVC(kernel='poly')
model3.fit(X_train, y_train)
models.append(model3)

y_pred = model3.predict(X_test)

# Calculating and printing accuracy scores
accuracy = accuracy_score(y_test, y_pred)
accuracy_scores.append(accuracy)
print(f"Poly Accuracy: {accuracy}")

# Model 4: Sigmoid Model
model4 = SVC(kernel='sigmoid')
model4.fit(X_train, y_train)
models.append(model4)

y_pred = model4.predict(X_test)

# Calculating and printing accuracy scores
accuracy = accuracy_score(y_test, y_pred)
accuracy_scores.append(accuracy)
print(f"Sigmoid Accuracy: {accuracy}")

Linear Accuracy: 0.9112
RBF Accuracy: 0.9032
Poly Accuracy: 0.8934
Sigmoid Accuracy: 0.54


Performance in all kernels is worse than they were when not including Type 2 and generation. I will do hyperparameter tuning with kernel=linear, since that has the best performance. 
## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42) # initializing train/test groups

param_grid = {
    'kernel': ['linear'],
    'C': [0.1, 1,50, 100, 150], # only hyperparameter for linear
}
grid = GridSearchCV(SVC(), param_grid, scoring='accuracy', cv=3, verbose=3)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best accuracy:", grid.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.905 total time= 1.0min
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.900 total time=  57.6s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.897 total time=  58.2s
[CV 1/3] END ................C=1, kernel=linear;, score=0.909 total time= 1.2min
[CV 2/3] END ................C=1, kernel=linear;, score=0.908 total time= 1.1min
[CV 3/3] END ................C=1, kernel=linear;, score=0.903 total time= 1.0min
[CV 1/3] END ...............C=50, kernel=linear;, score=0.910 total time=10.5min
[CV 2/3] END ...............C=50, kernel=linear;, score=0.908 total time=10.4min
[CV 3/3] END ...............C=50, kernel=linear;, score=0.904 total time=10.4min
[CV 1/3] END ..............C=100, kernel=linear;, score=0.910 total time=19.6min
[CV 2/3] END ..............C=100, kernel=linear;, score=0.908 total time=19.5min
[CV 3/3] END ..............C=100, kernel=linear;,

In [None]:
from sklearn.metrics import accuracy_score, f1_score

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test) # testing best model on testing data

# Calculating and printing accuracy scores
accuracy = accuracy_score(y_test, y_pred)
print(f"Grid Search Best Model Accuracy on Test Data: {accuracy}")
r2 = f1_score(y_test, y_pred)
print(f"R^2 Score: {r2}")

Grid Search Best Model Accuracy on Test Data: 0.9116
R^2 Score: 0.9066525871172122


## Results
Although this model gets close to the accuracy and R2 of the best model not including Type 2, it is still slightly lower. In addition, it takes significantly longer to train due to the increasesd dimesionality of the data. Thus, the SVM model we will use to compare to our other models will be the model excluding Type 2 and Generation as input. 