In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV  
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
dataframe = pd.read_csv("types_stats_data.csv")

To be able to use the categoric data of Pokemon type we're going to need to utilise One Hot Encoding to split up the eightenn possible categeories into boolen values. Though there are methods to do this automatically, due to their being null values present in the type2 column and because we are combining the data from both columns in the same step, I have chosen to do this manualy.

In [None]:
type1_uniques = dataframe["type1"].unique().tolist()
type2_uniques = dataframe["type2"].unique().tolist()

unique_types = type1_uniques + type2_uniques
unique_types = list(set(unique_types)) 

for pokemon_type in unique_types:
    if type(pokemon_type) == float: continue # Deal with null entry
    dataframe[f"type_{pokemon_type.lower()}"] = False

for index, row in dataframe.iterrows():
    if row['type1'] in unique_types:
        dataframe.loc[index, f"type_{row['type1'].lower()}"] = True

    if type(row['type2']) == float or row['type2'] not in unique_types: continue

    dataframe.loc[index, f"type_{row['type2'].lower()}"] = True

dataframe = dataframe.drop('type1', axis=1)
dataframe = dataframe.drop('type2', axis=1)

Next we are going to take the new columns and use them as out dependent (input) values, and combine them with the stats data which will act as the independent (predictor) values. We'll then split the data up into train data and test data, so that we'll have some way to validate the created model afterwards.

In [4]:
x = dataframe.loc[:, ~dataframe.columns.isin(["hp","defense","attack","speed","sp_defense","sp_attack"])]
y = dataframe[["hp","defense","attack","speed","sp_defense","sp_attack"]]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

After fitting out training data to a linear regression model we move on to validate it using the test data. By fitting this test data to our model we get some predicted values which we can compare to the actual values that the x_test data lines up with. Shown below we can see that out r2 score is incredibly low at less than 0.1, which means that model is incredibly innaccurate at predicting results to the point that the there may be very little correlation between the inputs and outputs. Similarly the mean squared error and root mean squared error are very high, which one again suggest the model is very innacurate.

In [5]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(x_train, y_train)

y_pred = lin_reg_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 1/2

print(f"R2: {r2:.4f}\nMSE: {mse:.4f}\nRMSE {rmse:.4f}")

R2: 0.0948
MSE: 711.2997
RMSE 355.6498
