# Import data and libraries 

In [1]:
# libraries for data handling
import numpy as np
import pandas as pd

# libraries for modelling 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier


Source of data: https://www.kaggle.com/datasets/abcsds/pokemon

In [2]:
# import data
pokemon = pd.read_csv("Pokemon.csv", index_col = "#")

# Overview 

In [3]:
pokemon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 721
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        800 non-null    object
 1   Type 1      800 non-null    object
 2   Type 2      414 non-null    object
 3   Total       800 non-null    int64 
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int64 
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int64(8), object(3)
memory usage: 75.8+ KB


In [4]:
pokemon.head()

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [5]:
pokemon.describe()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


# Skewed data 

In [6]:
print("There are", sum(pokemon["Legendary"]== True), "legendary Pokemon in the datatset.")
print("This is", (sum(pokemon["Legendary"]== True)/pokemon.shape[0])*100, "% of the dataset.")

There are 65 legendary Pokemon in the datatset.
This is 8.125 % of the dataset.


In [7]:
pd.pivot_table(pokemon,index=["Generation"])

Unnamed: 0_level_0,Attack,Defense,HP,Legendary,Sp. Atk,Sp. Def,Speed,Total
Generation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,76.638554,70.861446,65.819277,0.036145,71.819277,69.090361,72.584337,426.813253
2,72.028302,73.386792,71.207547,0.04717,65.943396,73.90566,61.811321,418.283019
3,81.625,74.1,66.54375,0.1125,75.80625,71.225,66.925,436.225
4,82.867769,78.132231,73.082645,0.107438,76.404959,77.190083,71.338843,459.016529
5,82.066667,72.327273,71.787879,0.090909,71.987879,68.739394,68.078788,434.987879
6,75.804878,76.682927,68.268293,0.097561,74.292683,74.890244,66.439024,436.378049


In [8]:
pivot = pd.pivot_table(pokemon,index=["Type 1"],)
pivot["Legendary"].sort_values(ascending = False).head()

Type 1
Flying     0.500000
Dragon     0.375000
Psychic    0.245614
Steel      0.148148
Ground     0.125000
Name: Legendary, dtype: float64

# Modelling the data

## Overall model

### Catogirical string variables

In [9]:
print("number of different type 1s - ", pokemon["Type 1"].nunique())

number of different type 1s -  18


In [10]:
print("number of different type 2s -", pokemon["Type 2"].nunique())

number of different type 2s - 18


In [11]:
# import library to encourd type data
from sklearn.preprocessing import LabelEncoder

In [12]:
# encord type variables
le = LabelEncoder()
pokemon['Type 1 Code'] = le.fit_transform(pokemon['Type 1'])
pokemon['Type 2 Code'] = le.fit_transform(pokemon['Type 2'])
pokemon = pokemon.drop(["Type 1", "Type 2"], axis =1)

In [13]:
# drop name variable as it cannot be used in the model
pokemon.drop('Name',axis=1,inplace=True)

In [14]:
#revised dataset
pokemon.head()

Unnamed: 0_level_0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Type 1 Code,Type 2 Code
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,318,45,49,49,65,65,45,1,False,9,13
2,405,60,62,63,80,80,60,1,False,9,13
3,525,80,82,83,100,100,80,1,False,9,13
3,625,80,100,123,122,120,80,1,False,9,13
4,309,39,52,43,60,50,65,1,False,6,18


### Creating the model

In [15]:
# establish variable
X = pokemon.drop('Legendary',axis=1)
y = pokemon['Legendary']

In [16]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [17]:
# create model
FullRFC = RandomForestClassifier()
FullRFC.fit(X_train, y_train)

RandomForestClassifier()

In [18]:
# predict outcomes
FullRFC_pred = FullRFC.predict(X_test)

In [19]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,FullRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,FullRFC_pred))

Confusion Matrix 
 [[216   3]
 [  8  13]] 

Classification report
               precision    recall  f1-score   support

       False       0.96      0.99      0.98       219
        True       0.81      0.62      0.70        21

    accuracy                           0.95       240
   macro avg       0.89      0.80      0.84       240
weighted avg       0.95      0.95      0.95       240



## Model 1 - Stats only
Variables: 
* HP
* Attack
* Defence
* Sp. Atk
* Sp. Def
* Speed

In [20]:
# select cross sesction of data 
Stats = pokemon[[ "Total"]]

In [21]:
# establish variable
X = Stats
y = pokemon['Legendary']

In [22]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [23]:
# create model
StatsRFC = RandomForestClassifier()
StatsRFC.fit(X_train, y_train)

RandomForestClassifier()

In [24]:
# predict outcomes
StatsRFC_pred = StatsRFC.predict(X_test)

In [25]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,StatsRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,StatsRFC_pred))


Confusion Matrix 
 [[215   4]
 [  8  13]] 

Classification report
               precision    recall  f1-score   support

       False       0.96      0.98      0.97       219
        True       0.76      0.62      0.68        21

    accuracy                           0.95       240
   macro avg       0.86      0.80      0.83       240
weighted avg       0.95      0.95      0.95       240



# Model 2 - Combat data
Variables:
* Attack
* Defence
* HP

In [26]:
# select cross sesction of data 
combat = pokemon[["Attack","Defense","HP"]]

In [27]:
# establish variable
X = combat
y = pokemon['Legendary']

In [28]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [29]:
# create model
CombatRFC = RandomForestClassifier()
CombatRFC.fit(X_train, y_train)

RandomForestClassifier()

In [30]:
# predict outcomes
CombatRFC_pred = CombatRFC.predict(X_test)

In [31]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,CombatRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,CombatRFC_pred))

Confusion Matrix 
 [[214   5]
 [ 16   5]] 

Classification report
               precision    recall  f1-score   support

       False       0.93      0.98      0.95       219
        True       0.50      0.24      0.32        21

    accuracy                           0.91       240
   macro avg       0.72      0.61      0.64       240
weighted avg       0.89      0.91      0.90       240



# Model 3 stats + generation
Variables:
* Total
* Generation

In [32]:
# select cross sesction of data 
StatsG = pokemon[[ "Total","Generation"]]

In [33]:
# establish variable
X = StatsG
y = pokemon['Legendary']

In [34]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [35]:
# create model
StatsGRFC = RandomForestClassifier()
StatsGRFC.fit(X_train, y_train)

RandomForestClassifier()

In [36]:
# predict outcomes
StatsGRFC_pred = StatsGRFC.predict(X_test)

In [37]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,StatsGRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,StatsGRFC_pred))

Confusion Matrix 
 [[211   8]
 [  2  19]] 

Classification report
               precision    recall  f1-score   support

       False       0.99      0.96      0.98       219
        True       0.70      0.90      0.79        21

    accuracy                           0.96       240
   macro avg       0.85      0.93      0.88       240
weighted avg       0.97      0.96      0.96       240



# Model 4 - Key stats + generation
Variables: 
* HP
* Attack
* Defence
* Speed
* Generation


In [38]:
pokemon["check"] = pokemon[['Sp. Atk', 'Sp. Def']].sum(axis = 1)


if sum(pokemon[pokemon["check"]!=pokemon["Speed"]].any()) == 0: 
    print("Total equals sum of all other stats.")
else:
    print("Total includes other variables not included in the dataset.")

pokemon.drop("check", axis = 1, inplace = True)

Total includes other variables not included in the dataset.


In [39]:
# select cross sesction of data 
keydataG = pokemon.drop(['Total','Sp. Atk', 'Type 2 Code', 'Sp. Def',"Type 1 Code"], axis = 1)

In [40]:
# establish variable
X = keydataG.drop('Legendary',axis=1)
y = keydataG['Legendary']

In [41]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [42]:
# create model
KeydataGRFC = RandomForestClassifier()
KeydataGRFC.fit(X_train, y_train)

RandomForestClassifier()

In [43]:
# predict outcomes
KeydataGRFC_pred = KeydataGRFC.predict(X_test)

In [44]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,KeydataGRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,KeydataGRFC_pred))


Confusion Matrix 
 [[216   3]
 [ 12   9]] 

Classification report
               precision    recall  f1-score   support

       False       0.95      0.99      0.97       219
        True       0.75      0.43      0.55        21

    accuracy                           0.94       240
   macro avg       0.85      0.71      0.76       240
weighted avg       0.93      0.94      0.93       240



# Model 5
Variables:
* Type 1 
* Type 2 
* HP
* Attack
* Defence 
* Speed


In [45]:
# select cross sesction of data 
keydata = pokemon.drop(['Sp. Atk', 'Sp. Def',"Generation"], axis = 1)

In [46]:
# establish variable
X = keydata.drop('Legendary',axis=1)
y = keydata['Legendary']

In [47]:
#split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 101)

In [48]:
# create model
KeydataRFC = RandomForestClassifier()
KeydataRFC.fit(X_train, y_train)

RandomForestClassifier()

In [49]:
# predict outcomes
KeydataRFC_pred = KeydataRFC.predict(X_test)

In [50]:
# assess model using predicted data and actual data
print("Confusion Matrix \n", confusion_matrix(y_test,KeydataRFC_pred), "\n")
print("Classification report\n", classification_report(y_test,KeydataRFC_pred))


Confusion Matrix 
 [[214   5]
 [  7  14]] 

Classification report
               precision    recall  f1-score   support

       False       0.97      0.98      0.97       219
        True       0.74      0.67      0.70        21

    accuracy                           0.95       240
   macro avg       0.85      0.82      0.84       240
weighted avg       0.95      0.95      0.95       240



# Models and data

## Overall model

In [51]:
print(confusion_matrix(y_test,FullRFC_pred))
print(classification_report(y_test,FullRFC_pred))

[[216   3]
 [  8  13]]
              precision    recall  f1-score   support

       False       0.96      0.99      0.98       219
        True       0.81      0.62      0.70        21

    accuracy                           0.95       240
   macro avg       0.89      0.80      0.84       240
weighted avg       0.95      0.95      0.95       240



## Model 1

In [52]:
print(confusion_matrix(y_test,StatsRFC_pred))
print(classification_report(y_test,StatsRFC_pred))

[[215   4]
 [  8  13]]
              precision    recall  f1-score   support

       False       0.96      0.98      0.97       219
        True       0.76      0.62      0.68        21

    accuracy                           0.95       240
   macro avg       0.86      0.80      0.83       240
weighted avg       0.95      0.95      0.95       240



## Model 2

In [53]:
print(confusion_matrix(y_test,CombatRFC_pred))
print(classification_report(y_test,CombatRFC_pred))

[[214   5]
 [ 16   5]]
              precision    recall  f1-score   support

       False       0.93      0.98      0.95       219
        True       0.50      0.24      0.32        21

    accuracy                           0.91       240
   macro avg       0.72      0.61      0.64       240
weighted avg       0.89      0.91      0.90       240



# Model 3

In [54]:
print(confusion_matrix(y_test,StatsGRFC_pred))
print(classification_report(y_test,StatsGRFC_pred))

[[211   8]
 [  2  19]]
              precision    recall  f1-score   support

       False       0.99      0.96      0.98       219
        True       0.70      0.90      0.79        21

    accuracy                           0.96       240
   macro avg       0.85      0.93      0.88       240
weighted avg       0.97      0.96      0.96       240



# Model 4

In [55]:
print(confusion_matrix(y_test,KeydataGRFC_pred))
print(classification_report(y_test,KeydataGRFC_pred))

[[216   3]
 [ 12   9]]
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       219
        True       0.75      0.43      0.55        21

    accuracy                           0.94       240
   macro avg       0.85      0.71      0.76       240
weighted avg       0.93      0.94      0.93       240



# Model 5

In [56]:
print(confusion_matrix(y_test,KeydataRFC_pred))
print(classification_report(y_test,KeydataRFC_pred))

[[214   5]
 [  7  14]]
              precision    recall  f1-score   support

       False       0.97      0.98      0.97       219
        True       0.74      0.67      0.70        21

    accuracy                           0.95       240
   macro avg       0.85      0.82      0.84       240
weighted avg       0.95      0.95      0.95       240

