In [51]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, RocCurveDisplay

# Pre-Processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [52]:
df = pd.read_csv("pokemon_combined.csv")
print(df.shape)
df.head()

(1014, 17)


Unnamed: 0,Name,Type,Species,Height,Weight,Abilities,Catch rate,Base Friendship,Base Exp.,Growth Rate,Gender,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,Bulbasaur,Grass Poison,Seed Pokémon,0.7,6.9,1. OvergrowChlorophyll (hidden ability),45,50,64,Medium Slow,"87.5% male, 12.5% female",45,49,49,65,65,45
1,Ivysaur,Grass Poison,Seed Pokémon,1.0,13.0,1. OvergrowChlorophyll (hidden ability),45,50,142,Medium Slow,"87.5% male, 12.5% female",60,62,63,80,80,60
2,Venusaur,Grass Poison,Seed Pokémon,2.4,155.5,Thick Fat,45,50,281,Medium Slow,"87.5% male, 12.5% female",80,100,123,122,120,80
3,Charmander,Fire,Lizard Pokémon,0.6,8.5,1. BlazeSolar Power (hidden ability),45,50,62,Medium Slow,"87.5% male, 12.5% female",39,52,43,60,50,65
4,Charmeleon,Fire,Flame Pokémon,1.1,19.0,1. BlazeSolar Power (hidden ability),45,50,142,Medium Slow,"87.5% male, 12.5% female",58,64,58,80,65,80


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             1014 non-null   object 
 1   Type             1014 non-null   object 
 2   Species          1014 non-null   object 
 3   Height           1014 non-null   float64
 4   Weight           1014 non-null   float64
 5   Abilities        1014 non-null   object 
 6   Catch rate       1014 non-null   int64  
 7   Base Friendship  1014 non-null   int64  
 8   Base Exp.        1014 non-null   int64  
 9   Growth Rate      1014 non-null   object 
 10  Gender           1014 non-null   object 
 11  HP               1014 non-null   int64  
 12  Attack           1014 non-null   int64  
 13  Defense          1014 non-null   int64  
 14  Sp. Atk          1014 non-null   int64  
 15  Sp. Def          1014 non-null   int64  
 16  Speed            1014 non-null   int64  
dtypes: float64(2),

In [54]:
df.drop(["Name","Type","Species","Abilities",],axis=1,inplace=True)

In [55]:
df.columns

Index(['Height', 'Weight', 'Catch rate', 'Base Friendship', 'Base Exp.',
       'Growth Rate', 'Gender', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed'],
      dtype='object')

In [56]:
df.head()

Unnamed: 0,Height,Weight,Catch rate,Base Friendship,Base Exp.,Growth Rate,Gender,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,0.7,6.9,45,50,64,Medium Slow,"87.5% male, 12.5% female",45,49,49,65,65,45
1,1.0,13.0,45,50,142,Medium Slow,"87.5% male, 12.5% female",60,62,63,80,80,60
2,2.4,155.5,45,50,281,Medium Slow,"87.5% male, 12.5% female",80,100,123,122,120,80
3,0.6,8.5,45,50,62,Medium Slow,"87.5% male, 12.5% female",39,52,43,60,50,65
4,1.1,19.0,45,50,142,Medium Slow,"87.5% male, 12.5% female",58,64,58,80,65,80


In [57]:
df.head()

Unnamed: 0,Height,Weight,Catch rate,Base Friendship,Base Exp.,Growth Rate,Gender,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,0.7,6.9,45,50,64,Medium Slow,"87.5% male, 12.5% female",45,49,49,65,65,45
1,1.0,13.0,45,50,142,Medium Slow,"87.5% male, 12.5% female",60,62,63,80,80,60
2,2.4,155.5,45,50,281,Medium Slow,"87.5% male, 12.5% female",80,100,123,122,120,80
3,0.6,8.5,45,50,62,Medium Slow,"87.5% male, 12.5% female",39,52,43,60,50,65
4,1.1,19.0,45,50,142,Medium Slow,"87.5% male, 12.5% female",58,64,58,80,65,80


In [38]:
#df['Growth Rate_Fast'] = df['Growth Rate_Fast'].astype(int)
#df['Growth Rate_Fluctuating'] = df['Growth Rate_Fluctuating'].astype(int)
#df['Growth Rate_Medium Fast'] = df['Growth Rate_Medium Fast'].astype(int)
#df['Growth Rate_Medium Slow'] = df['Growth Rate_Medium Slow'].astype(int)
#df['Growth Rate_Slow'] = df['Growth Rate_Slow'].astype(int)
#df['Gender_100% male, 0% female'] = df['Gender_100% male, 0% female'].astype(int)
#df['Gender_12.5% male, 87.5% female'] = df['Gender_12.5% male, 87.5% female'].astype(int)
#df['Gender_25% male, 75% female'] = df['Gender_25% male, 75% female'].astype(int)
#df['Gender_50% male, 50% female'] = df['Gender_50% male, 50% female'].astype(int)
#df['Gender_75% male, 25% female'] = df['Gender_75% male, 25% female'].astype(int)
#df['Gender_87.5% male, 12.5% female'] = df['Gender_87.5% male, 12.5% female'].astype(int)

In [39]:
#binary_features = ['Growth Rate_Fast', 'Growth Rate_Fluctuating', 'Growth Rate_Medium Fast', 'Growth Rate_Medium Slow', 'Growth Rate_Slow', 'Gender_100% male, 0% female', 'Gender_12.5% male, 87.5% female', 'Gender_25% male, 75% female', 'Gender_50% male, 50% female', 'Gender_75% male, 25% female', 'Gender_87.5% male, 12.5% female']
#binary_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
#    ('label', OrdinalEncoder())])

In [40]:
numeric_features = ['Height', 'Weight', 'Catch rate', 'Base Friendship', 'Base Exp.', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [29]:
categorical_features = ['Growth Rate', 'Gender']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [42]:
preprocessed_X_train = preprocessor.fit_transform(df)

In [43]:
encoded_feature_names = (numeric_features + binary_features +
                         list(preprocessor.transformers_[2][1]['onehot'].get_feature_names_out(categorical_features)))

TypeError: string indices must be integers, not 'str'

In [44]:
X=df.drop(columns=["Catch rate"],axis=1)
Y=df["Catch rate"]

In [45]:
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42) 
print(x_train.shape, x_test.shape)

(811, 22) (203, 22)


In [46]:
x_test.head()

Unnamed: 0,Height,Weight,Base Friendship,Base Exp.,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Growth Rate_Fast,Growth Rate_Fluctuating,Growth Rate_Medium Fast,Growth Rate_Medium Slow,Growth Rate_Slow,"Gender_100% male, 0% female","Gender_12.5% male, 87.5% female","Gender_25% male, 75% female","Gender_50% male, 50% female","Gender_75% male, 25% female","Gender_87.5% male, 12.5% female",Gender_Genderless
752,0.3,3.2,50,42,42,30,38,30,38,32,0,0,0,1,0,0,0,0,0,0,0,False
519,0.4,2.1,50,65,65,45,43,55,43,72,0,0,1,0,0,0,0,0,1,0,0,False
210,1.8,125.8,50,175,90,130,75,75,75,55,0,0,1,0,0,0,0,0,1,0,0,False
611,0.9,20.0,50,70,45,85,50,55,50,65,0,0,0,1,0,0,0,0,1,0,0,False
914,0.3,10.9,50,62,37,55,70,30,55,65,0,0,0,1,0,0,0,0,1,0,0,False


In [47]:
df_final = pd.DataFrame(preprocessed_X_train, columns=encoded_feature_names)
df_final["Transported"] = df.Transported
df_final.head()

NameError: name 'encoded_feature_names' is not defined

In [48]:
corrs = df_final.corr()
corrs

NameError: name 'df_final' is not defined