# Pokemon Classification

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
from pandas import isnull
import config
import json
import requests
from collections import defaultdict, Counter
import time
import random

import os
import re
import emoji
from nltk.corpus import stopwords
from string import punctuation
from wordcloud import WordCloud 
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer



from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import joblib
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


### API Request
Source Documentation [https://pokeapi.co/docs/v2#pokemon, https://pokeapi.co/docs/v2#pokemon-colors]

The data being pulled using the pokemon api are the pokemon statistics and characteristics. 


In [None]:
api_url = "https://pokeapi.co/api/v2/pokemon/"
params = {"limit": "1281"} # The total amount of pokemon
pokemon_api = []
res = requests.request("GET", api_url, params=params)
data = res.json()
print("Pokemon and Pokemon APIs have been pulled")
for pokemon in data["results"]:
    for k, v in pokemon.items():
        pokemon_api.append(v)
print("List of Pokemon and Pokemon API have been created")

In [None]:
# Separate Names and APIs into two lists
pokemon_name = pokemon_api[::2]
pokemon_API = pokemon_api[1::2]

# Merge into a Dictionary with Pokemon Names as Key and APIs as Value
pokemon_all = {}
for key in pokemon_name:
    for value in pokemon_API:
        pokemon_all[key] = value
        pokemon_API.remove(value)
        break
print("Dictionary of All Pokemon and Their APIs is: ", pokemon_all)

In [None]:
# List of all pokemon as well as the api url call
pokemon_all

In [None]:
# Check to see if there are 1281 pokemon
print("The number of pokemon called should be 1281: ", len(pokemon_all))

In [None]:
# Dictionary set up for Pokemon Statistics
pokemon_data = defaultdict(list)

for pokemon, api in pokemon_all.items() :
    # request the page and sleep
    r = requests.request("GET", str(api))
    time.sleep(5 + 10*random.random())

    # Add to ensure that request was successful
    #print("If 200, request was successful: ", r.status_code)

    d = r.json()
    pokemon_data[pokemon].append(d)


In [None]:
# API to pull Pokemon Colors

color_api = "https://pokeapi.co/api/v2/pokemon-color/"
res1 = requests.request("GET", color_api)
colors = []
data1 = res1.json()
for color in data1["results"]:
    for k, v in color.items():
        colors.append(v)

# Separate Names and APIs into two lists
pokemon_color = colors[::2]
pokemon_color_API = colors[1::2]

In [None]:
#Merge into a Dictionary with Pokemon Colors as Key and APIs as Value
pokemon_colors = {}
for key in pokemon_color:
    for value in pokemon_color_API:
        pokemon_colors[key] = value
        pokemon_color_API.remove(value)
        break
print("Dictionary of All Pokemon Colors and Their APIs is: ", pokemon_colors)

In [None]:
# Dictionary set up for Pokemon Colors
pokemon_colors_data = defaultdict(list)

for color, api in pokemon_colors.items() :
    # request the page and sleep
    r = requests.request("GET", str(api))
    time.sleep(5 + 10*random.random())

    # Add to ensure that request was successful
    #print("If 200, request was successful: ", r.status_code)

    d = r.json()
    pokemon_colors_data[color].append(d)

In [None]:
print(len(pokemon_colors_data))

Export Pokemon Statistics API data to txt file to store data as the API call took about 7.5 hours.

Export of Pokemon Colors API data to txt file to store data. API call time approx. 2 minutes.

In [None]:
with open('Pokemon_Data.txt', 'w') as Pokemon_Data:
     Pokemon_Data.write(json.dumps(pokemon_data))

In [None]:
with open('Pokemon_Colors.txt', 'w') as Pokemon_Colors:
     Pokemon_Colors.write(json.dumps(pokemon_colors_data))

### Load Data

In [None]:
f = open('Pokemon_data.txt')
data = json.load(f)
print(len(data))

In [None]:
f1 = open('Pokemon_colors.txt')
data1 = json.load(f1)
print(len(data1))

### Data Ingestion and Pre-Processing

In [None]:
# Some punctuation variations
punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation - {'''#{}[]'''}

# Stopwords
sw = stopwords.words("english")

# Two useful regex
whitespace_pattern = re.compile(r"\s+")
hashtag_pattern = re.compile(r"^#[0-9a-zA-Z]+")


# and now our functions
def descriptive_stats(tokens, num_words = 5, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """
    
    # Fill in the correct values here. 
    num_tokens = len(tokens)
    num_unique_tokens = len(Counter(tokens).keys())
    lexical_diversity = round((num_unique_tokens/num_tokens),2) # Rounded to 2 Digits to match format of existing decimal rounding below
    num_characters = sum(len(i) for i in tokens)
    most_common = Counter(tokens).most_common(num_words)
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
    
        # print the five most common tokens
        print(f"The most common tokens are {most_common}.")
        
    return

# Removing URL's
def remove_URL(text):
    return re.sub(r"'url ' : 'http\S+", "", text)

def remove_stop(tokens) :
    tokens = [file for file in tokens if file not in sw]
    return(tokens)
 
def remove_punctuation(text, punct_set=tw_punct) : 
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    """ Splitting on whitespace rather than the book's tokenize function. That 
        function will drop tokens like '#hashtag' or '2A', which we need for Twitter. """
    
    text = [file.lower().strip() for file in text.split()]

    return(text)

def prepare(text, pipeline) : 
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

In [None]:
pokemon_list = []
for pokemon in data:
    for features in data[pokemon]:
        #print(a) # a is the whole section of combined features
        for feature, description in features.items():
            # b is the dictionaries such as ability, weight, stats that are pulled
            #print(c) # Farthest I can break down for now.
            poke_dict = {'pokemon': pokemon, 'feature': feature, 'description': description}
            pokemon_list.append(poke_dict)

In [None]:
my_pipeline = [str.lower,  remove_URL,tokenize, remove_punctuation, tokenize]
cleaned_data = []
for row in pokemon_list :
    text = " ".join(prepare(row, pipeline = my_pipeline))
    if text :
        cleaned_data.append(text)

In [None]:
df = pd.DataFrame.from_dict(pokemon_list)
df

In [None]:
# Transpose Dataframe for Pokemon Statistics

tdf = df.pivot(columns = 'feature', values = 'description', index = 'pokemon')
tdf

In [None]:
# Reset Index as ID for the dataframe

tdf.reset_index(inplace=True)
tdf.set_index('id', inplace = True)
tdf.info()

In [None]:
pokemon_colors_list = []
for colors in data1:
    for features in data1[colors]:
        #print(a) # a is the whole section of combined features
        for feature, description in features.items():
            # b is the dictionaries such as ability, weight, stats that are pulled
            #print(c) # Farthest I can break down for now.
            poke_dict1 = {'color': colors, 'features': feature, 'descriptions': description}
            pokemon_colors_list.append(poke_dict1)

In [None]:
# Transpose Dataframe

df1 = pd.DataFrame.from_dict(pokemon_colors_list)
cdf = df1.pivot(columns = 'features', values = 'descriptions', index = 'color')
cdf

In [None]:
# Drop Unnecessary Columns

cdf = cdf.drop(columns = ['id', 'names'])
cdf = cdf.reset_index(drop = True)
cdf.info()

In [None]:
cdf

### Descriptive Statistics

In [None]:
# Dictionary
descriptive_stats(cleaned_data)

In [None]:
# DataFrame
descriptive_stats(tdf)

### Data Cleaning

`Moves`, `Abilities`, and `Types` features have a list of dictionaries within each pokemon type. For getting it ready to model, cleaning is done to separate out the data to have a list of terms.


In [None]:
tdf

Clean Types into list of Types

In [None]:
pokemon_types = []
for val in tdf['types']:
    #print(val)
    res = [sub['type'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_types.append(results)

In [None]:
types_df = pd.DataFrame(columns = ["types"])
poke_types = []
for row in pokemon_types[:1281]:
    row_dict = {'types': row}
    poke_types.append(row_dict)

types_df = pd.DataFrame.from_dict(poke_types)
types_df.reset_index(drop=True)

Clean Abilities Column from List of Dictionaries into list of abilities

In [None]:
pokemon_abilities = []
for val in tdf['abilities']:
    #print(val)
    res = [sub['ability'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_abilities.append(results)

In [None]:
abilities_df = pd.DataFrame(columns = ["abilities"])
poke_abilities = []
for row in pokemon_abilities[:1281]:
    row_dict = {'abilities': row}
    poke_abilities.append(row_dict)

abilities_df = pd.DataFrame.from_dict(poke_abilities)
abilities_df.reset_index(drop=True)

Drop Old Abilities Column to prepare for Cleaned Abilities to Dataframe

In [None]:
tdf = tdf.drop(columns=['abilities'])

Clean moves into list of moves

In [None]:
pokemon_moves = []
for val in tdf['moves']:
    #print(val)
    res = [sub['move'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_moves.append(results)

In [None]:
moves_df = pd.DataFrame(columns = ["moves"])
poke_moves = []
for row in pokemon_moves[:1281]:
    row_dict = {'moves': row}
    poke_moves.append(row_dict)

moves_df = pd.DataFrame.from_dict(poke_moves)
moves_df.reset_index(drop=True)

Clean Game Indices

In [None]:
pokemon_game = []
for val in tdf['game_indices']:
    #print(val)
    res = [sub['version'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_game.append(results)

In [None]:
game_df = pd.DataFrame(columns = ["pokemon_games"])
poke_game = []
for row in pokemon_game[:1281]:
    row_dict = {'pokemon_games': row}
    poke_game.append(row_dict)

game_df = pd.DataFrame.from_dict(poke_game)
game_df.reset_index(drop=True)

In [None]:
tdf = tdf.reset_index(drop=True)

In [None]:
clean_df = pd.merge(tdf, abilities_df, left_index = True, right_index = True)
clean_df = clean_df.drop(columns=['moves', 'types', 'game_indices'])
clean_df = pd.merge(clean_df, moves_df, left_index = True, right_index = True)
clean_df = pd.merge(clean_df, types_df, left_index = True, right_index = True)
clean_df = pd.merge(clean_df, game_df, left_index = True, right_index = True)

In [None]:
clean_df = clean_df.drop(columns=['forms', 'held_items', 'is_default', 'location_area_encounters', 'past_types', 'species', 'sprites', 'stats'])
clean_df

In [None]:
clean_df = clean_df.drop(columns = ['pokemon_games']) # Dropped since too many empty lists.

#### Cleaning of Pokemon Colors and Species

In [None]:
pokemon = []
for val in cdf['pokemon_species']:
    # print(val.type) # Val is list
    res = [sub['name'] for sub in val]
    #print(res)
    pokemon.append(res)

In [None]:
black = pokemon[0]
blue = pokemon[1]
brown = pokemon[2]
gray = pokemon[3] 
green = pokemon[4]
pink = pokemon[5]
purple = pokemon[6]
red = pokemon[7]
white = pokemon[8]
yellow = pokemon[9]

In [None]:
pokemon_colors = pd.DataFrame((list(zip(black, blue, brown, gray, green, pink, purple, red, white, yellow))), columns = ['black', 'blue', 'brown', 'gray', 'green', 'pink', 'purple', 'red', 'white', 'yellow'])
pokemon_colors

In [None]:
# Transform Pokemon Colors Dataframe to have two columns, pokemon and color

pokemon_colors = pokemon_colors.reset_index(inplace = False)

pc_df = pokemon_colors.melt(id_vars='index', var_name='color', value_name='pokemon')
print(pc_df.sample(5)) # Check to ensure that the melt was done correctly and colors still match with the pokemon

Merge Pokemon Colors to Pokemon Statistics Dataframe

In [None]:
final_df = pd.merge(clean_df, pc_df, on = 'pokemon')

In [None]:
clean_df

In [None]:
clean_df.to_csv('Cleaned_Data_No_Colors.csv')

In [17]:
clean_df = pd.read_csv('Cleaned_Data_No_Colors.csv', index_col = [0])

In [18]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1281 entries, 0 to 1280
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pokemon          1281 non-null   object 
 1   base_experience  1126 non-null   float64
 2   height           1281 non-null   int64  
 3   name             1281 non-null   object 
 4   order            1281 non-null   int64  
 5   weight           1281 non-null   int64  
 6   abilities        1281 non-null   object 
 7   moves            1281 non-null   object 
 8   types            1281 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 100.1+ KB


In [19]:
# Drop Missing Data and Remove Brackets

clean_df = clean_df.dropna()

clean_df['abilities'] = clean_df['abilities'].apply(lambda x: x[1:-1])
clean_df['moves'] = clean_df['moves'].apply(lambda x: x[1:-1])
clean_df['types'] = clean_df['types'].apply(lambda x: x[1:-1])


In [20]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1126 entries, 0 to 1280
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pokemon          1126 non-null   object 
 1   base_experience  1126 non-null   float64
 2   height           1126 non-null   int64  
 3   name             1126 non-null   object 
 4   order            1126 non-null   int64  
 5   weight           1126 non-null   int64  
 6   abilities        1126 non-null   object 
 7   moves            1126 non-null   object 
 8   types            1126 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 88.0+ KB


In [21]:
clean_df.to_csv('Final_Data.csv')

In [22]:
clean_df

Unnamed: 0,pokemon,base_experience,height,name,order,weight,abilities,moves,types
0,abomasnow,173.0,22,abomasnow,585,1355,"'snow-warning', 'soundproof'","'mega-punch', 'ice-punch', 'swords-dance', 'me...","'grass', 'ice'"
1,abomasnow-mega,208.0,27,abomasnow-mega,586,1850,'snow-warning',"'ice-punch', 'swords-dance', 'leer', 'mist', '...","'grass', 'ice'"
2,abra,62.0,9,abra,103,195,"'synchronize', 'inner-focus', 'magic-guard'","'mega-punch', 'fire-punch', 'ice-punch', 'thun...",'psychic'
3,absol,163.0,12,absol,478,470,"'pressure', 'super-luck', 'justified'","'scratch', 'razor-wind', 'swords-dance', 'cut'...",'dark'
4,absol-mega,198.0,12,absol-mega,479,490,'magic-bounce',"'scratch', 'razor-wind', 'swords-dance', 'cut'...",'dark'
...,...,...,...,...,...,...,...,...,...
1276,zygarde-10,243.0,12,zygarde-10,859,335,'aura-break',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"
1277,zygarde-10-power-construct,243.0,12,zygarde-10-power-construct,860,335,'power-construct',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"
1278,zygarde-50,300.0,50,zygarde-50,858,3050,'aura-break',"'bind', 'body-slam', 'bite', 'hyper-beam', 'st...","'dragon', 'ground'"
1279,zygarde-50-power-construct,300.0,50,zygarde-50-power-construct,861,3050,'power-construct',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"


In [23]:
y_df = clean_df['types']
print(y_df.unique())

["'grass', 'ice'" "'psychic'" "'dark'" "'bug'" "'steel', 'ghost'"
 "'rock', 'flying'" "'steel', 'rock'" "'steel'" "'normal'" "'fairy'"
 "'water'" "'dragon', 'flying'" "'dragon', 'fairy'" "'rock', 'ice'"
 "'grass', 'poison'" "'electric'" "'electric', 'dragon'" "'rock', 'bug'"
 "'grass', 'dragon'" "'water', 'bug'" "'poison'" "'fire'" "'water', 'ice'"
 "'electric', 'ice'" "'bug', 'poison'" "'ice', 'flying'"
 "'psychic', 'flying'" "'normal', 'fairy'" "'ice'" "'dragon'"
 "'water', 'fairy'" "'ground', 'psychic'" "'ghost'" "'rock', 'water'"
 "'water', 'ground'" "'rock', 'steel'" "'grass'" "'bug', 'flying'"
 "'steel', 'psychic'" "'normal', 'fighting'" "'normal', 'water'"
 "'dark', 'steel'" "'fire', 'ghost'" "'fire', 'fighting'" "'rock'"
 "'normal', 'flying'" "'grass', 'fighting'" "'water', 'psychic'"
 "'bug', 'fighting'" "'grass', 'dark'" "'psychic', 'grass'"
 "'psychic', 'ice'" "'psychic', 'ghost'" "'fire', 'ground'"
 "'rock', 'fairy'" "'rock', 'fire'" "'water', 'rock'" "'water', 'dark'"
 "'s

In [None]:
#types = ['grass', 'ice', 'psychic', 'dark', 'bug', 'steel', 'ghost', 'rock', 'flying', 'fairy', 'dragon', 'fire', 'water', 'poison', 'normal', 'fighting']

In [None]:
y_df = y_df.to_frame()

In [None]:
y_df['tokens'] = y_df['types'].apply(prepare, pipeline = my_pipeline)
y_df['num_tokens'] = y_df['tokens'].map(len) 

In [None]:
types_corpus = [x for y in y_df['tokens'] for x in y]

In [None]:
# Most Common Pokemon Types 

tc = Counter(types_corpus)

for k, v in tc.most_common(5):
    print('%s: %i' % (k, v))

### Build Corpora and Word Clouds

In [None]:
# Combine all text into one large corpus
clean_df['corpus'] = clean_df[['pokemon','abilities', 'moves','types']].agg(' '.join, axis = 1)

In [None]:
my_pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]

clean_df['tokens'] = clean_df['corpus'].apply(prepare,pipeline=my_pipeline)
clean_df['num_tokens'] = clean_df['tokens'].map(len) 

In [None]:
cleantext_df = clean_df
cleantext_df = cleantext_df.drop(columns = ['base_experience', 'height', 'order', 'weight'])

In [None]:
pokemon_corpus = [x for y in clean_df['tokens'] for x in y]
pokemon_corpus

In [None]:
# Most common words

pc = Counter(pokemon_corpus)

for k, v in pc.most_common(10):
    print('%s: %i' % (k, v))

In [None]:
from matplotlib import pyplot as plt

def wordcloud(word_freq, title=None, max_words=200, stopwords=None):

    wc = WordCloud(width=800, height=400, 
                   background_color= "black", colormap="Paired", 
                   max_font_size=150, max_words=max_words)
    
    # convert data frame into dict
    if type(word_freq) == pd.Series:
        counter = Counter(word_freq.fillna(0).to_dict())
    else:
        counter = word_freq

    # filter stop words in frequency counter
    if sw is not None:
        counter = {token:freq for (token, freq) in counter.items() 
                              if token not in sw}
    wc.generate_from_frequencies(counter)
 
    plt.title(title) 

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    
    
def count_words(df, column='tokens', preprocess=None, min_freq=2):

    # process tokens and update counter
    def update(doc):
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # create counter and run through all data
    counter = Counter()
    df[column].map(update)

    # transform counter into data frame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df.query('freq >= @min_freq')
    freq_df.index.name = 'token'
    
    return freq_df.sort_values('freq', ascending=False)

In [None]:
word_count = count_words(cleantext_df)

In [None]:
# Build Word Clouds for Corpus

print('''Pokemon Word Cloud''')
wordcloud(word_count.freq)

### Modeling

In [24]:
Final_Data = pd.read_csv('Final_Data.csv', index_col = [0])
Final_Data

Unnamed: 0,pokemon,base_experience,height,name,order,weight,abilities,moves,types
0,abomasnow,173.0,22,abomasnow,585,1355,"'snow-warning', 'soundproof'","'mega-punch', 'ice-punch', 'swords-dance', 'me...","'grass', 'ice'"
1,abomasnow-mega,208.0,27,abomasnow-mega,586,1850,'snow-warning',"'ice-punch', 'swords-dance', 'leer', 'mist', '...","'grass', 'ice'"
2,abra,62.0,9,abra,103,195,"'synchronize', 'inner-focus', 'magic-guard'","'mega-punch', 'fire-punch', 'ice-punch', 'thun...",'psychic'
3,absol,163.0,12,absol,478,470,"'pressure', 'super-luck', 'justified'","'scratch', 'razor-wind', 'swords-dance', 'cut'...",'dark'
4,absol-mega,198.0,12,absol-mega,479,490,'magic-bounce',"'scratch', 'razor-wind', 'swords-dance', 'cut'...",'dark'
...,...,...,...,...,...,...,...,...,...
1276,zygarde-10,243.0,12,zygarde-10,859,335,'aura-break',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"
1277,zygarde-10-power-construct,243.0,12,zygarde-10-power-construct,860,335,'power-construct',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"
1278,zygarde-50,300.0,50,zygarde-50,858,3050,'aura-break',"'bind', 'body-slam', 'bite', 'hyper-beam', 'st...","'dragon', 'ground'"
1279,zygarde-50-power-construct,300.0,50,zygarde-50-power-construct,861,3050,'power-construct',"'bind', 'body-slam', 'bite', 'hyper-beam', 'ea...","'dragon', 'ground'"


Random Forest Multi-Label Classifier

In [27]:
pokemon_data_encoded = Final_Data.copy()

# MultiLabelBinarizer encode 'abilities'
mlb_abilities = MultiLabelBinarizer()
abilities_encoded = mlb_abilities.fit_transform(pokemon_data_encoded['abilities'])
abilities_encoded_df = pd.DataFrame(abilities_encoded, columns=mlb_abilities.classes_)
pokemon_data_encoded = pd.concat([pokemon_data_encoded.drop('abilities', axis=1), abilities_encoded_df], axis=1)

# Separate features from the target
X = pokemon_data_encoded.drop('types', axis=1)  
y = Final_Data['types']  

# MultiLabelBinarizer encode 'types'
mlb_types = MultiLabelBinarizer()
y_encoded = mlb_types.fit_transform(y)

# Split 
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# One hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align 
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Handle NaN 
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# RFC
clf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid search
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                           cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

# Get best estimator
best_clf = grid_search.best_estimator_

# Predictions
y_pred = best_clf.predict(X_test)

# Print
print(classification_report(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [1263, 1126]

Moves added to encoding to enhance features

In [None]:
pokemon_data_encoded = Final_Data.copy()

# Handle missing 'moves' data
pokemon_data_encoded['moves'] = pokemon_data_encoded['moves'].apply(lambda x: x if isinstance(x, list) else [])

# MultiLabelBinarizer encode 'abilities'
mlb_abilities = MultiLabelBinarizer()
abilities_encoded = mlb_abilities.fit_transform(pokemon_data_encoded['abilities'])
abilities_encoded_df = pd.DataFrame(abilities_encoded, columns=mlb_abilities.classes_)
pokemon_data_encoded = pd.concat([pokemon_data_encoded.drop('abilities', axis=1), abilities_encoded_df], axis=1)

# MultiLabelBinarizer encode 'moves'
mlb_moves = MultiLabelBinarizer()
moves_encoded = mlb_moves.fit_transform(pokemon_data_encoded['moves'])
moves_encoded_df = pd.DataFrame(moves_encoded, columns=mlb_moves.classes_)
pokemon_data_encoded = pd.concat([pokemon_data_encoded.drop('moves', axis=1), moves_encoded_df], axis=1)

# Separate features from the target
X = pokemon_data_encoded.drop('types', axis=1)  
y = pokemon_data['types']  

# MultiLabelBinarizer encode 'types'
mlb_types = MultiLabelBinarizer()
y_encoded = mlb_types.fit_transform(y)

# Split 
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# One hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align 
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Handle NaN 
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# RFC
clf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid search
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                           cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

# Get best estimator
best_clf = grid_search.best_estimator_

# Predictions
y_pred = best_clf.predict(X_test)

# Print
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

___________________________

### Creation of Flask App

In [None]:
import joblib
joblib.dump(clf, 'Pokemon_model.pkl')

In [None]:
Pokemon_model = open('Pokemon_model.pkl','rb')
clf_model = joblib.load(Pokemon_model)

Testing of other model methods

### Return Pokemon Data Feature


In [29]:
pokemon_request = input("Enter Pokemon Name: ")

In [30]:
pokemon_request

'bulbasaur'

In [31]:
pokemon_output = Final_Data.loc[Final_Data['pokemon'] == pokemon_request]


       pokemon  base_experience  height       name  order  weight  \
117  bulbasaur             64.0       7  bulbasaur      1      69   

                     abilities  \
117  'overgrow', 'chlorophyll'   

                                                 moves              types  
117  'razor-wind', 'swords-dance', 'cut', 'bind', '...  'grass', 'poison'  


In [33]:
pokemon_output

Unnamed: 0,pokemon,base_experience,height,name,order,weight,abilities,moves,types
117,bulbasaur,64.0,7,bulbasaur,1,69,"'overgrow', 'chlorophyll'","'razor-wind', 'swords-dance', 'cut', 'bind', '...","'grass', 'poison'"
