# Pokemon Classification

### Import Libraries

In [11]:
import numpy as np
import pandas as pd
from pandas import isnull
import json
import requests
from collections import defaultdict, Counter
import time
import random

import os
import re
import emoji
from nltk.corpus import stopwords
from string import punctuation
from wordcloud import WordCloud 
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


### API Request
Source Documentation [https://pokeapi.co/docs/v2#pokemon, https://pokeapi.co/docs/v2#pokemon-colors]

The data being pulled using the pokemon api are the pokemon statistics and characteristics. 


In [12]:
api_url = "https://pokeapi.co/api/v2/pokemon/"
params = {"limit": "1281"} # The total amount of pokemon
pokemon_api = []
res = requests.request("GET", api_url, params=params)
data = res.json()
print("Pokemon and Pokemon APIs have been pulled")
for pokemon in data["results"]:
    for k, v in pokemon.items():
        pokemon_api.append(v)
print("List of Pokemon and Pokemon API have been created")

Pokemon and Pokemon APIs have been pulled
List of Pokemon and Pokemon API have been created


In [13]:
# Separate Names and APIs into two lists
pokemon_name = pokemon_api[::2]
pokemon_API = pokemon_api[1::2]

# Merge into a Dictionary with Pokemon Names as Key and APIs as Value
pokemon_all = {}
for key in pokemon_name:
    for value in pokemon_API:
        pokemon_all[key] = value
        pokemon_API.remove(value)
        break
print("Dictionary of All Pokemon and Their APIs is: ", pokemon_all)

Dictionary of All Pokemon and Their APIs is:  {'bulbasaur': 'https://pokeapi.co/api/v2/pokemon/1/', 'ivysaur': 'https://pokeapi.co/api/v2/pokemon/2/', 'venusaur': 'https://pokeapi.co/api/v2/pokemon/3/', 'charmander': 'https://pokeapi.co/api/v2/pokemon/4/', 'charmeleon': 'https://pokeapi.co/api/v2/pokemon/5/', 'charizard': 'https://pokeapi.co/api/v2/pokemon/6/', 'squirtle': 'https://pokeapi.co/api/v2/pokemon/7/', 'wartortle': 'https://pokeapi.co/api/v2/pokemon/8/', 'blastoise': 'https://pokeapi.co/api/v2/pokemon/9/', 'caterpie': 'https://pokeapi.co/api/v2/pokemon/10/', 'metapod': 'https://pokeapi.co/api/v2/pokemon/11/', 'butterfree': 'https://pokeapi.co/api/v2/pokemon/12/', 'weedle': 'https://pokeapi.co/api/v2/pokemon/13/', 'kakuna': 'https://pokeapi.co/api/v2/pokemon/14/', 'beedrill': 'https://pokeapi.co/api/v2/pokemon/15/', 'pidgey': 'https://pokeapi.co/api/v2/pokemon/16/', 'pidgeotto': 'https://pokeapi.co/api/v2/pokemon/17/', 'pidgeot': 'https://pokeapi.co/api/v2/pokemon/18/', 'ratta

In [14]:
# List of all pokemon as well as the api url call
pokemon_all

{'bulbasaur': 'https://pokeapi.co/api/v2/pokemon/1/',
 'ivysaur': 'https://pokeapi.co/api/v2/pokemon/2/',
 'venusaur': 'https://pokeapi.co/api/v2/pokemon/3/',
 'charmander': 'https://pokeapi.co/api/v2/pokemon/4/',
 'charmeleon': 'https://pokeapi.co/api/v2/pokemon/5/',
 'charizard': 'https://pokeapi.co/api/v2/pokemon/6/',
 'squirtle': 'https://pokeapi.co/api/v2/pokemon/7/',
 'wartortle': 'https://pokeapi.co/api/v2/pokemon/8/',
 'blastoise': 'https://pokeapi.co/api/v2/pokemon/9/',
 'caterpie': 'https://pokeapi.co/api/v2/pokemon/10/',
 'metapod': 'https://pokeapi.co/api/v2/pokemon/11/',
 'butterfree': 'https://pokeapi.co/api/v2/pokemon/12/',
 'weedle': 'https://pokeapi.co/api/v2/pokemon/13/',
 'kakuna': 'https://pokeapi.co/api/v2/pokemon/14/',
 'beedrill': 'https://pokeapi.co/api/v2/pokemon/15/',
 'pidgey': 'https://pokeapi.co/api/v2/pokemon/16/',
 'pidgeotto': 'https://pokeapi.co/api/v2/pokemon/17/',
 'pidgeot': 'https://pokeapi.co/api/v2/pokemon/18/',
 'rattata': 'https://pokeapi.co/api

In [15]:
# Check to see if there are 1281 pokemon
print("The number of pokemon called should be 1281: ", len(pokemon_all))

The number of pokemon called should be 1281:  1281


In [None]:
# Dictionary set up for Pokemon Statistics
pokemon_data = defaultdict(list)

for pokemon, api in pokemon_all.items() :
    # request the page and sleep
    r = requests.request("GET", str(api))
    time.sleep(5 + 10*random.random())

    # Add to ensure that request was successful
    #print("If 200, request was successful: ", r.status_code)

    d = r.json()
    pokemon_data[pokemon].append(d)


In [19]:
with open('C:\\Users\\micha\\applied text mining\\final project\\Pokemon_Data.txt', 'r') as Pokemon_Data:
     pokemon_data = json.load(Pokemon_Data)

In [20]:
# API to pull Pokemon Colors

color_api = "https://pokeapi.co/api/v2/pokemon-color/"
res1 = requests.request("GET", color_api)
colors = []
data1 = res1.json()
for color in data1["results"]:
    for k, v in color.items():
        colors.append(v)

# Separate Names and APIs into two lists
pokemon_color = colors[::2]
pokemon_color_API = colors[1::2]

In [21]:
#Merge into a Dictionary with Pokemon Colors as Key and APIs as Value
pokemon_colors = {}
for key in pokemon_color:
    for value in pokemon_color_API:
        pokemon_colors[key] = value
        pokemon_color_API.remove(value)
        break
print("Dictionary of All Pokemon Colors and Their APIs is: ", pokemon_colors)

Dictionary of All Pokemon Colors and Their APIs is:  {'black': 'https://pokeapi.co/api/v2/pokemon-color/1/', 'blue': 'https://pokeapi.co/api/v2/pokemon-color/2/', 'brown': 'https://pokeapi.co/api/v2/pokemon-color/3/', 'gray': 'https://pokeapi.co/api/v2/pokemon-color/4/', 'green': 'https://pokeapi.co/api/v2/pokemon-color/5/', 'pink': 'https://pokeapi.co/api/v2/pokemon-color/6/', 'purple': 'https://pokeapi.co/api/v2/pokemon-color/7/', 'red': 'https://pokeapi.co/api/v2/pokemon-color/8/', 'white': 'https://pokeapi.co/api/v2/pokemon-color/9/', 'yellow': 'https://pokeapi.co/api/v2/pokemon-color/10/'}


In [22]:
# Dictionary set up for Pokemon Colors
pokemon_colors_data = defaultdict(list)

for color, api in pokemon_colors.items() :
    # request the page and sleep
    r = requests.request("GET", str(api))
    time.sleep(5 + 10*random.random())

    # Add to ensure that request was successful
    #print("If 200, request was successful: ", r.status_code)

    d = r.json()
    pokemon_colors_data[color].append(d)

In [23]:
print(len(pokemon_colors_data))

10


In [31]:
with open('Pokemon_Data.txt', 'w') as Pokemon_Data:
     Pokemon_Data.write(json.dumps(pokemon_data))

NameError: name 'pokemon_data' is not defined

Export Pokemon Statistics API data to txt file to store data as the API call took about 7.5 hours.

Export of Pokemon Colors API data to txt file to store data. API call time approx. 2 minutes.

In [32]:
with open('Pokemon_Colors.txt', 'w') as Pokemon_Colors:
     Pokemon_Colors.write(json.dumps(pokemon_colors_data))

### Load Data

In [24]:
f = open('Pokemon_data.txt')
data = json.load(f)
print(len(data))

335


In [25]:
f1 = open('Pokemon_colors.txt')
data1 = json.load(f1)
print(len(data1))

10


### Data Ingestion and Pre-Processing

In [26]:
# Some punctuation variations
punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation - {'''#{}[]'''}

# Stopwords
sw = stopwords.words("english")

# Two useful regex
whitespace_pattern = re.compile(r"\s+")
hashtag_pattern = re.compile(r"^#[0-9a-zA-Z]+")


# and now our functions
def descriptive_stats(tokens, num_words = 5, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """
    
    # Fill in the correct values here. 
    num_tokens = len(tokens)
    num_unique_tokens = len(Counter(tokens).keys())
    lexical_diversity = round((num_unique_tokens/num_tokens),2) # Rounded to 2 Digits to match format of existing decimal rounding below
    num_characters = sum(len(i) for i in tokens)
    most_common = Counter(tokens).most_common(num_words)
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
    
        # print the five most common tokens
        print(f"The most common tokens are {most_common}.")
        
    return

# Removing URL's
def remove_URL(text):
    return re.sub(r"'url ' : 'http\S+", "", text)

def remove_stop(tokens) :
    tokens = [file for file in tokens if file not in sw]
    return(tokens)
 
def remove_punctuation(text, punct_set=tw_punct) : 
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    """ Splitting on whitespace rather than the book's tokenize function. That 
        function will drop tokens like '#hashtag' or '2A', which we need for Twitter. """
    
    text = [file.lower().strip() for file in text.split()]

    return(text)

def prepare(text, pipeline) : 
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

In [27]:
pokemon_list = []
for pokemon in data:
    for features in data[pokemon]:
        #print(a) # a is the whole section of combined features
        for feature, description in features.items():
            # b is the dictionaries such as ability, weight, stats that are pulled
            #print(c) # Farthest I can break down for now.
            poke_dict = {'pokemon': pokemon, 'feature': feature, 'description': description}
            pokemon_list.append(poke_dict)

In [28]:
my_pipeline = [str.lower,  remove_URL,tokenize, remove_punctuation, tokenize]
cleaned_data = []
for row in pokemon_list :
    text = " ".join(prepare(row, pipeline = my_pipeline))
    if text :
        cleaned_data.append(text)

In [29]:
df = pd.DataFrame.from_dict(pokemon_list)
df

Unnamed: 0,pokemon,feature,description
0,bulbasaur,abilities,"[{'ability': {'name': 'overgrow', 'url': 'http..."
1,bulbasaur,base_experience,64
2,bulbasaur,forms,"[{'name': 'bulbasaur', 'url': 'https://pokeapi..."
3,bulbasaur,game_indices,"[{'game_index': 153, 'version': {'name': 'red'..."
4,bulbasaur,height,7
...,...,...,...
6109,zangoose,species,"{'name': 'zangoose', 'url': 'https://pokeapi.c..."
6110,zangoose,sprites,{'back_default': 'https://raw.githubuserconten...
6111,zangoose,stats,"[{'base_stat': 73, 'effort': 0, 'stat': {'name..."
6112,zangoose,types,"[{'slot': 1, 'type': {'name': 'normal', 'url':..."


In [30]:
# Transpose Dataframe for Pokemon Statistics

tdf = df.pivot(columns = 'feature', values = 'description', index = 'pokemon')
tdf

feature,abilities,base_experience,forms,game_indices,height,held_items,id,is_default,location_area_encounters,moves,name,order,past_abilities,past_types,species,sprites,stats,types,weight
pokemon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
abra,"[{'ability': {'name': 'synchronize', 'url': 'h...",62,"[{'name': 'abra', 'url': 'https://pokeapi.co/a...","[{'game_index': 148, 'version': {'name': 'red'...",9,"[{'item': {'name': 'twisted-spoon', 'url': 'ht...",63,True,https://pokeapi.co/api/v2/pokemon/63/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",abra,103,,[],"{'name': 'abra', 'url': 'https://pokeapi.co/ap...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 25, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'psychic', 'url'...",195
aerodactyl,"[{'ability': {'name': 'rock-head', 'url': 'htt...",180,"[{'name': 'aerodactyl', 'url': 'https://pokeap...","[{'game_index': 171, 'version': {'name': 'red'...",18,[],142,True,https://pokeapi.co/api/v2/pokemon/142/encounters,"[{'move': {'name': 'razor-wind', 'url': 'https...",aerodactyl,232,[],[],"{'name': 'aerodactyl', 'url': 'https://pokeapi...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 80, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'rock', 'url': '...",590
aggron,"[{'ability': {'name': 'sturdy', 'url': 'https:...",265,"[{'name': 'aggron', 'url': 'https://pokeapi.co...","[{'game_index': 384, 'version': {'name': 'ruby...",21,"[{'item': {'name': 'hard-stone', 'url': 'https...",306,True,https://pokeapi.co/api/v2/pokemon/306/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",aggron,411,[],[],"{'name': 'aggron', 'url': 'https://pokeapi.co/...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 70, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'steel', 'url': ...",3600
aipom,"[{'ability': {'name': 'run-away', 'url': 'http...",72,"[{'name': 'aipom', 'url': 'https://pokeapi.co/...","[{'game_index': 190, 'version': {'name': 'gold...",8,[],190,True,https://pokeapi.co/api/v2/pokemon/190/encounters,"[{'move': {'name': 'double-slap', 'url': 'http...",aipom,285,,[],"{'name': 'aipom', 'url': 'https://pokeapi.co/a...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 55, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",115
alakazam,"[{'ability': {'name': 'synchronize', 'url': 'h...",250,"[{'name': 'alakazam', 'url': 'https://pokeapi....","[{'game_index': 149, 'version': {'name': 'red'...",15,"[{'item': {'name': 'twisted-spoon', 'url': 'ht...",65,True,https://pokeapi.co/api/v2/pokemon/65/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",alakazam,105,,[],"{'name': 'alakazam', 'url': 'https://pokeapi.c...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 55, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'psychic', 'url'...",480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yanma,"[{'ability': {'name': 'speed-boost', 'url': 'h...",78,"[{'name': 'yanma', 'url': 'https://pokeapi.co/...","[{'game_index': 193, 'version': {'name': 'gold...",12,"[{'item': {'name': 'wide-lens', 'url': 'https:...",193,True,https://pokeapi.co/api/v2/pokemon/193/encounters,"[{'move': {'name': 'wing-attack', 'url': 'http...",yanma,289,,[],"{'name': 'yanma', 'url': 'https://pokeapi.co/a...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 65, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'bug', 'url': 'h...",380
zangoose,"[{'ability': {'name': 'immunity', 'url': 'http...",160,"[{'name': 'zangoose', 'url': 'https://pokeapi....","[{'game_index': 380, 'version': {'name': 'ruby...",13,"[{'item': {'name': 'quick-claw', 'url': 'https...",335,True,https://pokeapi.co/api/v2/pokemon/335/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",zangoose,448,,[],"{'name': 'zangoose', 'url': 'https://pokeapi.c...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 73, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",403
zapdos,"[{'ability': {'name': 'pressure', 'url': 'http...",290,"[{'name': 'zapdos', 'url': 'https://pokeapi.co...","[{'game_index': 75, 'version': {'name': 'red',...",16,[],145,True,https://pokeapi.co/api/v2/pokemon/145/encounters,"[{'move': {'name': 'razor-wind', 'url': 'https...",zapdos,238,,[],"{'name': 'zapdos', 'url': 'https://pokeapi.co/...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 90, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'electric', 'url...",526
zigzagoon,"[{'ability': {'name': 'pickup', 'url': 'https:...",56,"[{'name': 'zigzagoon', 'url': 'https://pokeapi...","[{'game_index': 288, 'version': {'name': 'ruby...",4,"[{'item': {'name': 'potion', 'url': 'https://p...",263,True,https://pokeapi.co/api/v2/pokemon/263/encounters,"[{'move': {'name': 'cut', 'url': 'https://poke...",zigzagoon,361,,[],"{'name': 'zigzagoon', 'url': 'https://pokeapi....",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 38, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",175


In [31]:
# Reset Index as ID for the dataframe

tdf.reset_index(inplace=True)
tdf.set_index('id', inplace = True)
tdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335 entries, 63 to 41
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   pokemon                   335 non-null    object
 1   abilities                 335 non-null    object
 2   base_experience           335 non-null    object
 3   forms                     335 non-null    object
 4   game_indices              335 non-null    object
 5   height                    335 non-null    object
 6   held_items                335 non-null    object
 7   is_default                335 non-null    object
 8   location_area_encounters  335 non-null    object
 9   moves                     335 non-null    object
 10  name                      335 non-null    object
 11  order                     335 non-null    object
 12  past_abilities            84 non-null     object
 13  past_types                335 non-null    object
 14  species                   

In [32]:
pokemon_colors_list = []
for colors in data1:
    for features in data1[colors]:
        #print(a) # a is the whole section of combined features
        for feature, description in features.items():
            # b is the dictionaries such as ability, weight, stats that are pulled
            #print(c) # Farthest I can break down for now.
            poke_dict1 = {'color': colors, 'features': feature, 'descriptions': description}
            pokemon_colors_list.append(poke_dict1)

In [33]:
# Transpose Dataframe

df1 = pd.DataFrame.from_dict(pokemon_colors_list)
cdf = df1.pivot(columns = 'features', values = 'descriptions', index = 'color')
cdf

features,id,name,names,pokemon_species
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,1,black,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'murkrow', 'url': 'https://pokeapi.c..."
blue,2,blue,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'squirtle', 'url': 'https://pokeapi...."
brown,3,brown,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'weedle', 'url': 'https://pokeapi.co..."
gray,4,gray,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'machop', 'url': 'https://pokeapi.co..."
green,5,green,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'bulbasaur', 'url': 'https://pokeapi..."
pink,6,pink,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'slowpoke', 'url': 'https://pokeapi...."
purple,7,purple,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'rattata', 'url': 'https://pokeapi.c..."
red,8,red,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'charmander', 'url': 'https://pokeap..."
white,9,white,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'seel', 'url': 'https://pokeapi.co/a..."
yellow,10,yellow,"[{'language': {'name': 'ja-Hrkt', 'url': 'http...","[{'name': 'sandshrew', 'url': 'https://pokeapi..."


In [34]:
# Drop Unnecessary Columns

cdf = cdf.drop(columns = ['id', 'names'])
cdf = cdf.reset_index(drop = True)
cdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             10 non-null     object
 1   pokemon_species  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [35]:
cdf

features,name,pokemon_species
0,black,"[{'name': 'murkrow', 'url': 'https://pokeapi.c..."
1,blue,"[{'name': 'squirtle', 'url': 'https://pokeapi...."
2,brown,"[{'name': 'weedle', 'url': 'https://pokeapi.co..."
3,gray,"[{'name': 'machop', 'url': 'https://pokeapi.co..."
4,green,"[{'name': 'bulbasaur', 'url': 'https://pokeapi..."
5,pink,"[{'name': 'slowpoke', 'url': 'https://pokeapi...."
6,purple,"[{'name': 'rattata', 'url': 'https://pokeapi.c..."
7,red,"[{'name': 'charmander', 'url': 'https://pokeap..."
8,white,"[{'name': 'seel', 'url': 'https://pokeapi.co/a..."
9,yellow,"[{'name': 'sandshrew', 'url': 'https://pokeapi..."


### Descriptive Statistics

In [36]:
# Dictionary
descriptive_stats(cleaned_data)

There are 6114 tokens in the data.
There are 6114 unique tokens in the data.
There are 69910052 characters in the data.
The lexical diversity is 1.000 in the data.
The most common tokens are [("{'pokemon':'bulbasaur','feature':'abilities','description':[{'ability':{'name':'overgrow','url':'https://pokeapi.co/api/v2/ability/65/'},'is_hidden':false,'slot':1},{'ability':{'name':'chlorophyll','url':'https://pokeapi.co/api/v2/ability/34/'},'is_hidden':true,'slot':3}]}", 1), ("{'pokemon':'bulbasaur','feature':'base_experience','description':64}", 1), ("{'pokemon':'bulbasaur','feature':'forms','description':[{'name':'bulbasaur','url':'https://pokeapi.co/api/v2/pokemon-form/1/'}]}", 1), ("{'pokemon':'bulbasaur','feature':'game_indices','description':[{'game_index':153,'version':{'name':'red','url':'https://pokeapi.co/api/v2/version/1/'}},{'game_index':153,'version':{'name':'blue','url':'https://pokeapi.co/api/v2/version/2/'}},{'game_index':153,'version':{'name':'yellow','url':'https://pokeapi.

In [37]:
# DataFrame
descriptive_stats(tdf)

There are 335 tokens in the data.
There are 19 unique tokens in the data.
There are 166 characters in the data.
The lexical diversity is 0.060 in the data.
The most common tokens are [('pokemon', 1), ('abilities', 1), ('base_experience', 1), ('forms', 1), ('game_indices', 1)].


### Data Cleaning

`Moves`, `Abilities`, and `Types` features have a list of dictionaries within each pokemon type. For getting it ready to model, cleaning is done to separate out the data to have a list of terms.


In [38]:
tdf

feature,pokemon,abilities,base_experience,forms,game_indices,height,held_items,is_default,location_area_encounters,moves,name,order,past_abilities,past_types,species,sprites,stats,types,weight
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
63,abra,"[{'ability': {'name': 'synchronize', 'url': 'h...",62,"[{'name': 'abra', 'url': 'https://pokeapi.co/a...","[{'game_index': 148, 'version': {'name': 'red'...",9,"[{'item': {'name': 'twisted-spoon', 'url': 'ht...",True,https://pokeapi.co/api/v2/pokemon/63/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",abra,103,,[],"{'name': 'abra', 'url': 'https://pokeapi.co/ap...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 25, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'psychic', 'url'...",195
142,aerodactyl,"[{'ability': {'name': 'rock-head', 'url': 'htt...",180,"[{'name': 'aerodactyl', 'url': 'https://pokeap...","[{'game_index': 171, 'version': {'name': 'red'...",18,[],True,https://pokeapi.co/api/v2/pokemon/142/encounters,"[{'move': {'name': 'razor-wind', 'url': 'https...",aerodactyl,232,[],[],"{'name': 'aerodactyl', 'url': 'https://pokeapi...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 80, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'rock', 'url': '...",590
306,aggron,"[{'ability': {'name': 'sturdy', 'url': 'https:...",265,"[{'name': 'aggron', 'url': 'https://pokeapi.co...","[{'game_index': 384, 'version': {'name': 'ruby...",21,"[{'item': {'name': 'hard-stone', 'url': 'https...",True,https://pokeapi.co/api/v2/pokemon/306/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",aggron,411,[],[],"{'name': 'aggron', 'url': 'https://pokeapi.co/...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 70, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'steel', 'url': ...",3600
190,aipom,"[{'ability': {'name': 'run-away', 'url': 'http...",72,"[{'name': 'aipom', 'url': 'https://pokeapi.co/...","[{'game_index': 190, 'version': {'name': 'gold...",8,[],True,https://pokeapi.co/api/v2/pokemon/190/encounters,"[{'move': {'name': 'double-slap', 'url': 'http...",aipom,285,,[],"{'name': 'aipom', 'url': 'https://pokeapi.co/a...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 55, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",115
65,alakazam,"[{'ability': {'name': 'synchronize', 'url': 'h...",250,"[{'name': 'alakazam', 'url': 'https://pokeapi....","[{'game_index': 149, 'version': {'name': 'red'...",15,"[{'item': {'name': 'twisted-spoon', 'url': 'ht...",True,https://pokeapi.co/api/v2/pokemon/65/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",alakazam,105,,[],"{'name': 'alakazam', 'url': 'https://pokeapi.c...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 55, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'psychic', 'url'...",480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,yanma,"[{'ability': {'name': 'speed-boost', 'url': 'h...",78,"[{'name': 'yanma', 'url': 'https://pokeapi.co/...","[{'game_index': 193, 'version': {'name': 'gold...",12,"[{'item': {'name': 'wide-lens', 'url': 'https:...",True,https://pokeapi.co/api/v2/pokemon/193/encounters,"[{'move': {'name': 'wing-attack', 'url': 'http...",yanma,289,,[],"{'name': 'yanma', 'url': 'https://pokeapi.co/a...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 65, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'bug', 'url': 'h...",380
335,zangoose,"[{'ability': {'name': 'immunity', 'url': 'http...",160,"[{'name': 'zangoose', 'url': 'https://pokeapi....","[{'game_index': 380, 'version': {'name': 'ruby...",13,"[{'item': {'name': 'quick-claw', 'url': 'https...",True,https://pokeapi.co/api/v2/pokemon/335/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",zangoose,448,,[],"{'name': 'zangoose', 'url': 'https://pokeapi.c...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 73, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",403
145,zapdos,"[{'ability': {'name': 'pressure', 'url': 'http...",290,"[{'name': 'zapdos', 'url': 'https://pokeapi.co...","[{'game_index': 75, 'version': {'name': 'red',...",16,[],True,https://pokeapi.co/api/v2/pokemon/145/encounters,"[{'move': {'name': 'razor-wind', 'url': 'https...",zapdos,238,,[],"{'name': 'zapdos', 'url': 'https://pokeapi.co/...",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 90, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'electric', 'url...",526
263,zigzagoon,"[{'ability': {'name': 'pickup', 'url': 'https:...",56,"[{'name': 'zigzagoon', 'url': 'https://pokeapi...","[{'game_index': 288, 'version': {'name': 'ruby...",4,"[{'item': {'name': 'potion', 'url': 'https://p...",True,https://pokeapi.co/api/v2/pokemon/263/encounters,"[{'move': {'name': 'cut', 'url': 'https://poke...",zigzagoon,361,,[],"{'name': 'zigzagoon', 'url': 'https://pokeapi....",{'back_default': 'https://raw.githubuserconten...,"[{'base_stat': 38, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'normal', 'url':...",175


Clean Types into list of Types

In [39]:
pokemon_types = []
for val in tdf['types']:
    #print(val)
    res = [sub['type'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_types.append(results)

In [40]:
types_df = pd.DataFrame(columns = ["types"])
poke_types = []
for row in pokemon_types[:1281]:
    row_dict = {'types': row}
    poke_types.append(row_dict)

types_df = pd.DataFrame.from_dict(poke_types)
types_df.reset_index(drop=True)

Unnamed: 0,types
0,[psychic]
1,"[rock, flying]"
2,"[steel, rock]"
3,[normal]
4,[psychic]
...,...
330,"[bug, flying]"
331,[normal]
332,"[electric, flying]"
333,[normal]


Clean Abilities Column from List of Dictionaries into list of abilities

In [41]:
pokemon_abilities = []
for val in tdf['abilities']:
    #print(val)
    res = [sub['ability'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_abilities.append(results)

In [42]:
abilities_df = pd.DataFrame(columns = ["abilities"])
poke_abilities = []
for row in pokemon_abilities[:1281]:
    row_dict = {'abilities': row}
    poke_abilities.append(row_dict)

abilities_df = pd.DataFrame.from_dict(poke_abilities)
abilities_df.reset_index(drop=True)

Unnamed: 0,abilities
0,"[synchronize, inner-focus, magic-guard]"
1,"[rock-head, pressure, unnerve]"
2,"[sturdy, rock-head, heavy-metal]"
3,"[run-away, pickup, skill-link]"
4,"[synchronize, inner-focus, magic-guard]"
...,...
330,"[speed-boost, compound-eyes, frisk]"
331,"[immunity, toxic-boost]"
332,"[pressure, static]"
333,"[pickup, gluttony, quick-feet]"


Drop Old Abilities Column to prepare for Cleaned Abilities to Dataframe

In [43]:
tdf = tdf.drop(columns=['abilities'])

Clean moves into list of moves

In [44]:
pokemon_moves = []
for val in tdf['moves']:
    #print(val)
    res = [sub['move'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_moves.append(results)

In [45]:
moves_df = pd.DataFrame(columns = ["moves"])
poke_moves = []
for row in pokemon_moves[:1281]:
    row_dict = {'moves': row}
    poke_moves.append(row_dict)

moves_df = pd.DataFrame.from_dict(poke_moves)
moves_df.reset_index(drop=True)

Unnamed: 0,moves
0,"[mega-punch, fire-punch, ice-punch, thunder-pu..."
1,"[razor-wind, wing-attack, whirlwind, fly, head..."
2,"[mega-punch, fire-punch, ice-punch, thunder-pu..."
3,"[double-slap, mega-punch, fire-punch, ice-punc..."
4,"[mega-punch, fire-punch, ice-punch, thunder-pu..."
...,...
330,"[wing-attack, whirlwind, headbutt, tackle, dou..."
331,"[mega-punch, fire-punch, ice-punch, thunder-pu..."
332,"[razor-wind, whirlwind, fly, headbutt, take-do..."
333,"[cut, sand-attack, headbutt, tackle, body-slam..."


Clean Game Indices

In [46]:
pokemon_game = []
for val in tdf['game_indices']:
    #print(val)
    res = [sub['version'] for sub in val]
    #print(res) # list of dictionaries of moves
    results = [sub['name'] for sub in res]
    #print(results) # Woo! Output of Moves
    pokemon_game.append(results)

In [47]:
game_df = pd.DataFrame(columns = ["pokemon_games"])
poke_game = []
for row in pokemon_game[:1281]:
    row_dict = {'pokemon_games': row}
    poke_game.append(row_dict)

game_df = pd.DataFrame.from_dict(poke_game)
game_df.reset_index(drop=True)

Unnamed: 0,pokemon_games
0,"[red, blue, yellow, gold, silver, crystal, rub..."
1,"[red, blue, yellow, gold, silver, crystal, rub..."
2,"[ruby, sapphire, emerald, firered, leafgreen, ..."
3,"[gold, silver, crystal, ruby, sapphire, emeral..."
4,"[red, blue, yellow, gold, silver, crystal, rub..."
...,...
330,"[gold, silver, crystal, ruby, sapphire, emeral..."
331,"[ruby, sapphire, emerald, firered, leafgreen, ..."
332,"[red, blue, yellow, gold, silver, crystal, rub..."
333,"[ruby, sapphire, emerald, firered, leafgreen, ..."


In [48]:
tdf = tdf.reset_index(drop=True)

In [49]:
clean_df = pd.merge(tdf, abilities_df, left_index = True, right_index = True)
clean_df = clean_df.drop(columns=['moves', 'types', 'game_indices'])
clean_df = pd.merge(clean_df, moves_df, left_index = True, right_index = True)
clean_df = pd.merge(clean_df, types_df, left_index = True, right_index = True)
clean_df = pd.merge(clean_df, game_df, left_index = True, right_index = True)

In [50]:
clean_df = clean_df.drop(columns=['forms', 'held_items', 'is_default', 'location_area_encounters', 'past_types', 'species', 'sprites', 'stats'])
clean_df

Unnamed: 0,pokemon,base_experience,height,name,order,past_abilities,weight,abilities,moves,types,pokemon_games
0,abra,62,9,abra,103,,195,"[synchronize, inner-focus, magic-guard]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[psychic],"[red, blue, yellow, gold, silver, crystal, rub..."
1,aerodactyl,180,18,aerodactyl,232,[],590,"[rock-head, pressure, unnerve]","[razor-wind, wing-attack, whirlwind, fly, head...","[rock, flying]","[red, blue, yellow, gold, silver, crystal, rub..."
2,aggron,265,21,aggron,411,[],3600,"[sturdy, rock-head, heavy-metal]","[mega-punch, fire-punch, ice-punch, thunder-pu...","[steel, rock]","[ruby, sapphire, emerald, firered, leafgreen, ..."
3,aipom,72,8,aipom,285,,115,"[run-away, pickup, skill-link]","[double-slap, mega-punch, fire-punch, ice-punc...",[normal],"[gold, silver, crystal, ruby, sapphire, emeral..."
4,alakazam,250,15,alakazam,105,,480,"[synchronize, inner-focus, magic-guard]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[psychic],"[red, blue, yellow, gold, silver, crystal, rub..."
...,...,...,...,...,...,...,...,...,...,...,...
330,yanma,78,12,yanma,289,,380,"[speed-boost, compound-eyes, frisk]","[wing-attack, whirlwind, headbutt, tackle, dou...","[bug, flying]","[gold, silver, crystal, ruby, sapphire, emeral..."
331,zangoose,160,13,zangoose,448,,403,"[immunity, toxic-boost]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[normal],"[ruby, sapphire, emerald, firered, leafgreen, ..."
332,zapdos,290,16,zapdos,238,,526,"[pressure, static]","[razor-wind, whirlwind, fly, headbutt, take-do...","[electric, flying]","[red, blue, yellow, gold, silver, crystal, rub..."
333,zigzagoon,56,4,zigzagoon,361,,175,"[pickup, gluttony, quick-feet]","[cut, sand-attack, headbutt, tackle, body-slam...",[normal],"[ruby, sapphire, emerald, firered, leafgreen, ..."


In [51]:
clean_df = clean_df.drop(columns = ['pokemon_games']) # Dropped since too many empty lists.

#### Cleaning of Pokemon Colors and Species

In [52]:
pokemon = []
for val in cdf['pokemon_species']:
    # print(val.type) # Val is list
    res = [sub['name'] for sub in val]
    #print(res)
    pokemon.append(res)

In [53]:
black = pokemon[0]
blue = pokemon[1]
brown = pokemon[2]
gray = pokemon[3] 
green = pokemon[4]
pink = pokemon[5]
purple = pokemon[6]
red = pokemon[7]
white = pokemon[8]
yellow = pokemon[9]

In [54]:
pokemon_colors = pd.DataFrame((list(zip(black, blue, brown, gray, green, pink, purple, red, white, yellow))), columns = ['black', 'blue', 'brown', 'gray', 'green', 'pink', 'purple', 'red', 'white', 'yellow'])
pokemon_colors

Unnamed: 0,black,blue,brown,gray,green,pink,purple,red,white,yellow
0,murkrow,squirtle,weedle,machop,bulbasaur,slowpoke,rattata,charmander,seel,sandshrew
1,unown,nidoran-f,pidgey,magnemite,caterpie,exeggcute,ekans,paras,togepi,meowth
2,sneasel,oddish,spearow,onix,bellsprout,lickitung,nidoran-m,krabby,mareep,psyduck
3,houndour,poliwag,vulpix,rhyhorn,scyther,porygon,zubat,voltorb,smeargle,ponyta
4,mawile,tentacool,diglett,misdreavus,chikorita,mew,venonat,goldeen,lugia,drowzee
5,spoink,tangela,mankey,pineco,spinarak,cleffa,grimer,magikarp,wingull,zapdos
6,seviper,horsea,growlithe,qwilfish,natu,igglybuff,shellder,ledyba,ralts,moltres
7,shuppet,lapras,abra,remoraid,larvitar,hoppip,gastly,yanma,zangoose,cyndaquil
8,duskull,omanyte,geodude,skarmory,celebi,snubbull,koffing,slugma,absol,pichu
9,chatot,articuno,farfetchd,poochyena,treecko,corsola,ditto,delibird,pachirisu,sunkern


In [55]:
# Transform Pokemon Colors Dataframe to have two columns, pokemon and color

pokemon_colors = pokemon_colors.reset_index(inplace = False)

pc_df = pokemon_colors.melt(id_vars='index', var_name='color', value_name='pokemon')
print(pc_df.sample(5)) # Check to ensure that the melt was done correctly and colors still match with the pokemon

     index   color    pokemon
326     38  purple   indeedee
360     24     red      throh
72      24    blue     wynaut
20      20   black  pyukumuku
119     23   brown      entei


Merge Pokemon Colors to Pokemon Statistics Dataframe

In [56]:
final_df = pd.merge(clean_df, pc_df, on = 'pokemon')

In [57]:
clean_df

Unnamed: 0,pokemon,base_experience,height,name,order,past_abilities,weight,abilities,moves,types
0,abra,62,9,abra,103,,195,"[synchronize, inner-focus, magic-guard]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[psychic]
1,aerodactyl,180,18,aerodactyl,232,[],590,"[rock-head, pressure, unnerve]","[razor-wind, wing-attack, whirlwind, fly, head...","[rock, flying]"
2,aggron,265,21,aggron,411,[],3600,"[sturdy, rock-head, heavy-metal]","[mega-punch, fire-punch, ice-punch, thunder-pu...","[steel, rock]"
3,aipom,72,8,aipom,285,,115,"[run-away, pickup, skill-link]","[double-slap, mega-punch, fire-punch, ice-punc...",[normal]
4,alakazam,250,15,alakazam,105,,480,"[synchronize, inner-focus, magic-guard]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[psychic]
...,...,...,...,...,...,...,...,...,...,...
330,yanma,78,12,yanma,289,,380,"[speed-boost, compound-eyes, frisk]","[wing-attack, whirlwind, headbutt, tackle, dou...","[bug, flying]"
331,zangoose,160,13,zangoose,448,,403,"[immunity, toxic-boost]","[mega-punch, fire-punch, ice-punch, thunder-pu...",[normal]
332,zapdos,290,16,zapdos,238,,526,"[pressure, static]","[razor-wind, whirlwind, fly, headbutt, take-do...","[electric, flying]"
333,zigzagoon,56,4,zigzagoon,361,,175,"[pickup, gluttony, quick-feet]","[cut, sand-attack, headbutt, tackle, body-slam...",[normal]


In [59]:
clean_df = clean_df.drop(columns = ['index_x', 'color_x', 'index_y', 'color_y', 'index'])
clean_df

KeyError: "['index_x' 'color_x' 'index_y' 'color_y' 'index'] not found in axis"

In [60]:
clean_df.to_csv('Cleaned_Data.csv')

In [79]:
file_path = 'C:\\Users\\micha\\applied text mining\\final project\\Final_Data.csv'

pokemon_data = pd.read_csv(file_path)

In [81]:
print(pokemon_data.columns)

Index(['Unnamed: 0', 'pokemon', 'base_experience', 'height', 'name', 'order',
       'weight', 'abilities', 'moves', 'types'],
      dtype='object')


In [93]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

pokemon_data_encoded = pokemon_data.copy()

# Handle missing 'moves' data
pokemon_data_encoded['moves'] = pokemon_data_encoded['moves'].apply(lambda x: x if isinstance(x, list) else [])

# MultiLabelBinarizer encode 'abilities'
mlb_abilities = MultiLabelBinarizer()
abilities_encoded = mlb_abilities.fit_transform(pokemon_data_encoded['abilities'])
abilities_encoded_df = pd.DataFrame(abilities_encoded, columns=mlb_abilities.classes_)
pokemon_data_encoded = pd.concat([pokemon_data_encoded.drop('abilities', axis=1), abilities_encoded_df], axis=1)

# MultiLabelBinarizer encode 'moves'
mlb_moves = MultiLabelBinarizer()
moves_encoded = mlb_moves.fit_transform(pokemon_data_encoded['moves'])
moves_encoded_df = pd.DataFrame(moves_encoded, columns=mlb_moves.classes_)
pokemon_data_encoded = pd.concat([pokemon_data_encoded.drop('moves', axis=1), moves_encoded_df], axis=1)

# Separate features from the target
X = pokemon_data_encoded.drop('types', axis=1)  
y = pokemon_data['types']  

# MultiLabelBinarizer encode 'types'
mlb_types = MultiLabelBinarizer()
y_encoded = mlb_types.fit_transform(y)

# Split 
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# One hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align 
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Handle NaN 
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# RFC
clf = RandomForestClassifier(random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid search
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                           cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

# Get best estimator
best_clf = grid_search.best_estimator_

# Predictions
y_pred = best_clf.predict(X_test)

# Print
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 288 candidates, totalling 864 fits
              precision    recall  f1-score   support

           0       0.82      0.71      0.76       127
           1       1.00      1.00      1.00       226
           2       0.82      0.71      0.76       127
           3       0.80      0.92      0.86       117
           4       1.00      0.16      0.27        19
           5       0.86      0.32      0.46        79
           6       0.89      0.21      0.34        38
           7       0.79      0.62      0.69        91
           8       0.82      0.45      0.58        69
           9       0.85      0.79      0.82       121
          10       0.90      0.30      0.44        61
          11       0.84      0.90      0.87       141
          12       1.00      0.21      0.35        28
          13       0.92      0.46      0.62        71
          14       1.00      0.43      0.60        21
          15       0.74      0.79      0.76        96
          16      