# Pokedex Project

**Goal:** create a web-based interface that returns similar Pokemon when Pokemon name is inputted based on vectorized Pokedex descriptions.
    
    - Use cosine similarity to power search index
    
## Steps

### Data Collection:

 - Scrape data from Pokedata database website (https://pokemondb.net/pokedex/all): 
     
     - Get list of Pokemon names first from main page
     
     - Then scrape Pokedex entries from each Pokemon's respective page
     
### Search Index: Cosine Similarity

- Take code from Recommender System Project to create vectorized Pokedex entries for each Pokemon

### Data Storage:

 - Use Azure Blob Storage to create Vector Database to store embeddings

In [1]:
# install libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import string
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import stats
from sklearn.metrics.pairwise import linear_kernel # for cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import requests
import csv

# !pip install bs4 pandas requests

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Collection: Web Scraping

Collect the data from pokemon database website

In [3]:
# build scraper for pokemone names and put into CSV

# Establish link to page and store source content
print('Connecting...')

page_limit = 100 # how many pages desired, change val here

pages = range(0,page_limit+1) # first 65 pages of games on metacritic game reviews

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

# declare list where data will be stored
name_data = []


page = requests.get('https://pokemondb.net/pokedex/all', # url
                    headers=headers)
src = page.content

# Verify connection to page is good

print('Connection established') if page.status_code == 200 else 'Error'

print('Reading page...')

# create BS4 object
soup = BeautifulSoup(src, 'html.parser')

names = soup.find_all('a', class_='ent-name')

ids = soup.find_all('span', class_='infocard-cell-data')

pokemon = {id_.text:name.text for id_, name in zip(ids, names)}

#pokemon
print(len(pokemon))

Connecting...
Connection established
Reading page...
1024


In [147]:
df

Unnamed: 0,name,pokedex_desc,poke_desc_proc
0001,Bulbasaur,[A strange seed was planted on its back at bir...,"[single, grows, us, eating, right, energy, mor..."
0002,Ivysaur,"[When the bulb on its back grows large, it app...","[sturdy, grows, trunk, energy, nutrient, indic..."
0003,Venusaur,[The plant blooms when it is absorbing solar e...,"[powerful, summertime, bewitching, absorbed, c..."
0004,Charmander,"[Obviously prefers hot places. When it rains, ...","[waver, force, flame, bit, thing, burn, born, ..."
0005,Charmeleon,"[When it swings its burning tail, it elevates ...","[unbearably, constantly, flame, tough, surroun..."
...,...,...,...
1020,Gouging Fire,[],[]
1021,Raging Bolt,[],[]
1022,Iron Boulder,[],[]
1023,Iron Crown,[],[]


In [7]:
# safe pokemon names and ids to csv

df = pd.DataFrame(index = pokemon.keys(), data = pokemon.values(), columns=['name'])

df.to_csv("pokemon_names.csv", index=True, index_label="id")

df

Unnamed: 0,name
0001,Bulbasaur
0002,Ivysaur
0003,Venusaur
0004,Charmander
0005,Charmeleon
...,...
1020,Gouging Fire
1021,Raging Bolt
1022,Iron Boulder
1023,Iron Crown


In [4]:
# get pokemon entries

basepath = "https://pokemondb.net/pokedex/"

poke_list = df.name.str.lower().to_list()

poke_list = [name.replace(" ", "-").replace(".", "").replace("'", '') if ' ' in name else name.replace(".", "").replace("'", '') for name in poke_list]

poke_list = [name.replace("♀", "-f") if '♀' in name else name.replace("♂", "-m").replace("é", "e").replace(":","") for name in poke_list]



In [5]:
poke_list[:5]

['bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon']

In [80]:
# build scraper code

import re

test_path = basepath + "pichu"#poke_list[0]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

page = requests.get(test_path, # url
                    headers=headers)
src = page.content

# Verify connection to page is good

print('Connection established') if page.status_code == 200 else 'Error'

print('Reading page...')

# create BS4 object
soup = BeautifulSoup(src, 'html.parser')

# pokedex entries

entries = soup.find_all('td', class_='cell-med-text')

entries = [entry.text for entry in entries]

# species

species = soup.find("table", class_='vitals-table').find_all("td")[2].text

# abilities

abilities = soup.find("table", class_='vitals-table').find_all("td")[5].text.replace("(hidden ability)", "")[2:].strip()

# type

types = soup.find("table", class_='vitals-table').find_all("td")[1].text.replace("\n", "").strip()

# egg_group

egg_group = soup.find_all("table", class_='vitals-table')[2].find_all("td")[0].text

# evolution tree

evolutions = soup.find_all("span", class_="infocard-lg-data text-muted")

evolutions = [i.text.split(" ")[1].lower() for i in evolutions]

try: 
    evolutions.remove("pichu")
except ValueError:
    pass

# height

h_pattern = '(\d+.\d+)'

height = soup.find_all("table", class_='vitals-table')[0].find_all("td")[3].text

h_match = re.search(h_pattern, height).group(0)

# weight

w_pattern = '(\d+.\d+)'

weight = soup.find_all("table", class_='vitals-table')[0].find_all("td")[4].text#[:4]

w_match = re.search(w_pattern, weight).group(0)

# gender_dist

gender = soup.find_all("table", class_='vitals-table')[2].find_all("td")[1].text

print(gender)

if gender == "Genderless":
    
    male = np.nan
    
    female = np.nan
    
else:

    male = gender.split(',')[0].split(" ")#[0][:-1])

    male = [i for i in male if i]

    male = float(male[0][:-1])/100

    female = gender.split(',')[1].split(" ")#[0]#[:-1])/100

    female = [i for i in female if i]

    female = float(female[0][:-1])/100

# entry_len = [len(entry) for entry in entries]

#ids = soup.find_all('span', class_='infocard-cell-data')

#entries

#species

#abilities

#types

# print(w_match, h_match)

gender
print(male)
print(female)
print(h_match)
print(w_match)

#bulb_series = pd.Series(entries)



Connection established
Reading page...
50% male, 50% female
0.5
0.5
0.3
2.0


In [25]:
bulb_series = pd.Series(entries, name="entries")

bulb_series

0     A strange seed was planted on its back at birt...
1     It can go for days without eating a single mor...
2     The seed on its back is filled with nutrients....
3     It carries a seed on its back right from birth...
4     While it is young, it uses the nutrients that ...
5     BULBASAUR can be seen napping in bright sunlig...
6     There is a plant seed on its back right from t...
7     A strange seed was planted on its back at birt...
8     For some time after its birth, it grows by gai...
9     The seed on its back is filled with nutrients....
10    It carries a seed on its back right from birth...
11    For some time after its birth, it grows by gai...
12    A strange seed was planted on its back at birt...
13    For some time after its birth, it grows by gai...
14    Bulbasaur can be seen napping in bright sunlig...
15    It can go for days without eating a single mor...
16    There is a plant seed on its back right from t...
17    While it is young, it uses the nutrients t

In [82]:
# loop through links and get entries for each pokemon
# convert entries to list for each pokemon
# add as new column to df

pokedex_entries = []

pokedex_species = []

pokedex_abilities = []

pokedex_types = []

pokedex_eggs = []

pokemon_evolutions = []

pokemon_height = []

pokemon_weight = []

pokemon_male = []

pokemon_female = []

counter = 0

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

for pokemon in poke_list:
    
    path = basepath+pokemon
    
    page = requests.get(path, # url
                    headers=headers)
    
    src = page.content

    # Verify connection to page is good

#     print('Connection established') if page.status_code == 200 else 'Error'

#     print('Reading page...')

    # create BS4 object
    soup = BeautifulSoup(src, 'html.parser')

    # entries
    
    entries = soup.find_all('td', class_='cell-med-text')

    entries = [entry.text for entry in entries]
    
    # species

    try:
    
        species = soup.find("table", class_='vitals-table').find_all("td")[2].text
        
    except AttributeError:
        
        print(f"not found for {pokemon}")
        pass

    # abilities

    abilities = soup.find("table", class_='vitals-table').find_all("td")[5].text.replace("(hidden ability)", "")[2:].strip()

    # type

    types = soup.find("table", class_='vitals-table').find_all("td")[1].text.replace("\n", "").strip()

    # egg_group

    egg_group = soup.find_all("table", class_='vitals-table')[2].find_all("td")[0].text
    
    # evolution tree

    evolutions = soup.find_all("span", class_="infocard-lg-data text-muted")

    evolutions = [i.text.split(" ")[1].lower() for i in evolutions]
    
    try:
        evolutions.remove(pokemon)
    except ValueError:
        #print(f"{pokemon} not in list, continuing...")
        pass
        
    # height

    h_pattern = '(\d+.\d+)'

    height = soup.find_all("table", class_='vitals-table')[0].find_all("td")[3].text

    h_match = re.search(h_pattern, height).group(0)

    # weight

    w_pattern = '(\d+.\d+)'

    weight = soup.find_all("table", class_='vitals-table')[0].find_all("td")[4].text#[:4]

    w_match = re.search(w_pattern, weight).group(0)

    # gender_dist

    gender = soup.find_all("table", class_='vitals-table')[2].find_all("td")[1].text

    #print(gender)

    if gender == "Genderless":

        male = np.nan

        female = np.nan
        
        

    else:

        male = gender.split(',')[0].split(" ")#[0][:-1])

        male = [i for i in male if i]

        male = float(male[0][:-1])/100

        female = gender.split(',')[1].split(" ")#[0]#[:-1])/100

        female = [i for i in female if i]

        female = float(female[0][:-1])/100
    
    pokedex_entries.append(entries)
    pokedex_species.append(species)
    pokedex_abilities.append(abilities)
    pokedex_types.append(types)
    pokedex_eggs.append(egg_group)
    pokemon_evolutions.append(evolutions)
    pokemon_height.append(h_match)
    pokemon_weight.append(w_match)
    pokemon_male.append(male)
    pokemon_female.append(female)
    
    #print(pokemon)
    
df["pokedex_desc"] = pokedex_entries
df["species"] = pokedex_species
df["ability"] = pokedex_abilities
df["type"] = pokedex_types
df["egg_group"] = pokedex_eggs
df["evolution_tree"] = pokemon_evolutions
df["height"] = pokemon_height
df["weight"] = pokemon_weight
df["male_dist"] = pokemon_male
df["female_dist"] = pokemon_female

df

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type,egg_group,evolution_tree,height,weight,male_dist,female_dist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Bulbasaur,[A strange seed was planted on its back at bir...,Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[ivysaur, venusaur]",0.7,6.9,0.875,0.125
2,Ivysaur,"[When the bulb on its back grows large, it app...",Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[bulbasaur, venusaur]",1.0,13.0,0.875,0.125
3,Venusaur,[The plant blooms when it is absorbing solar e...,Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[bulbasaur, ivysaur]",2.0,100.0,0.875,0.125
4,Charmander,"[Obviously prefers hot places. When it rains, ...",Lizard Pokémon,BlazeSolar Power,Fire,"Dragon, Monster","[charmeleon, charizard]",0.6,8.5,0.875,0.125
5,Charmeleon,"[When it swings its burning tail, it elevates ...",Flame Pokémon,BlazeSolar Power,Fire,"Dragon, Monster","[charmander, charizard]",1.1,19.0,0.875,0.125
...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire Dragon,Undiscovered,[],3.5,590.0,,
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric Dragon,Undiscovered,[],5.2,480.0,,
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock Psychic,Undiscovered,[],1.5,162.5,,
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel Psychic,Undiscovered,[],1.6,156.0,,


In [83]:
# save df

df.to_csv("pokedex_data.csv", index=True, index_label="id")

In [182]:


species_list = [[s] for s in df.species]

species_list[:10]

[['Seed Pokémon'],
 ['Seed Pokémon'],
 ['Seed Pokémon'],
 ['Lizard Pokémon'],
 ['Flame Pokémon'],
 ['Flame Pokémon'],
 ['Tiny Turtle Pokémon'],
 ['Turtle Pokémon'],
 ['Shellfish Pokémon'],
 ['Worm Pokémon']]

In [22]:
species_proc = df.species.str.lower().str.replace(" ", "").str.replace("pokémon", "")

ability_proc = df.ability.str.lower().str.replace(" ", "").str.replace(".", "")

type_proc = df.type.str.lower().str.replace(" ", "")#.str.replace(".", "")

egg_proc = df.egg_group.str.lower().str.replace(",", "").str.replace("-", "").str.replace("1", "").str.replace("2", "").str.replace("3", "").str.replace(" ", "")

egg_proc.values

  ability_proc = pd.read_csv("pokedex_data.csv", index_col="id").ability.str.lower().str.replace(" ", "").str.replace(".", "")


array(['grassmonster', 'grassmonster', 'grassmonster', ...,
       'undiscovered', 'undiscovered', 'undiscovered'], dtype=object)

In [85]:
# clean/process corpus

# Function to streamline NLP Process

def nlp(df, text):
    # Load string
    # raw_data = pd.read_csv(file + '.csv')
    
    df['dummy'] = df[text].astype(str)
    # Convert to lowercase
    dummy_gen = (post.lower() for post in df['dummy'])
    df['dummy'] = [i for i in dummy_gen]

    # Word & Sentence Tokenization
    token_post = (word_tokenize(post) for post in df['dummy'])
    token_post = [i for i in token_post]

#     token_post = [sent_tokenize(post) for post in token_post]
    
    #df['token_post'] = token_post
    
    # Remove Punctuation
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

    no_punc = []

    for filt in token_post:
        review = []
        for token in filt:
            new_token = reg.sub(u'', token)
            if not new_token == u'':
                review.append(new_token)
        no_punc.append(review)
        
#     df['token_post'] = no_punc
        
    # Remove Stopwords
    no_stop = []

    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)

        no_stop.append(new_term_vector)
        
#     df['token_post'] = no_stop
        
    # Stemming & Lemmatization
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()

    preproc_text = []

    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))

        preproc_text.append(final_text)
        
    #df['token_post'] = preproc_text
        
    # create final data set
    #data = df.copy()

    #new_col = pd.Series(preproc_text)
    df['poke_desc_proc'] = [list(set(i)) for i in preproc_text]
    df.drop('dummy', axis=1, inplace=True)
    return df

# proc_data = nlp(bulb_series, 'entries')

# proc_data

In [94]:
df.to_csv("pokemon_names.csv", index=True, index_label="id")

In [9]:
#df.pokedex_desc.isnull().sum()

# convert pokedex descriptions using nlp function

proc_poke_data = nlp(proc_poke_data, 'pokedex_desc')

proc_poke_data

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"[larger, filled, time, nutrient, energy, grow,..."
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"[blossom, bud, appears, sweet, spends, larger,..."
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"[used, bewitching, become, larger, venusaur, p..."
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"[obviously, tail, hear, fiercely, thing, tip, ..."
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"[tail, calm, tip, night, blow, claw, enemy, bl..."
...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire,Dragon,Undiscovered,,True,False,[]
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric,Dragon,Undiscovered,,True,False,[]
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock,Psychic,Undiscovered,,True,False,[]
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel,Psychic,Undiscovered,,True,False,[]


In [87]:
proc_data = pd.read_csv("pokedex_data_proc.csv", index_col='id')

proc_data["evolution_tree"] = pokemon_evolutions
proc_data["height"] = pokemon_height
proc_data["weight"] = pokemon_weight
proc_data["male_dist"] = pokemon_male
proc_data["female_dist"] = pokemon_female

proc_data

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['larger', 'filled', 'time', 'nutrient', 'ener...","[ivysaur, venusaur]",0.7,6.9,0.875,0.125
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['blossom', 'bud', 'appears', 'sweet', 'spends...","[bulbasaur, venusaur]",1.0,13.0,0.875,0.125
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['used', 'bewitching', 'become', 'larger', 've...","[bulbasaur, ivysaur]",2.0,100.0,0.875,0.125
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['obviously', 'tail', 'hear', 'fiercely', 'thi...","[charmeleon, charizard]",0.6,8.5,0.875,0.125
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['tail', 'calm', 'tip', 'night', 'blow', 'claw...","[charmander, charizard]",1.1,19.0,0.875,0.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire,Dragon,Undiscovered,,True,False,[],[],3.5,590.0,,
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric,Dragon,Undiscovered,,True,False,[],[],5.2,480.0,,
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock,Psychic,Undiscovered,,True,False,[],[],1.5,162.5,,
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel,Psychic,Undiscovered,,True,False,[],[],1.6,156.0,,


In [116]:
proc_data.query('name == "Virizion"')

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
640,Virizion,['This Pokémon fought humans in order to prote...,Grassland Pokémon,Justified,Grass,Fighting,Undiscovered,,True,False,"['confounds', 'swiftly', 'unova', 'legend', 'o...",[],2.0,200.0,,


In [120]:
# add legendary column

legendaries = []

endpoint = "https://www.serebii.net/pokemon/legendary.shtml" # website with list of legendary pokemon

page = requests.get(endpoint, # url
                    headers=headers)
    
src = page.content

# Verify connection to page is good

#     print('Connection established') if page.status_code == 200 else 'Error'

#     print('Reading page...')

# create BS4 object
soup = BeautifulSoup(src, 'html.parser')

legends = soup.find_all('td', attrs={'align':'center'})

legends = [i.text for i in legends if i.text in proc_data.name.tolist()]#'\n' not in i and 'Until' not in i]

#legends = [i for i in legends if i in proc_data.name.tolist()]

legends

['Articuno',
 'Zapdos',
 'Moltres',
 'Raikou',
 'Entei',
 'Suicune',
 'Regirock',
 'Regice',
 'Registeel',
 'Latias',
 'Latios',
 'Uxie',
 'Mesprit',
 'Azelf',
 'Heatran',
 'Regigigas',
 'Cresselia',
 'Cobalion',
 'Terrakion',
 'Virizion',
 'Tornadus',
 'Thundurus',
 'Landorus',
 'Type: Null',
 'Silvally',
 'Tapu Koko',
 'Tapu Lele',
 'Tapu Bulu',
 'Tapu Fini',
 'Kubfu',
 'Urshifu',
 'Regieleki',
 'Regidrago',
 'Glastrier',
 'Spectrier',
 'Enamorus',
 'Wo-Chien',
 'Chien-Pao',
 'Ting-Lu',
 'Chi-Yu',
 'Okidogi',
 'Munkidori',
 'Fezandipiti',
 'Ogerpon',
 'Mewtwo',
 'Lugia',
 'Kyogre',
 'Groudon',
 'Rayquaza',
 'Dialga',
 'Palkia',
 'Giratina',
 'Reshiram',
 'Zekrom',
 'Kyurem',
 'Xerneas',
 'Yveltal',
 'Zygarde',
 'Cosmog',
 'Cosmoem',
 'Solgaleo',
 'Lunala',
 'Necrozma',
 'Zacian',
 'Zamazenta',
 'Eternatus',
 'Calyrex',
 'Koraidon',
 'Miraidon',
 'Terapagos',
 'Mew',
 'Celebi',
 'Jirachi',
 'Deoxys',
 'Phione',
 'Manaphy',
 'Darkrai',
 'Shaymin',
 'Arceus',
 'Victini',
 'Keldeo',
 'Me

In [132]:
proc_data['is_legendary'] = [True if pokemon in set(legends) else False for pokemon in proc_data.name]

proc_data

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist,is_legendary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['larger', 'filled', 'time', 'nutrient', 'ener...","[ivysaur, venusaur]",0.7,6.9,0.875,0.125,False
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['blossom', 'bud', 'appears', 'sweet', 'spends...","[bulbasaur, venusaur]",1.0,13.0,0.875,0.125,False
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['used', 'bewitching', 'become', 'larger', 've...","[bulbasaur, ivysaur]",2.0,100.0,0.875,0.125,False
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['obviously', 'tail', 'hear', 'fiercely', 'thi...","[charmeleon, charizard]",0.6,8.5,0.875,0.125,False
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['tail', 'calm', 'tip', 'night', 'blow', 'claw...","[charmander, charizard]",1.1,19.0,0.875,0.125,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire,Dragon,Undiscovered,,True,False,[],[],3.5,590.0,,,False
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric,Dragon,Undiscovered,,True,False,[],[],5.2,480.0,,,False
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock,Psychic,Undiscovered,,True,False,[],[],1.5,162.5,,,False
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel,Psychic,Undiscovered,,True,False,[],[],1.6,156.0,,,False


In [134]:
proc_data.is_legendary.unique()

array([False,  True])

In [133]:
#proc_data['poke_desc_proc'].iloc[0]

proc_data.query('name == "Mew"')

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist,is_legendary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
151,Mew,['So rare that it is still said to be a mirage...,New Species Pokémon,Synchronize,Psychic,,Undiscovered,,False,False,"['south', 'appears', 'contain', 'dna', 'posse'...",[],0.4,4.0,,,True


In [98]:
#list(set(proc_poke_data.token_post[0]))

proc_poke_data.poke_desc_proc[0]

['single',
 'grows',
 'us',
 'eating',
 'right',
 'energy',
 'morsel',
 'born',
 'nutrient',
 'slowly',
 'steadily',
 'nourishment',
 'carry',
 'pokmon',
 'bulb',
 'sun',
 'seed',
 'time',
 'young',
 'gaining',
 'without',
 'taking',
 'stored',
 'planted',
 'back',
 'day',
 'bulbasaur',
 'soaking',
 'order',
 'sunlight',
 'birth',
 'plant',
 'seen',
 'napping',
 'older',
 'larger',
 'also',
 'grow',
 'ray',
 'filled',
 'body',
 'store',
 'progressively',
 'go',
 'sprout',
 'strange',
 'bright']

In [137]:
# save results

proc_data.to_csv("pokedex_data_proc.csv", index=True, index_label="id")
#.drop("token_post", axis=1, inplace=True)


In [7]:
proc_data = pd.read_csv("pokedex_data_proc.csv", index_col="id")#.poke_desc_proc

proc_data

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist,is_legendary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['larger', 'filled', 'time', 'nutrient', 'ener...","['ivysaur', 'venusaur']",0.7,6.9,0.875,0.125,False
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['blossom', 'bud', 'appears', 'sweet', 'spends...","['bulbasaur', 'venusaur']",1.0,13.0,0.875,0.125,False
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['used', 'bewitching', 'become', 'larger', 've...","['bulbasaur', 'ivysaur']",2.0,100.0,0.875,0.125,False
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['obviously', 'tail', 'hear', 'fiercely', 'thi...","['charmeleon', 'charizard']",0.6,8.5,0.875,0.125,False
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['tail', 'calm', 'tip', 'night', 'blow', 'claw...","['charmander', 'charizard']",1.1,19.0,0.875,0.125,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire,Dragon,Undiscovered,,True,False,[],[],3.5,590.0,,,False
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric,Dragon,Undiscovered,,True,False,[],[],5.2,480.0,,,False
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock,Psychic,Undiscovered,,True,False,[],[],1.5,162.5,,,False
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel,Psychic,Undiscovered,,True,False,[],[],1.6,156.0,,,False


In [367]:
# split type and egg group into separate dimensions

proc_poke_data = pd.read_csv("pokedex_data.csv", index_col="id")

proc_poke_data[["type_1", "type_2"]] = proc_poke_data["type"].str.split(expand=True)

proc_poke_data[["egg_group_1", "egg_group_2"]] = proc_poke_data["egg_group"].str.split(',', expand=True)

proc_poke_data['dual_type'] = [True if type_2 != None else False for type_2 in proc_poke_data.type_2]

proc_poke_data['dual_egg_group'] = [True if group_2 != None else False for group_2 in proc_poke_data.egg_group_2]

proc_poke_data['poke_desc_proc'] = proc_poke_desc

proc_poke_data['evolution_tree'] = df['evolution_tree']

#proc_poke_data.drop(['type', 'egg_group'], axis=1, inplace=True)

proc_poke_data

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type,egg_group,evolution_tree,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[ivysaur, venusaur]",Grass,Poison,Grass,Monster,True,True,"['larger', 'filled', 'time', 'nutrient', 'ener..."
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[bulbasaur, venusaur]",Grass,Poison,Grass,Monster,True,True,"['blossom', 'bud', 'appears', 'sweet', 'spends..."
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass Poison,"Grass, Monster","[bulbasaur, ivysaur]",Grass,Poison,Grass,Monster,True,True,"['used', 'bewitching', 'become', 'larger', 've..."
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,"Dragon, Monster","[charmeleon, charizard]",Fire,,Dragon,Monster,False,True,"['obviously', 'tail', 'hear', 'fiercely', 'thi..."
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,"Dragon, Monster","[charmander, charizard]",Fire,,Dragon,Monster,False,True,"['tail', 'calm', 'tip', 'night', 'blow', 'claw..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire Dragon,Undiscovered,[],Fire,Dragon,Undiscovered,,True,False,[]
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric Dragon,Undiscovered,[],Electric,Dragon,Undiscovered,,True,False,[]
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock Psychic,Undiscovered,[],Rock,Psychic,Undiscovered,,True,False,[]
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel Psychic,Undiscovered,[],Steel,Psychic,Undiscovered,,True,False,[]


In [None]:
pokemon_evolutions

In [8]:
# more data processing

species_proc = proc_data.species.str.lower().str.replace(" ", "").str.replace("pokémon", "")

ability_proc = proc_data.ability.str.lower().str.replace(" ", "").str.replace(".", "")

type_proc_1 = proc_data.type_1.str.lower().str.replace(" ", "")#.str.replace(".", "")

type_proc_2 = proc_data.type_2.str.lower().str.replace(" ", "")

egg_proc_1 = proc_data.egg_group_1.str.lower().str.replace(",", "").str.replace("-", "").str.replace("1", "").str.replace("2", "").str.replace("3", "").str.replace(" ", "").str.strip()

egg_proc_2 = proc_data.egg_group_2.str.lower().str.replace(",", "").str.replace("-", "").str.replace("1", "").str.replace("2", "").str.replace("3", "").str.replace(" ", "").str.strip()

dual_type = proc_data.dual_type

dual_egg_group = proc_data.dual_egg_group

height_proc = proc_data.height

weight_proc = proc_data.weight

male_proc = proc_data.male_dist

female_proc = proc_data.female_dist

legend_status = proc_data.is_legendary

  ability_proc = proc_data.ability.str.lower().str.replace(" ", "").str.replace(".", "")


In [43]:
species_proc

id
1          seed
2          seed
3          seed
4        lizard
5         flame
         ...   
1020    paradox
1021    paradox
1022    paradox
1023    paradox
1024       tera
Name: species, Length: 1024, dtype: object

In [17]:
#Define TF-IDF Vectorizer Object

tfidf = TfidfVectorizer(ngram_range=(1,2))

# proc_poke_data['poke_desc_proc'] and proc_poke_data['evolution_tree'] are lists of lists

desc_documents = [''.join(map(str, i)) for i in proc_data['poke_desc_proc']]
evol_documents = [''.join(map(str, i)) for i in proc_data['evolution_tree']]

# Combine the documents

all_documents = desc_documents + evol_documents

# Construct the required TF-IDF matrix by fitting and transforming the data

tfidf_matrix_combined = tfidf.fit_transform(all_documents)

# Split the tfidf_matrix_combined into tfidf_matrix_description and tfidf_matrix_evolutions

tfidf_matrix_description = tfidf_matrix_combined[:len(desc_documents)]
tfidf_matrix_evolutions = tfidf_matrix_combined[len(desc_documents):]

# Output the shape of tfidf_matrix

print(tfidf_matrix_description.shape)
print(tfidf_matrix_evolutions.shape)


(1024, 60345)
(1024, 60345)


In [None]:
proc_data['poke_desc_proc'].iloc[0]

all_documents

In [398]:
#tfidf_matrix.toarray()

[i.strip('][').strip("'").rstrip("'").split(', ') for i in proc_poke_data['poke_desc_proc']][0]

["larger'",
 "'filled'",
 "'time'",
 "'nutrient'",
 "'energy'",
 "'grow'",
 "'pokmon'",
 "'go'",
 "'slowly'",
 "'seed'",
 "'carry'",
 "'us'",
 "'steadily'",
 "'grows'",
 "'also'",
 "'order'",
 "'bright'",
 "'strange'",
 "'bulbasaur'",
 "'sunlight'",
 "'older'",
 "'nourishment'",
 "'stored'",
 "'ray'",
 "'young'",
 "'soaking'",
 "'gaining'",
 "'taking'",
 "'sun'",
 "'right'",
 "'plant'",
 "'sprout'",
 "'back'",
 "'bulb'",
 "'day'",
 "'planted'",
 "'birth'",
 "'seen'",
 "'morsel'",
 "'without'",
 "'single'",
 "'store'",
 "'born'",
 "'progressively'",
 "'body'",
 "'eating'",
 "'napping"]

In [138]:
proc_data

#df

Unnamed: 0_level_0,name,pokedex_desc,species,ability,type_1,type_2,egg_group_1,egg_group_2,dual_type,dual_egg_group,poke_desc_proc,evolution_tree,height,weight,male_dist,female_dist,is_legendary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['larger', 'filled', 'time', 'nutrient', 'ener...","[ivysaur, venusaur]",0.7,6.9,0.875,0.125,False
2,Ivysaur,"['When the bulb on its back grows large, it ap...",Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['blossom', 'bud', 'appears', 'sweet', 'spends...","[bulbasaur, venusaur]",1.0,13.0,0.875,0.125,False
3,Venusaur,['The plant blooms when it is absorbing solar ...,Seed Pokémon,OvergrowChlorophyll,Grass,Poison,Grass,Monster,True,True,"['used', 'bewitching', 'become', 'larger', 've...","[bulbasaur, ivysaur]",2.0,100.0,0.875,0.125,False
4,Charmander,"['Obviously prefers hot places. When it rains,...",Lizard Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['obviously', 'tail', 'hear', 'fiercely', 'thi...","[charmeleon, charizard]",0.6,8.5,0.875,0.125,False
5,Charmeleon,"['When it swings its burning tail, it elevates...",Flame Pokémon,BlazeSolar Power,Fire,,Dragon,Monster,False,True,"['tail', 'calm', 'tip', 'night', 'blow', 'claw...","[charmander, charizard]",1.1,19.0,0.875,0.125,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,Gouging Fire,[],Paradox Pokémon,Protosynthesis,Fire,Dragon,Undiscovered,,True,False,[],[],3.5,590.0,,,False
1021,Raging Bolt,[],Paradox Pokémon,Protosynthesis,Electric,Dragon,Undiscovered,,True,False,[],[],5.2,480.0,,,False
1022,Iron Boulder,[],Paradox Pokémon,Quark Drive,Rock,Psychic,Undiscovered,,True,False,[],[],1.5,162.5,,,False
1023,Iron Crown,[],Paradox Pokémon,Quark Drive,Steel,Psychic,Undiscovered,,True,False,[],[],1.6,156.0,,,False


In [385]:
test_list = proc_poke_data.poke_desc_proc.iloc[0].strip('][').strip("''").rstrip("'").split(', ')

#test_list = [i[:-1] for i in test_list if "'" in i]

test_list

["larger'",
 "'filled'",
 "'time'",
 "'nutrient'",
 "'energy'",
 "'grow'",
 "'pokmon'",
 "'go'",
 "'slowly'",
 "'seed'",
 "'carry'",
 "'us'",
 "'steadily'",
 "'grows'",
 "'also'",
 "'order'",
 "'bright'",
 "'strange'",
 "'bulbasaur'",
 "'sunlight'",
 "'older'",
 "'nourishment'",
 "'stored'",
 "'ray'",
 "'young'",
 "'soaking'",
 "'gaining'",
 "'taking'",
 "'sun'",
 "'right'",
 "'plant'",
 "'sprout'",
 "'back'",
 "'bulb'",
 "'day'",
 "'planted'",
 "'birth'",
 "'seen'",
 "'morsel'",
 "'without'",
 "'single'",
 "'store'",
 "'born'",
 "'progressively'",
 "'body'",
 "'eating'",
 "'napping"]

In [40]:
# perform one hot encoding on other categorical variables

from sklearn.preprocessing import OneHotEncoder

# Convert categorical variable to one-hot encoding
encoder = OneHotEncoder(sparse=False, drop='first')

species_encoded = encoder.fit_transform(species_proc.values.reshape(-1, 1))

ability_encoded = encoder.fit_transform(ability_proc.values.reshape(-1, 1))

type_encoded = encoder.fit_transform(type_proc.values.reshape(-1, 1))

type_encoded_1 = encoder.fit_transform(type_proc_1.values.reshape(-1, 1))#types_coded.type_1.values.reshape(-1,1) #encoder.fit_transform(type_proc_1.values.reshape(-1, 1))

type_encoded_2 = encoder.fit_transform(type_proc_2.values.reshape(-1, 1))#types_coded.type_2.values.reshape(-1,1) #encoder.fit_transform(type_proc_2.values.reshape(-1, 1))

egg_encoded = encoder.fit_transform(egg_proc.values.reshape(-1, 1))

egg_encoded_1 = encoder.fit_transform(egg_proc_1.values.reshape(-1, 1)) # eggs_coded.egg_group_1.values.reshape(-1,1)

egg_encoded_2 = encoder.fit_transform(egg_proc_2.values.reshape(-1, 1)) # eggs_coded.egg_group_2.values.reshape(-1,1)

dual_type_encoded = encoder.fit_transform(dual_type.values.reshape(-1, 1))

dual_egg_group_encoded = encoder.fit_transform(dual_egg_group.values.reshape(-1, 1))

height_encoded = encoder.fit_transform(height_proc.values.reshape(-1, 1))

weight_encoded = encoder.fit_transform(weight_proc.values.reshape(-1, 1))

male_encoded = encoder.fit_transform(male_proc.values.reshape(-1, 1))

female_encoded = encoder.fit_transform(female_proc.values.reshape(-1, 1))

legend_encoded = encoder.fit_transform(legend_status.values.reshape(-1, 1))

# Combine text embedding and categorical encoding
combined_embedding_v1 = np.concatenate((tfidf_matrix_description.toarray(), 
                                     species_encoded, 
                                     ability_encoded,
                                     type_encoded,
                                     egg_encoded), axis=1)

# Combine text embedding and categorical encoding for v2
combined_embedding_v2 = np.concatenate((tfidf_matrix_description.toarray(), 
                                        tfidf_matrix_evolutions.toarray(),
                                     species_encoded, 
                                     ability_encoded,
                                     type_encoded_1,
                                     type_encoded_2,
                                     egg_encoded_1,
                                     egg_encoded_2,
                                     dual_type_encoded,
                                     dual_egg_group_encoded), axis=1)

# Combine text embedding and categorical encoding for v3
combined_embedding_v3 = np.concatenate((tfidf_matrix_description.toarray(), 
                                        tfidf_matrix_evolutions.toarray(),
                                     species_encoded, 
                                     ability_encoded,
                                     type_encoded_1,
                                     type_encoded_2,
                                     egg_encoded_1,
                                     egg_encoded_2,
                                     dual_type_encoded,
                                     dual_egg_group_encoded,
                                     height_encoded,
                                     weight_encoded,
                                     male_encoded,
                                     female_encoded,
                                     legend_encoded), axis=1)


combined_embedding_v3

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [38]:
len(species_encoded[6])

712

In [144]:
# try eucliedean and manhattan distance

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity

In [79]:
# compute similarity matrices

cosine_sim_v4 = linear_kernel(combined_embedding_v3, combined_embedding_v3)

cosine_sim_v3 = linear_kernel(combined_embedding_v2, combined_embedding_v2)

cosine_sim_v2 = linear_kernel(combined_embedding_v1, combined_embedding_v1)

cosine_sim_v1 = linear_kernel(tfidf_matrix_description, tfidf_matrix_description)

euc_dist_v1 = euclidean_distances(combined_embedding_v2, combined_embedding_v2)

man_dist_v1 = manhattan_distances(combined_embedding_v2, combined_embedding_v2)

cosine_sim_native = cosine_similarity(combined_embedding_v2, combined_embedding_v2)

In [90]:
# updated euclidean distance matrix

euc_dist_v2 = euclidean_distances(combined_embedding_v3, combined_embedding_v3)


In [560]:
# save embeddings

np.save("cosine_sim_v3_pokemon_custom.npy", cosine_sim_v3)

np.save("cosine_sim_v1_pokemon_native.npy", cosine_sim_native)

np.savetxt("cosine_sim_v3_pokemon_custom.txt", cosine_sim_v3, delimiter=",")

In [72]:
# use np.load() to load .npy files

cosine_sim_v4 = np.load("cosine_sim_v4_pokemon_custom.npy")

cosine_sim_v3 = np.load("cosine_sim_v3_pokemon_custom.npy")

In [151]:
# save v4

np.save("cosine_sim_v4_pokemon_custom.npy", cosine_sim_v4)

In [53]:
# reverse mapping of indices and video game titles

indices = pd.Series(proc_data.index, index=proc_data['name'], dtype=int)

indices['Blastoise']

9

In [76]:
# Recommendation function that takes video game title 
# as input and outputs most similar video games

def rec_similar_pokemon(pokemon, cosine_sim_, list_len=11, reverse=True):
    
    # get index of pokemon that matches input
    # minus 1 since tabular data is indexed by Pokemon Pokedex ID 
    # since idx will be used to index the embedding, correct idx for Bulbasaur would be 0, not 1
    
    idx = indices[pokemon] - 1
    
    # get pairwise similarity scores of all pokemon with the given name
    
    sim_scores = list(enumerate(cosine_sim_[idx]))
    
    # sort pokemon based on similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=reverse)
    
    # get scores of 10 most similar pokemon
    
    sim_scores = sim_scores[0:list_len]
    
    # get pokemon indices
    
    poke_indices = [i[0] for i in sim_scores] 
    
    # return top 10 most similar pokemon
    
    recs = proc_data['name'].iloc[poke_indices]
    
    # create mask to exclude input pokemon
    
    mask = recs == pokemon
    
    #print([i[1] for i in sim_scores] )
    
    return recs[~mask]

In [230]:
# resave pokemon names csv

new_poke_names = pd.read_csv("app/pokemon_names.csv", index_col="id")

new_poke_names['url_name'] = poke_list

new_poke_names.to_csv("app/pokemon_names.csv", index=True, index_label='id')

new_poke_names

Unnamed: 0_level_0,name,pokedex_desc,url_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Bulbasaur,['A strange seed was planted on its back at bi...,bulbasaur
2,Ivysaur,"['When the bulb on its back grows large, it ap...",ivysaur
3,Venusaur,['The plant blooms when it is absorbing solar ...,venusaur
4,Charmander,"['Obviously prefers hot places. When it rains,...",charmander
5,Charmeleon,"['When it swings its burning tail, it elevates...",charmeleon
...,...,...,...
1020,Gouging Fire,[],gouging-fire
1021,Raging Bolt,[],raging-bolt
1022,Iron Boulder,[],iron-boulder
1023,Iron Crown,[],iron-crown


In [16]:
new_poke_names = pd.read_csv("app/pokemon_names.csv", index_col="id")

new_poke_names.query("name == 'Bulbasaur'")['url_name'].iloc[0]

'bulbasaur'

In [242]:
indices.loc['Bulbasaur']

poke_dict = {"index":new_poke_names.index, "url_name":new_poke_names['url_name'].values}

indices = pd.DataFrame(data=poke_dict, index=new_poke_names['name'])

print(indices.loc['Bulbasaur']['index'])

indices

1


Unnamed: 0_level_0,index,url_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bulbasaur,1,bulbasaur
Ivysaur,2,ivysaur
Venusaur,3,venusaur
Charmander,4,charmander
Charmeleon,5,charmeleon
...,...,...
Gouging Fire,1020,gouging-fire
Raging Bolt,1021,raging-bolt
Iron Boulder,1022,iron-boulder
Iron Crown,1023,iron-crown


In [112]:
# manhattan distance v2

rec_similar_pokemon("Houndour", cosine_sim_=man_dist_v1, reverse=False)

id
1009    Walking Wake
1010     Iron Leaves
1014         Okidogi
1015       Munkidori
1016     Fezandipiti
1020    Gouging Fire
1021     Raging Bolt
1022    Iron Boulder
1023      Iron Crown
1017         Ogerpon
Name: name, dtype: object

In [116]:
# euclidean distance v2

rec_similar_pokemon("Mightyena", cosine_sim_=euc_dist_v2, reverse=False)

id
261    Poochyena
359        Absol
352      Kecleon
28     Sandslash
509     Purrloin
77        Ponyta
53       Persian
57      Primeape
942     Maschiff
749      Mudbray
Name: name, dtype: object

In [114]:
# v4

# based on desc, species, ability, type 1, type 2, egg 1, egg 2, if dual type, if dual egg, evolution tree, height, weight, male_dist, female_dist, if_legendary

rec_similar_pokemon("Houndour", cosine_sim_=cosine_sim_v4, list_len=11)

id
229     Houndoom
461      Weavile
215      Sneasel
608      Lampent
827       Nickit
862    Obstagoon
585     Deerling
262    Mightyena
165       Ledyba
732     Trumbeak
Name: name, dtype: object

In [115]:
# euclidean distance

rec_similar_pokemon("Houndour", cosine_sim_=euc_dist_v1, reverse=False)

id
229      Houndoom
1004       Chi-Yu
461       Weavile
215       Sneasel
862     Obstagoon
359         Absol
262     Mightyena
571       Zoroark
893        Zarude
942      Maschiff
Name: name, dtype: object

In [117]:
# v3

# based on desc, species, ability, type 1, type 2, egg 1, egg 2, if dual type, if dual egg, evoultion-tree

rec_similar_pokemon("Houndour", cosine_sim_=cosine_sim_v3)

# for i,j in enumerate(rec_similar_pokemon("Blaziken", cosine_sim_=cosine_sim_v3)):
#     print(i,j)

id
229      Houndoom
461       Weavile
215       Sneasel
862     Obstagoon
1004       Chi-Yu
262     Mightyena
634      Zweilous
571       Zoroark
893        Zarude
630     Mandibuzz
Name: name, dtype: object

In [565]:
# v2

# based on description, species, ability, type (combined), egg (combined)

rec_similar_pokemon("Jolteon", cosine_sim_=cosine_sim_v2)

[4.999999999999999, 3.125649398726522, 2.1112994840586214, 2.0776057304141897, 2.0677103637777536, 2.0496647704056943, 2.0480572031629487, 2.040763593380544, 2.0407478962275, 2.0243462184600434, 2.019714591225493]


id
309    Electrike
310    Manectric
404        Luxio
523    Zebstrika
921        Pawmi
835       Yamper
522      Blitzle
403        Shinx
836      Boltund
405       Luxray
Name: name, dtype: object

In [179]:
# old

# just based on pokedex description

rec_similar_pokemon("Blastoise", cosine_sim_=cosine_sim_v1)

id
1      Bulbasaur
2        Ivysaur
3       Venusaur
4     Charmander
5     Charmeleon
6      Charizard
7       Squirtle
8      Wartortle
10      Caterpie
11       Metapod
Name: name, dtype: object

In [None]:
rec_similar_pokemon("Blastoise", cosine_sim_=cosine_sim_v1)

In [538]:
sim_scores

[(1, 1.0),
 (2, 1.0),
 (3, 1.0),
 (4, 1.0),
 (5, 1.0),
 (6, 1.0),
 (7, 1.0),
 (8, 1.0),
 (9, 1.0),
 (10, 1.0),
 (11, 1.0),
 (12, 1.0),
 (13, 1.0),
 (14, 1.0),
 (15, 1.0),
 (16, 1.0),
 (17, 1.0),
 (18, 1.0),
 (35, 1.0),
 (36, 1.0),
 (39, 1.0),
 (40, 1.0),
 (41, 1.0),
 (42, 1.0),
 (56, 1.0),
 (57, 1.0),
 (63, 1.0),
 (64, 1.0),
 (65, 1.0),
 (66, 1.0),
 (67, 1.0),
 (68, 1.0),
 (69, 1.0),
 (70, 1.0),
 (71, 1.0),
 (81, 1.0),
 (82, 1.0),
 (92, 1.0),
 (93, 1.0),
 (94, 1.0),
 (111, 1.0),
 (112, 1.0),
 (113, 1.0),
 (116, 1.0),
 (117, 1.0),
 (123, 1.0),
 (125, 1.0),
 (126, 1.0),
 (147, 1.0),
 (148, 1.0)]

### Jaccard Index Implementation

In [43]:
# define Jaccard Index Function

def jaccard_index(set1, set2): # takes 2 rows of an array as arguments
    
    intersection = len(list(set(set1).intersection(set2)))
    union = len(set1.union(set2))
    
    return intersection / union

In [140]:
other_row = set(combined_embedding_v3[2])

pokemon_row = set(combined_embedding_v3[0])

jaccard_index(pokemon_row, other_row)



0.041666666666666664

In [142]:
# set up test with Bulbasaur

# get index of pokemon that matches input

jaccard_indices = []
    
idx = indices["Salamence"]

# pokemon_row = tfidf_matrix.toarray()[idx]

# for i in range(tfidf_matrix.toarray().shape[0]):
    
#-------------------------------------------------------------------#
    
pokemon_row = cosine_sim_v3[idx-1]

for i in range(cosine_sim_v3.shape[0]):

#-------------------------------------------------------------------#
    
    #if i != idx:
        
#     other_row = tfidf_matrix.toarray()[i]
    
    other_row = cosine_sim_v3[i]
    
    set1 = set(pokemon_row)
    set2 = set(other_row)

    jaccard_indices.append(jaccard_index(set1, set2))
    
jaccard_scores = list(enumerate(jaccard_indices)) #[enumerate(j) for j in jaccard_indices]
        
# try to see results
        
sim_scores = sorted(jaccard_scores, key=lambda x: x[1], reverse=True)
    
# get scores of 10 most similar pokemon

sim_scores = sim_scores[1:11]

# get pokemon indices

poke_indices = [i[0] for i in sim_scores] 

# return top 10 most similar pokemon

recs = proc_data['name'].iloc[poke_indices]

recs

id
921            Pawmi
657        Frogadier
654          Braixen
169           Crobat
81         Magnemite
1017         Ogerpon
1024       Terapagos
1012    Poltchageist
1013       Sinistcha
1014         Okidogi
Name: name, dtype: object

## Hamming Distance Implementation

In [145]:
def hamming_distance(arr1, arr2):
    # Flatten both arrays
    flat_arr1 = arr1.flatten()
    flat_arr2 = arr2.flatten()
    
    # Count differences between corresponding elements
    diff_count = np.sum(flat_arr1 != flat_arr2)
    
    # Normalize the Hamming distance
    num_elements = flat_arr1.size
    hamming_dist = diff_count / num_elements
    
    return hamming_dist

In [155]:
# set up test with Bulbasaur

# get index of pokemon that matches input

hamming_indices = []
    
idx = indices["Sceptile"]

# pokemon_row = tfidf_matrix.toarray()[idx]

# for i in range(tfidf_matrix.toarray().shape[0]):
    
#-------------------------------------------------------------------#
    
pokemon_row = cosine_sim_v4[idx-1]

for i in range(cosine_sim_v4.shape[0]):

#-------------------------------------------------------------------#
    
    #if i != idx:
        
#     other_row = tfidf_matrix.toarray()[i]
    
    other_row = cosine_sim_v4[i]
    
    set1 = pokemon_row
    set2 = other_row

    hamming_indices.append(hamming_distance(set1, set2))
    
hamming_scores = list(enumerate(hamming_indices)) #[enumerate(j) for j in jaccard_indices]
        
# try to see results
        
sim_scores = sorted(hamming_scores, key=lambda x: x[1], reverse=False)
    
# get scores of 10 most similar pokemon

sim_scores = sim_scores[1:11]

# get pokemon indices

poke_indices = [i[0] for i in sim_scores] 

# return top 10 most similar pokemon

recs = proc_data['name'].iloc[poke_indices]

recs

id
153      Bayleef
253      Grovyle
388       Grotle
154     Meganium
252      Treecko
387      Turtwig
152    Chikorita
496      Servine
812    Rillaboom
495        Snivy
Name: name, dtype: object

In [None]:
idx = indices["Bulbasaur"]

list(enumerate(cosine_sim_v3[idx]))[:5]