In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

import gensim.downloader as api
# Load GloVe embeddings
# glove_model = api.load("glove-wiki-gigaword-100")  # 100-dimension GloVe embeddings

import torch
from transformers import BertTokenizer, BertModel
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mroberts\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mroberts\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


These are the questions I'm asking the users to learn more about them. These are the defaults, my answers, until I know it works. I don't want to have to fill these in every single time I test it, so I've hard coded the answers for now.

In [2]:
# inputs

# What is your personal aesthetic? What colors, materials, and patterns describe your wardrobe or living spaces? (cottagecore, beach vibes, lumberjack, business formal, cozy, etc.)
aesthetic = "I like the small cozy feel of cottagecore, with the dark greens, wood burning stove, and homemade bread, reading a book on a rainy day. I also wear a lot of hawaiian shirts and like cool, breezy clothes. Overall, my aesthetic is comfy."

# What kind of weather do you like? (thunderstorms, low humidity, sunny afternoons, temperature)
weather = "I enjoy sunny, cloudless days in general, with the occasional rainy day. I like warm days and cool evenings."

# What biomes or geographical areas do you find yourself drawn to? (deserts, beaches, mountain tops, big cities, boreal forests, etc)
biome = "I like rainforests, dry temperate forests, islands, meadows, beaches, and mountains."

# What do you do for a living? (student, psychologist, retired chef, salary man)
living = "I am a data scientist."

# What is your dream job and why? (astronaut, stay-at-home parent, pro skater)
dream_job = "My dream job is to build sustainable, family-friendly living spaces. I appreciate the world around me, so I like seeing the rooftop gardens that cool down our cities, and I like seeing solar panels covering parking lots. I like seeing kids playing outside in parks and on the streets that are safe for them. I like seeing community gardens."

# What is your general disposition? (grumpy, jolly, content)
mood = "I am generally happy but not always."

# What are your hobbies? (hiking, exercising, video games, underwater basketweaving)
hobbies = "I like to code and work on data analysis projects. I like to hike and go to the gym to stay healthy."

# Lastly, a responsible pet owner knows their limits. How many pokemon do you expect to care for?
num_pokemon = 6

I combined my answers into a paragraph and run the Bag of Words on it, without English stopwords. For my answers, I think my vocabular looks pretty good! I could do without words like 'lot', 'generally', and 'occasional', but I understand that's how some people talk, so I won't remove them.

In [3]:
inputs = [aesthetic, weather, biome, living, dream_job, mood, hobbies]
inputs = [i.lower() for i in inputs]
print(inputs)

# Input Bag of Words
inputs = list({aesthetic, weather, biome, living, dream_job, mood, hobbies})

vectorizer = CountVectorizer(stop_words='english')
vectored = vectorizer.fit_transform(inputs)
input_bow = vectored.toarray()
vocabulary = vectorizer.get_feature_names_out()
vocabulary = [text for text in vocabulary if not re.search(r'\d', text)]

print()
print("Vocab: ", len(vocabulary), vocabulary)

['i like the small cozy feel of cottagecore, with the dark greens, wood burning stove, and homemade bread, reading a book on a rainy day. i also wear a lot of hawaiian shirts and like cool, breezy clothes. overall, my aesthetic is comfy.', 'i enjoy sunny, cloudless days in general, with the occasional rainy day. i like warm days and cool evenings.', 'i like rainforests, dry temperate forests, islands, meadows, beaches, and mountains.', 'i am a data scientist.', 'my dream job is to build sustainable, family-friendly living spaces. i appreciate the world around me, so i like seeing the rooftop gardens that cool down our cities, and i like seeing solar panels covering parking lots. i like seeing kids playing outside in parks and on the streets that are safe for them. i like seeing community gardens.', 'i am generally happy but not always.', 'i like to code and work on data analysis projects. i like to hike and go to the gym to stay healthy.']

Vocab:  80 ['aesthetic', 'analysis', 'appreci

I brought in the full pokedex information that was previously preprocessed.

In [4]:
full_dex = pd.read_csv("Documents/GT/Potential Projects/pokedex_full.csv")
display(full_dex.shape)
display(full_dex)

(1025, 7)

Unnamed: 0,Pokemon,Number,Color,Habitat,Type,Generation,Description
0,Bulbasaur,1,green,grassland,Grass Poison,generation-i,green grassland grass poison - a strange seed ...
1,Ivysaur,2,green,grassland,Grass Poison,generation-i,green grassland grass poison - when the bulb o...
2,Venusaur,3,green,grassland,Grass Poison,generation-i,green grassland grass poison - the plant bloom...
3,Charmander,4,red,mountain,Fire,generation-i,red mountain fire - obviously prefers hot plac...
4,Charmeleon,5,red,mountain,Fire,generation-i,red mountain fire - when it swings its burning...
...,...,...,...,...,...,...,...
1020,Raging-bolt,1021,yellow,unknown,Electric Dragon,generation-ix,yellow electric dragon - it's said to incinera...
1021,Iron-boulder,1022,gray,unknown,Rock Psychic,generation-ix,gray rock psychic - it resembles a pokémon des...
1022,Iron-crown,1023,blue,unknown,Steel Psychic,generation-ix,blue steel psychic - it resembles a mysterious...
1023,Terapagos,1024,blue,unknown,Normal,generation-ix,blue normal - terapagos protects itself using ...


I wasn't too sure about the feasibility of creating a bag of words for each pokemon description. I needed to add the dynamic input bag of words into the vocabulary of the pokedex bag of words to make sure they had the same columns and same vocabulary.

First I turned the Description column into a string of features like I did with the inputs. I think this will make my encoding smaller and faster.

In [5]:
# Initialize CountVectorizer and fit the vectorizer on the 'Description' column
vectorizer.fit(full_dex['Description'])

# Transform the descriptions to a matrix
description_matrix = vectorizer.transform(full_dex['Description'])

# Convert the matrix to an array and create a DataFrame for features
features_df = pd.DataFrame(description_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create a new column in full_dex that contains the unique features
full_dex['Features'] = features_df.apply(lambda row: ' '.join([word for word, val in zip(features_df.columns, row) if val > 0]), axis=1)

# Display the modified DataFrame
display(full_dex[['Pokemon', 'Features']].head())

Unnamed: 0,Pokemon,Features
0,Bulbasaur,birth body born bright bulb bulbasaur carries ...
1,Ivysaur,ability absorbs adds appears aroma bloom bloom...
2,Venusaur,able absorbed absorbing absorbs aroma attracts...
3,Charmander,bit blazes born brightly burn burns charmander...
4,Charmeleon,aggressive agitated air away barbaric battle b...


In [6]:
desc_vectored = vectorizer.fit_transform(full_dex['Features'])

# Transform user inputs into the same vocabulary
inp_vectored = vectorizer.transform(vocabulary)

# Vectorize into bag of words
word_count_matrix = np.vstack((desc_vectored.toarray(), inp_vectored.toarray()))

# Convert to numpy arrays to avoid TypeError
full_dex_bow = word_count_matrix[:len(full_dex)]  # Pokémon descriptions
input_bow = word_count_matrix[len(full_dex):]  # User inputs

# Compute cosine similarity
similarity = cosine_similarity(input_bow, full_dex_bow)

# Create a DataFrame for better visualization of results
similarity_df = pd.DataFrame(similarity, columns=full_dex['Pokemon'], index=vocabulary)

# Sum cosine similarities across all vocabulary terms (rows)
total_similarity = similarity_df.sum(axis=0)

# Sort Pokémon by similarity score in descending order and get the top 6
top_6_pokemon = pd.DataFrame(total_similarity.sort_values(ascending=False)).head(6)

# Display the top 6 Pokémon and their similarity scores
print("\nTop 6 Pokémon with highest similarity:")
display(top_6_pokemon)


Top 6 Pokémon with highest similarity:


Unnamed: 0_level_0,0
Pokemon,Unnamed: 1_level_1
Morelull,0.824163
Vanillish,0.69282
Altaria,0.646997
Grimmsnarl,0.639602
Mabosstiff,0.639602
Reshiram,0.617213


Cosine Similarity is a way to measure the angles of the bag of word vectors to see how similar the bags of words are to each other.

I reviewed the Descriptions of the 6 pokemon, Morelull is a little mushroom pokemon that talks about spores, glow, trees, and dark places. Not bad at all. Vanillish is an ice cream cone, so it uses the words 'freeze' and 'ice' a lot. Not a great fit. I really like Altaria, a dragon with wings made of clouds. It references the words 'soprano', 'clouds', ' wings','sky', and 'cotton'. Even though I like this pokemon, I'm not sure its a great fit. Grimmsnarl is a hairy, buff goblin, so it talks about 'muscles', hair', 'fibers'. Not a great fit for bag of words, but I could see the muscles being close to the 'gym' that I input in something like GloVe or BERT. Mabosstiff is a mastiff that has the word 'children' and 'family' so it isn't too surprising to see it here considering I used the word 'family' in my inputs. Reshiram is a weather dragon, so 'atmosphere', 'heat', 'weather', and 'fire' all show up. I'd say its a medium fit.

I wasn't thrilled with these preliminary results, so I implemented TF-IDF, or, Term Frequency-Inverse Document Frequency, that takes words that appear more and make them less weighted, and words that appear less and make them more weighted.

In [7]:
# Combine the input text (user inputs) and Pokémon descriptions for consistent vectorization
inputs_combined = [' '.join(inputs)]  # User inputs combined into a single string
full_dex_descriptions = full_dex['Description'].tolist()

# Fit TF-IDF on both Pokémon descriptions and user inputs
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(full_dex_descriptions + inputs_combined)

# Separate TF-IDF matrices for Pokémon descriptions and the user input
full_dex_tfidf = tfidf_matrix[:len(full_dex_descriptions)]  # Pokémon descriptions
input_tfidf = tfidf_matrix[len(full_dex_descriptions):]  # User input (single vector)

# Convert to numpy arrays if needed
full_dex_tfidf = full_dex_tfidf.toarray()
input_tfidf = input_tfidf.toarray()

# Cosine Similarity: Now applied to TF-IDF vectors
cosine_sim_tfidf = cosine_similarity(input_tfidf, full_dex_tfidf)
cosine_sim_tfidf = cosine_sim_tfidf.flatten()

# Convert into a usable dataframe and sort by highest similarity
similarity_df_tfidf = pd.DataFrame({'Number': full_dex['Number'], 'Name': full_dex['Pokemon'], 'Type': full_dex['Type'], 'Generation': full_dex['Generation'], 'Similarity': cosine_sim_tfidf, 'Description': full_dex['Description']})

# Extract only the relevant part of the Description (after the " - ")
similarity_df_tfidf['Description'] = similarity_df_tfidf['Description'].str.split(' - ').str[-1]

# Sort by similarity and display top 'num_pokemon' Pokémon
similarity_df_tfidf = similarity_df_tfidf.sort_values(by='Similarity', ascending=False)

# Display the top matches
display(similarity_df_tfidf.head(num_pokemon))

Unnamed: 0,Number,Name,Type,Generation,Similarity,Description
670,671,Florges,Fairy,generation-vi,0.078787,it claims exquisite flower gardens as its terr...
332,333,Swablu,Normal Flying,generation-iii,0.075269,swablu has light and fluffy wings that are lik...
968,969,Glimmet,Rock Poison,generation-ix,0.072964,it absorbs nutrients from cave walls. the peta...
366,367,Huntail,Water,generation-iii,0.071804,huntail’s presence went unnoticed by people fo...
1012,1013,Sinistcha,Grass Ghost,generation-ix,0.071084,"it pretends to be tea, trying to fool people i..."
333,334,Altaria,Dragon Flying,generation-iii,0.068513,altaria dances and wheels through the sky amon...


Florges makes a lot of sense to me, referencing 'flower', 'garden', and 'plants'. Swablu is the preevolution of Altaria, both of which show up here, similar to above. I'm stumped at Glimmet. It has the words 'toxic', 'poisonous', 'petals', and 'cave'. This is just bag of words, so it really shouldn't be able to make these kinds of personality insinuations. Huntail is similar to an eel, with words like 'depths', 'ocean', 'snake', 'prey', and 'tail'. I don't think this is a great fit. Sinistcha is a matcha tea set, so that one isn't that confounding personality-wise, but I didn't put the word 'tea' in my inputs. I don't think TD-IDF was the way to go here.

I wanted to try another context-free method before moving on to the heavier duty models. GloVe, or Global Vectors for Word Representation, can compare semantically similar words and their vector representations. It can measure Euclidean distance between vectors, giving roughly the same distance between man-woman as it does king-queen. It also can compare words like nearest neighbors, so it knows that 'frog' and 'toad' are close to each other.

I put together a GloVe/TF-IDF combination since I liked the TF-IDF results, but still wanted to try GloVe's higher level semantic capture.

In [8]:
# Load GloVe embeddings from a file
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Function to tokenize text without lemmatization
def tokenize(text):
    return word_tokenize(text.lower())  # Tokenize and convert to lowercase

# Function to get the index of the token in the TF-IDF vocabulary
def get_tfidf_index(token, tfidf_vocab_dict):
    return tfidf_vocab_dict.get(token, None)

# Function to retrieve GloVe embedding for a given token
def get_glove_embedding(token, embeddings, embedding_dim=100):
    return embeddings.get(token, np.zeros(embedding_dim))  # Return the embedding if it exists, otherwise a zero vector

# Updated weighted_average_embedding function
def weighted_average_embedding(text, tfidf_vector, tfidf_vocab_dict, embeddings, embedding_dim=100):
    tokens = tokenize(text)
    embedding_sum = np.zeros(embedding_dim)
    total_weight = 0
    
    for token in tokens:
        idx = get_tfidf_index(token, tfidf_vocab_dict)
        if idx is not None:
            weight = tfidf_vector[0, idx]
            
            if weight > 0:
                glove_embedding = get_glove_embedding(token, embeddings, embedding_dim)
                
                if not np.all(glove_embedding == 0):
                    embedding_sum += glove_embedding * weight
                    total_weight += weight
    
    if total_weight > 0:
        return embedding_sum / total_weight
    else:
        return embedding_sum  # Return zero vector if no valid tokens

# Load GloVe embeddings
glove_file_path = 'Documents/GT/Potential Projects/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)

# Ensure the TF-IDF model (tfidf) is defined and fitted on your text data
# Example: tfidf = TfidfVectorizer().fit(your_text_data)

# Get the TF-IDF vocabulary as a dictionary (token: index)
tfidf_vocab_dict = tfidf.vocabulary_

# Calculate the weighted average embeddings for all Pokémon descriptions
dex_embeddings = []
for desc in full_dex['Features']:  # Make sure to replace 'Features' with the correct column name
    tfidf_vector = tfidf.transform([desc])
    dex_embedding = weighted_average_embedding(desc, tfidf_vector, tfidf_vocab_dict, glove_embeddings)
    dex_embeddings.append(dex_embedding)

# Calculate the weighted average embedding for user inputs
user_input = ' '.join(vocabulary)  # Combine inputs into one text
user_tfidf_vector = tfidf.transform([user_input])
user_embedding = weighted_average_embedding(user_input, user_tfidf_vector, tfidf_vocab_dict, glove_embeddings)

# Compute cosine similarity between user input embedding and dex embeddings
dex_embeddings = np.array(dex_embeddings)
user_embedding = user_embedding.reshape(1, -1)

cosine_sim = cosine_similarity(user_embedding, dex_embeddings)
cosine_sim = cosine_sim.flatten()

# Create a DataFrame to store the similarity results
similarity_df = pd.DataFrame({
    'Number': full_dex['Number'],
    'Name': full_dex['Pokemon'],
    'Type': full_dex['Type'],
    'Generation': full_dex['Generation'],
    'Similarity': cosine_sim,
    'Description': full_dex['Description']
})

# Sort by highest similarity and display top matches
similarity_df = similarity_df.sort_values(by='Similarity', ascending=False)
similarity_df['Description'] = similarity_df['Description'].str.split(' - ').str[-1]

# Display the top N results
num_pokemon = 6  # Set this to the desired number of results to display
display(similarity_df.head(num_pokemon))

Unnamed: 0,Number,Name,Type,Generation,Similarity,Description
360,361,Snorunt,Ice,generation-iii,0.925263,snorunt live in regions with heavy snowfall. i...
332,333,Swablu,Normal Flying,generation-iii,0.916174,swablu has light and fluffy wings that are lik...
181,182,Bellossom,Grass,generation-ii,0.911537,"the stinkier the better.at night, this pokémo..."
369,370,Luvdisc,Water,generation-iii,0.909286,luvdisc live in shallow seas in the tropics. t...
388,389,Torterra,Grass Ground,generation-iv,0.906368,small pokémon occasionally gather on its unmov...
550,551,Sandile,Ground Dark,generation-v,0.903758,they live buried in the sands of the desert. t...


This one was a difficult one to piece together, since I haven't used GloVe before. Snorunt is like a little icy cone. lots of 'snow', 'harmony', 'leaves', and 'prosper'. Snorunt doesn't have as much ice reference to the cold as I was expecting. Swablu again, I think that might just be my spirit pokemon at this point. Bellossom doesn't suprise me at all, its like a little grassy hula dancer. It references 'petals', 'sunlight', 'dance', and 'warm'. Since this isn't bag of words, and GloVe can make text connections through mathematical distances, this feels like a great fit for what I put in. Luvdisc is a heart shaped fish, talking about 'coral', 'love', and 'seas'. Another decent fit. Torterra, another favorite of mine, is a big turtle with a tree growing out of its back. 'back', 'ground', 'forests' all show up a few times, so another decent fit. Finally Sandile, a little desert crocodile, talking about 'sand', 'desert', and 'sun'. Not amazing, but not bad. So far this is up there with some good fits.

Finally, I implemented BERT, or Bidirectional Encoder Representations from Transformers. BERT can take ambiguous language and use the surrounding text to determine context. This is the only method I used, of all four methods that is considered to be contextual.

In [9]:
# Function to get BERT embeddings for a sentence
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # We use the CLS token embedding (outputs.last_hidden_state[:, 0, :]) as the sentence embedding
    return outputs.last_hidden_state[:, 0, :].numpy()

# Calculate BERT embeddings for Pokédex descriptions
dex_embeddings = []
for desc in full_dex_descriptions:
    embedding = get_bert_embedding(desc, tokenizer, model)
    dex_embeddings.append(embedding)

dex_embeddings = np.array(dex_embeddings).squeeze(axis=1)  # Convert to NumPy array

# Calculate BERT embedding for the user input
user_input = ' '.join(full_dex_descriptions)  # Combine inputs into one text
user_embedding = get_bert_embedding(user_input, tokenizer, model)
user_embedding = user_embedding.squeeze()

# Compute cosine similarity between user input embedding and dex embeddings
cosine_sim = cosine_similarity(user_embedding.reshape(1, -1), dex_embeddings)
cosine_sim = cosine_sim.flatten()

# Create a DataFrame to store the similarity results
similarity_df = pd.DataFrame({
    'Number': full_dex['Number'],
    'Name': full_dex['Pokemon'],
    'Type': full_dex['Type'],
    'Generation': full_dex['Generation'],
    'Similarity': cosine_sim,
    'Description': full_dex['Description']
})

# Sort by highest similarity and display top 6 matches
similarity_df = similarity_df.sort_values(by='Similarity', ascending=False)
similarity_df['Description'] = similarity_df['Description'].str.split(' - ').str[-1]

top_6_matches = similarity_df.head(num_pokemon)

display(top_6_matches)

Unnamed: 0,Number,Name,Type,Generation,Similarity,Description
0,1,Bulbasaur,Grass Poison,generation-i,1.0,a strange seed was planted on its back at birt...
459,460,Abomasnow,Grass Ice,generation-iv,0.922281,it whips up blizzards in mountains that are al...
356,357,Tropius,Grass Flying,generation-iii,0.921147,the bunches of fruit around tropius’s neck are...
266,267,Beautifly,Bug Flying,generation-iii,0.920884,beautifly’s favorite food is the sweet pollen ...
841,842,Appletun,Grass Dragon,generation-viii,0.919707,eating a sweet apple caused its evolution. a n...
689,690,Skrelp,Poison Water,generation-vi,0.917851,"camouflaged as rotten kelp, they spray liquid ..."


This was the most exciting one to implement since I've never used BERT before. Bulbasaur is a grass dinosaur with a big bulb growing out of its back, so we are looking at 'bulb', 'seed', 'back', and 'birth'. This one isn't bad considering all the green forestry imagery I fed the machine. Abomasnow is the abominable snowman, 'snow flowers', 'blizzard', 'mountain', not good, but not the worst one I've been given. Tropius is like a long necked dinosaur that grows bananas on its body. 'Fruit', 'leaves', and 'tropic' are all here, so I think this is a good fit. I'm not sure about Beautifly, it mentions 'aggressive' several times, along with 'nectar' and 'pollen'. Good for the forest imagery I gave it, but there must be some calmer forest pokemon that would be a better fit. Appletun looks like a dragon made from an apple pie, it talks about 'nectar', 'sweet', and 'scent'. Not bad at all. Skrelp, I keep getting poison types and I don't appreciate it. It looks like a leafy seadragon and talks about 'seaweed', and 'kelp' primarily. Not a bad recommendation at all.

What I want to do is create a UI with all the options for teams and try to make a feedback system to see what people like the most. This concludes the team generator.