In [347]:
# All the imports we need

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Data Preparation

In [348]:
# Load the species table, keeping only rows with id <= 151 and the three
# columns used by the rest of the notebook.
names = pd.read_csv("names.csv").loc[
    lambda df: df["id"] <= 151,
    ["id", "identifier", "color_id"],
]

In [349]:
# Lookup from numeric color id (1-10) to color name.
colors = dict(enumerate(
    ["black", "blue", "brown", "gray", "green",
     "pink", "purple", "red", "white", "yellow"],
    start=1,
))

# Swap each numeric id for its color name; an id missing from `colors` raises KeyError.
names["color_id"] = names["color_id"].map(lambda color_id: colors[color_id])
# Display only: the renamed frame is not assigned back, so `names` keeps the
# column name "color_id" (the join cell further down reads text["color_id"]).
names.rename(columns={"color_id": "color"})

Unnamed: 0,id,identifier,color
0,1,bulbasaur,green
1,2,ivysaur,green
2,3,venusaur,green
3,4,charmander,red
4,5,charmeleon,red
...,...,...,...
146,147,dratini,blue
147,148,dragonair,blue
148,149,dragonite,brown
149,150,mewtwo,purple


# Text preprocessing


In [350]:
# Load the flavor texts and keep only the English rows
# (language_id 9 is English in our dataset).
text = pd.read_csv("text.csv")
text = text.loc[lambda df: df["language_id"] == 9]


# Characters that act as word separators. Written as a raw string so the regex
# escapes are not parsed as (invalid) Python string escapes, and with the
# newline inside the character class so it is replaced wherever it occurs
# rather than only when it directly follows one of the listed symbols.
replace_space = re.compile(r'[/(){}\[\]\|@,;\n]')
remove_pokemon = re.compile("pokémon")
remove_symbols = re.compile('[^0-9a-z #+_]')
# Bound to its own name so the imported nltk `stopwords` corpus reader is not
# overwritten; rebinding `stopwords` itself made this cell fail when re-run.
english_stopwords = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize one flavor-text string for vectorization.

    Steps, in order: lowercase; replace separator characters with a space;
    delete the literal word "pokémon"; replace every character outside
    ``0-9 a-z space # + _`` with a space; drop English stopwords; rejoin the
    remaining words with single spaces.

    Parameters
    ----------
    text : str
        Raw flavor text. Non-string input (e.g. NaN) raises AttributeError
        on ``.lower()``.

    Returns
    -------
    str
        The cleaned text; may be empty if every word was removed.
    """
    text = text.lower()
    text = replace_space.sub(' ', text)
    # Must run before remove_symbols: "é" is outside the allowed set, so the
    # word would otherwise be split into "pok mon".
    text = remove_pokemon.sub("", text)
    text = remove_symbols.sub(' ', text)
    return ' '.join(word for word in text.split() if word not in english_stopwords)
    
# Clean every flavor text, drop rows whose cleaned text repeats an earlier row,
# then keep only species with species_id <= 151.
text = (
    text.assign(flavor_text=text["flavor_text"].apply(clean_text))
    .drop_duplicates("flavor_text")
    .loc[lambda df: df["species_id"] <= 151]
)




In [351]:
# Join the two dataframes

# Attach each species' identifier and color to its flavor texts, then build a
# combined "text" column: "<flavor_text> <identifier> <color>".
text = pd.merge(text, names, left_on="species_id", right_on="id").assign(
    text=lambda df: df["flavor_text"] + " " + df["identifier"] + " " + df["color_id"]
)


In [352]:
# Look up the combined text at index label 0. The value is not displayed
# because this is not the cell's last expression; a missing label 0 raises
# KeyError.
text.loc[0, "text"]
# Discard every row that has a missing value in any column.
text = text.dropna()

In [353]:
# Columns needed to build the label lookup and the modelling frame.
pokemon = text.loc[:, ["id", "flavor_text", "identifier"]]
# Species id -> species identifier, used to turn predicted ids back into names.
id_to_pokemon = pokemon.set_index("id")["identifier"].to_dict()
my_tags = [*id_to_pokemon]
# The modelling frame keeps only the label (id) and the cleaned flavor text.
pokemon = pokemon.loc[:, ["id", "flavor_text"]]

# Machine Learning

In [354]:
# Features are the cleaned flavor texts; labels are the species ids.
X, y = pokemon["flavor_text"], pokemon["id"]
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [355]:
# Word counts -> TF-IDF weighting -> logistic regression over species ids.
logreg = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # C is the inverse regularization strength, so 1e5 means very weak regularization.
        ('clf', LogisticRegression(n_jobs=1, C=1e5)),
    ]
)

logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [361]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Score the fitted pipeline on the held-out split. accuracy_score's signature
# is (y_true, y_pred), so the true labels go first.
y_pred = logreg.predict(X_test)
print('accuracy %s' % accuracy_score(y_test, y_pred))


accuracy 0.7704081632653061


# Inference

In [362]:
# Free-form descriptions to classify: a flat list of strings, one per query.
to_infer = ["It will teleport away",
       "Permanently angry pig",
       "This one came from the moon",
       "Tail is on fire",
       "Fire duck",
       "Yellow mouse with red cheeks",
       "Has a big tongue",
       "Flower on it's back",
       "Electric bird",
       "Flying dinosaur",
       "Swimmer",
       "Has three heads",
       "Mystery",
       "Has two heads",
       "Punching fighter",
       "Kicking fighter",
       "Water turtle",
       "Don't let it sting you",
       "Fire bird",
       "Created by scientist",
       "Useless fish"
       ]




  
# Predict a species for every description in `to_infer` and show the predicted
# species name next to the description it was predicted for.
# (The original cell read an undefined name `inf`, which only worked with
# leftover kernel state; the descriptions live in `to_infer`.)
to_infer_series = pd.Series(to_infer, name="Description")
y_pred = logreg.predict(to_infer_series)
# Rows are indexed by the description; predicted ids are mapped back to names.
results = pd.DataFrame(y_pred, index=to_infer, columns=["Results"])
results["Results"] = results["Results"].apply(lambda x: id_to_pokemon[x])
results

Unnamed: 0,Results
It will teleport away,abra
Permanently angry pig,primeape
This one came from the moon,clefairy
Tail is on fire,vulpix
Fire duck,charizard
Yellow mouse with red cheeks,pikachu
Has a big tongue,lickitung
Flower on it's back,venusaur
Electric bird,raichu
Flying dinosaur,aerodactyl
