# ADS 509 Pokemon Text Analysis: Final Project
### Imports

In [14]:
import re
import nltk
from nltk.tokenize import word_tokenize
from flask import Flask, render_template, url_for, request
import joblib
import json
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### Read in the text file

In [2]:
f = open('C:/Users/mendi/Desktop/USD Class Files/ADS 509/Pokemon_Data.txt')
data = json.load(f)
print(len(data))

1281


### Functions & Tokenize

In [3]:
# Removing URL's
def remove_URL(text):
    return re.sub(r"'url ' : 'http\S+", "", text)

# Tokenize the data
def tokenize(text) :
    return(word_tokenize(text))

# Applying the pipeline
def prepare(text, pipeline) :
    tokens = str(text)
    for transform in pipeline :
        tokens = transform(tokens)
    return(tokens)

# Descriptive statistics of the data
def descriptive_stats(tokens, num_words = 5, verbose=True) :
    num_tokens = len(tokens)
    num_unique_tokens = len(Counter(tokens).keys())
    lexical_diversity = round((num_unique_tokens/num_tokens),2) # Rounded to 2 Digits to match format of existing decimal rounding below
    num_characters = sum(len(i) for i in tokens)
    countered = Counter(tokens)
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")        
    return

In [4]:
pokemon_list = []
p_list = []
for pokemon in data:
    for features in data[pokemon]:
        #print(a) # a is the whole section of combined features
        for feature, description in features.items():
            # b is the dictionaries such as ability, weight, stats that are pulled
            #print(c) # Farthest I can break down for now.
            #p_list.append()
            poke_dict = {'pokemon': pokemon, 'feature': feature, 'description': description}
            pokemon_list.append(poke_dict)

In [5]:
df = pd.DataFrame.from_dict(pokemon_list)
df

Unnamed: 0,pokemon,feature,description
0,bulbasaur,abilities,"[{'ability': {'name': 'overgrow', 'url': 'http..."
1,bulbasaur,base_experience,64
2,bulbasaur,forms,"[{'name': 'bulbasaur', 'url': 'https://pokeapi..."
3,bulbasaur,game_indices,"[{'game_index': 153, 'version': {'name': 'red'..."
4,bulbasaur,height,7
...,...,...,...
23053,miraidon-glide-mode,species,"{'name': 'miraidon', 'url': 'https://pokeapi.c..."
23054,miraidon-glide-mode,sprites,"{'back_default': None, 'back_female': None, 'b..."
23055,miraidon-glide-mode,stats,"[{'base_stat': 100, 'effort': 0, 'stat': {'nam..."
23056,miraidon-glide-mode,types,"[{'slot': 1, 'type': {'name': 'electric', 'url..."


In [6]:
my_pipeline = [str.lower,remove_URL,tokenize]
cleaned_data = []
for row in pokemon_list :
    text = " ".join(prepare(row, pipeline = my_pipeline))
    if text :
        cleaned_data.append(text)

### Descriptive Statistics

In [7]:
descriptive_stats(cleaned_data)

There are 23058 tokens in the data.
There are 23058 unique tokens in the data.
There are 169063470 characters in the data.
The lexical diversity is 1.000 in the data.


In [12]:
descriptive_stats(df)

There are 23058 tokens in the data.
There are 3 unique tokens in the data.
There are 25 characters in the data.
The lexical diversity is 0.000 in the data.


### Building the model

In [19]:
X = df['description']
y = df['pokemon']
cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

AttributeError: 'list' object has no attribute 'lower'

### Create the app

In [None]:
app = Flask(_name_)

@app.route('/')
def home():
    return render_template('home.html')

@app.route('/predict', methods=['POST'])
def predict():
    cleaned_data