# Loading the Pokemon Stories

In [1]:
import sys
import pandas as pd
import sqlite3 as lite
import numpy as np
import unicodedata
import re

In [2]:
def connectToDataBase(database_name, data_folder):
    """Open ``<data_folder>/<database_name>.sqlite`` and list its tables.

    Parameters
    ----------
    database_name : str
        File name of the database without the ``.sqlite`` extension.
    data_folder : str
        Folder that contains the database file.

    Returns
    -------
    tuple
        ``(cursor, connection, table_names)`` on success, where
        ``table_names`` is the list of 1-tuples returned by ``fetchall()``
        (an empty list also prints 'Empty Database').
        ``(None, None, None)`` when connecting or querying fails; the
        failure is reported with a printed message instead of an exception.
    """
    con = None
    try:
        con = lite.connect(data_folder + '/' + database_name + '.sqlite')
        cur = con.cursor()
        # Single quotes make 'table' a real SQL string literal; the
        # double-quoted form is an identifier that SQLite only treats as a
        # string through a legacy fallback.
        cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        table_names = cur.fetchall()
        if len(table_names) == 0: print('Empty Database')
        return cur, con, table_names
    except Exception:
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate.
        print('Could not connect to database.')
        # Do not leak a half-opened connection (e.g. file is not a database).
        if con is not None:
            con.close()
    return None, None, None

In [3]:
# Folder that holds the SQLite database; it is also the prefix of the CSV
# written in the last cell, so the trailing '/' matters there.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
cwd = 'D:/Websites/StoryWebsiteChatBots/Data/'

In [4]:
# Open the 'pokemon' database inside the data folder. All three values are
# None when connectToDataBase could not open or query the file.
db_cur, db_con, table_names = connectToDataBase('pokemon', cwd)

In [5]:
# Show which tables were found in the database.
table_names

[('sqlite_sequence',), ('Fanfiction',)]

In [6]:
def getTableFromDb(db_con, table_name : str):
    """Read every row and column of one table into a pandas DataFrame.

    Parameters
    ----------
    db_con
        Open database connection handed to ``pd.read_sql_query``.
    table_name : str
        Name of the table to read. It is concatenated into the SQL text
        as-is, so only pass trusted table names.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT *`` on the table.
    """
    select_all = 'SELECT * FROM ' + table_name
    table_df = pd.read_sql_query(select_all, db_con)
    return table_df

In [7]:
# Load the whole 'Fanfiction' table into a DataFrame.
df = getTableFromDb(db_con, 'Fanfiction')

In [8]:
# Preview the first 2000 characters of the 'Text' value in the row labelled 0.
df['Text'].loc[0][:2000]

"Ash Ketchum – trainer of Pallet Town, winner of the Orange League and increasingly close runner up in five other regional tournaments – was... confused.He was floating in black, empty nothingness. No light, no sound, nothing to use as a reference point – and, at first, he wasn't sure how he'd ended up there.When he tried to remember, instead of the most recent moments he found something strange happening. His mind went straight back to the earliest moments he could remember – flicking past at astonishing speed, as though his whole life were flashing before his eyes on fast forward.A few things stood out, as though he was slowing down to think about them more clearly.The first time he saw a Pokémon, at barely two years of age, when Professor Oak – taking care of him when Ash's mother was shopping - was called away suddenly and Ash, Gary and Daisy were left under the supervision of his distinguished old Arcanine, a gentle Pokémon, but a terror in battle decades before when Samuel Oak wa

In [9]:
def unicodeToAscii(s):
    """Strip accents by removing combining marks from the NFD form of ``s``.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' is dropped. Characters that do not decompose
    into a base letter plus a mark are returned unchanged, so the result is
    not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

In [10]:
# ToDo : add this to Django site and add tests
def processLine(line_string):
    """Normalise a text and split it into a list of lower-case tokens.

    The text is accent-stripped with ``unicodeToAscii``, lower-cased and then
    passed through an ordered list of regex substitutions that turn it into a
    comma-separated token string, which is finally split on ','.

    Returns
    -------
    list of str
        Word and punctuation tokens. Empty strings appear wherever two
        separators end up next to each other.

    NOTE(review): the '#' (ellipsis) and '_' (comma) placeholders are not in
    the whitelist of the "invalid symbols" step, so they are deleted again and
    only leave an empty token behind. Confirm whether that is intended.
    NOTE(review): '!' and '?' only get a separator in front of them, so
    "what?he" yields the token '?he'.
    """
    # Order matters: every rule works on the output of the previous one.
    substitutions = (
        # free up '#' so it can mark an ellipsis, then mark each '...'
        (r'#', ''),
        (r'\.\.\.', ' #'),
        # put a space after every '.', then collapse double spaces (one pass)
        (r'\.', '. '),
        (r'  ', ' '),
        # mark ', ' with a '_' placeholder, then turn whitespace into ','
        (r', ', ',_,'),
        (r'[\s]', ','),
        # drop every character outside the whitelist
        (r'[^a-z,!\?\.\-0-9\']', ''),
        # put a separator in front of sentence punctuation
        (r'[!]', ',!'),
        (r'[?]', ',?'),
        (r'[.]', ',.'),
    )

    token_string = unicodeToAscii(line_string).lower()
    for pattern, replacement in substitutions:
        token_string = re.sub(pattern, replacement, token_string)

    return token_string.split(',')

In [11]:
def convertListToStr(x):
    """Join an iterable of string tokens into one space-separated string."""
    return ' '.join(x)

In [12]:
# Tokenise every story and glue the tokens back together with single spaces.
# NOTE: this overwrites df['Text'] in place, so re-running the cell would
# process text that has already been processed.
df['Text'] = df['Text'].apply(processLine).apply(convertListToStr)

In [13]:
# Inspect the processed 'Text' column.
df['Text']

0      ash ketchum  trainer of pallet town  winner of...
1      ash came groggily to awareness in his home bed...
2      we are lost . ash sighed . i know . we are los...
3      there you go .  pidgeot flared her wings  dism...
4      as the group of friends left the rota pokemon ...
                             ...                        
296    is the camera ready ? roxy checked . naturally...
297    why didn't we take care of this sooner ? jessi...
298    okay  so i want to make sure i'm following all...
299    it's much warmer down here than it was up arou...
300    you know  there's an important question we hav...
Name: Text, Length: 301, dtype: object

In [14]:
def addSentenceSeparators(line_string):
    """Insert a '%' marker after every sentence terminator in ``line_string``.

    Steps, in order:
    1. Collapse every run of two or more whitespace characters to one space.
    2. Replace each run of '!', of '?' and of '.' by a single terminator
       followed by '%'.
    3. When a '"' directly follows the marker, move the marker behind it so
       the closing quote stays with its sentence.

    The result is meant to be split on '%'; a '%' already present in the
    input is indistinguishable from an inserted marker.

    NOTE(review): the three terminators are handled independently, so a mixed
    run such as '?!' receives two markers.
    """
    # `\s{2,}` instead of `\s\s`: the pairwise pattern turned a run of three
    # whitespace characters into two, leaving a double space behind.
    line_string = re.sub(r'\s{2,}', ' ', line_string)
    line_string = re.sub(r'(!)+', '!%', line_string)
    line_string = re.sub(r'(\?)+', '?%', line_string)
    line_string = re.sub(r'(\.)+', '.%', line_string)
    # Keep a closing double quote attached to the sentence it ends.
    line_string = re.sub(r'!%"', '!"%', line_string)
    line_string = re.sub(r'\?%"', '?"%', line_string)
    line_string = re.sub(r'\.%"', '."%', line_string)
    return line_string

In [15]:
def splitByPercSign(x):
    """Split a string on every '%' and return the list of pieces."""
    return x.split('%')

In [16]:
def createDataFrameOfEverySentence(df):
    """Build a table of consecutive sentence pairs from the 'Text' column.

    Each text is marked with ``addSentenceSeparators`` and split with
    ``splitByPercSign``. For every pair of neighbouring pieces within one
    text, a row is emitted with the earlier piece as 'input' and the later
    piece as 'truth'. Pairs never span two texts. ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Text' column of strings.

    Returns
    -------
    pandas.DataFrame
        Columns 'input' and 'truth'. The integer index counts pieces across
        all texts, so one label is skipped after each text because its last
        piece never appears as an 'input'.
    """
    sentence_lists = df['Text'].apply(addSentenceSeparators).apply(splitByPercSign)

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    labels, inputs, truths = [], [], []
    ind = 0
    for sentence_list in sentence_lists:
        n_sentences = len(sentence_list)
        for j in range(n_sentences - 1):
            labels.append(ind + j)
            inputs.append(sentence_list[j])
            truths.append(sentence_list[j + 1])
        # Advance by the full piece count to keep the original index labels.
        ind += n_sentences

    if not labels:
        return pd.DataFrame(data = {'input' : [], 'truth' : []})
    return pd.DataFrame(data = {'input' : inputs, 'truth' : truths}, index = labels)

In [17]:
# Build the (input, truth) sentence-pair table from the processed stories.
sentence_df = createDataFrameOfEverySentence(df)

In [18]:
# Display the resulting sentence pairs.
sentence_df

Unnamed: 0,input,truth
0,ash ketchum trainer of pallet town winner of t...,he was floating in black empty nothingness .
1,he was floating in black empty nothingness .,no light no sound nothing to use as a referen...
2,no light no sound nothing to use as a referen...,when he tried to remember instead of the most...
3,when he tried to remember instead of the most...,his mind went straight back to the earliest m...
4,his mind went straight back to the earliest m...,a few things stood out as though he was slowi...
...,...,...
183018,that's just another reason to beat paul then !,ash decided .
183019,ash decided .,you'd think that 'wanting to win the league' ...
183020,you'd think that 'wanting to win the league' ...,anand i'm moving on to the sinnoh conference .
183021,anand i'm moving on to the sinnoh conference .,ash does have kind of a tricky problem here .


In [19]:
# Write the sentence pairs to 'pokemon_story.csv' in the data folder.
# The path is built by plain concatenation, so it relies on cwd ending in '/'.
sentence_df.to_csv(cwd + 'pokemon_story.csv')