# Muggle to Wizard Data Cleaning

In [1]:
!rm -rf ../data

In [2]:
import os
import sys

module_path = '{}/code'.format(os.path.abspath(os.path.join('..')))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import utils
from variables import *

!mkdir ../data
utils.download_files(BOOKS)
utils.download_files(EXTRAS)
utils.extract_html_table(SPELLS, na_values=['—'])

Downloaded 'hp1_sorcerers_stone' to 'data/' folder.
Downloaded 'hp2_chamber_of_secrets' to 'data/' folder.
Downloaded 'hp3_prisioner_of_azkaban' to 'data/' folder.
Downloaded 'hp4_globet_of_fire' to 'data/' folder.
Downloaded 'hp5_order_of_the_phoenix' to 'data/' folder.
Downloaded 'hp6_half_blood_prince' to 'data/' folder.
Downloaded 'hp7_deathly_hallows' to 'data/' folder.
Downloaded 'hp_places_list' to 'data/' folder.
Downloaded 'hp_characters_list' to 'data/' folder.
Downloaded 'hp_classes_list' to 'data/' folder.
Downloaded 'hp_spells_list' to 'data/' folder.


In [4]:
import glob
import json
import pandas as pd

books, spells, extras = {}, {}, {}

for f in sorted(glob.glob(os.path.join('../data', '*.txt'))):
    with open(f, 'r', encoding='utf-8', errors='ignore') as file:
        books[os.path.splitext(os.path.basename(f))[0]] = file.read().replace('\n', ' ')
        
for f in sorted(glob.glob(os.path.join('../data', '*.csv'))):
    extras[os.path.splitext(os.path.basename(f))[0]] = pd.read_csv(f, header=None)[0].tolist()
        
spells = pd.read_json(glob.glob(os.path.join('../data', '*.json'))[0], lines=True)
spells = spells[~spells['Resulting Effect'].str.contains('game')]

In [5]:
spells.head(10)

Unnamed: 0,Incantation,Resulting Effect,Type
0,Aberto,Opens objects,Charm
1,Accio,Summons an object,Charm
2,Age Line,Hides things from younger people,Enchanment
3,Aguamenti,Shoots water from wand.,Charm
4,Alarte Ascendare,Shoots things high in the air,Spell
5,Alohomora,Opens locked objects,Charm
6,Anapneo,Clears the target’s airway.,Spell
7,Anteoculatia,Turns head hair into antlers,Hex
8,Anti-Cheating,Prevents Cheating on Exams,Spell
9,Aparecium,Reveals invisible ink,Spell


In [6]:
extras.keys()

dict_keys(['hp_characters_list', 'hp_classes_list', 'hp_places_list'])

In [7]:
books['hp1_sorcerers_stone'][:500]

"Harry Potter and the Sorcerer's Stone   CHAPTER ONE   THE BOY WHO LIVED   Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.   Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large musta"

In [14]:
try:
    import string
    import nltk
    from nltk import word_tokenize, ngrams
    #nltk.download('stopwords')
    #nltk.download('punkt')
    from nltk.corpus import stopwords as sw
except:
    import string
    import nltk
    from nltk import word_tokenize, ngrams
    nltk.download('stopwords')
    nltk.download('punkt')
    from nltk.corpus import stopwords as sw

In [25]:
punc = str.maketrans('', '', string.punctuation)

def normalize_text(text):
    tokens = word_tokenize(text.translate(punc))    
    stopwords = sw.words('english')
    content = [w for w in tokens if w.lower() not in stopwords]
    return content
    
def get_bigrams(text):
    tokens = word_tokenize(text.translate(punc))    
    return ngrams(tokens, 2)
    
def get_trigrams(text):
    tokens = word_tokenize(text.translate(punc))    
    return ngrams(tokens, 3)

def get_wordcount(text):
    tokens = word_tokenize(text.translate(punc))    
    return len(tokens)
    
def get_unique_wordcount(text):
    tokens = word_tokenize(text.translate(punc)) 
    return len(set(tokens))

def get_unique_punctuation(text):
    puctuation = [c for c in text if c in string.punctuation]
    return len(puctuation)

In [27]:
books_clean = {}

for book in books.keys():
    books_clean[book] = normalize_text(books[book])
    books_clean['{}_bigrams'.format(book)] = get_bigrams(books[book])
    books_clean['{}_trigrams'.format(book)] = get_trigrams(books[book])
    books_clean['{}_wordcount'.format(book)] = get_wordcount(books[book])
    books_clean['{}_unique_wordcount'.format(book)] = get_unique_wordcount(books[book])
    books_clean['{}_unique_punctuation'.format(book)] = get_unique_punctuation(books[book])
    
    print('{}\nwordcount: {}\nunique wordcount: {}\nunique puntuation: {}\n'.format(
        book,
        books_clean['{}_wordcount'.format(book)],
        books_clean['{}_unique_wordcount'.format(book)],
        books_clean['{}_unique_punctuation'.format(book)])
    )


hp1_sorcerers_stone
wordcount: 77613
unique wordcount: 6925
unique puntuation: 23167

hp2_chamber_of_secrets
wordcount: 85535
unique wordcount: 8605
unique puntuation: 25563

hp3_prisioner_of_azkaban
wordcount: 104920
unique wordcount: 8943
unique puntuation: 31373

hp4_globet_of_fire
wordcount: 191095
unique wordcount: 12184
unique puntuation: 53860

hp5_order_of_the_phoenix
wordcount: 257210
unique wordcount: 14730
unique puntuation: 66434

hp6_half_blood_prince
wordcount: 170337
unique wordcount: 12193
unique puntuation: 52291

hp7_deathly_hallows
wordcount: 197581
unique wordcount: 13272
unique puntuation: 53004



In [39]:
def count_word(text, word):
    c = 0
    for item in text:
        if item.lower == word:
            c = c+1
    print(c)
    return c

In [36]:
print(count_word(books_clean['hp1_sorcerers_stone'], 'harry'))

0
