In [1]:
import numpy as np
import pandas as pd
import requests
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Parses and stores all 7 texts for preprocessing

In [2]:
url_list = [
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt","https://github.com/formcept/whiteboard/blob/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt","https://github.com/formcept/whiteboard/blob/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt","https://github.com/formcept/whiteboard/blob/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt"
]

text = []

for _,i in enumerate(url_list):
    req = requests.get(i)
    text.append(req.text)

## Preprocessing

In [30]:
def preprocessing(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)

    lower_filter = [w.lower() for w in text if w.lower()]
    filtered_text = []

    filtered_text = [i for i in filtered_text if not i.isdigit()]

    stop_words = stopwords.words('english') + ['j','page','k','said','rowling','quot','back']

    for words in lower_filter:
        if words not in stop_words:
            filtered_text.append(words)

    filtered_text = [i for i in filtered_text if not i.isdigit()]

    lem = WordNetLemmatizer()

    filtered_text = [lem.lemmatize(w) for w in filtered_text]

    return filtered_text

In [31]:
processed = [preprocessing(i) for _,i in enumerate(text)]

## Prepare the Corpus

In [32]:
from gensim.corpora import Dictionary
X = Dictionary(processed)

doc_term_matrix = [X.doc2bow(doc) for doc in processed]

In [33]:
from gensim.models import LsiModel
lsi_model = LsiModel(doc_term_matrix,num_topics=7,id2word=X)

In [34]:
lsi_model.print_topics()

[(0,
  '0.750*"harry" + 0.213*"ron" + 0.193*"potter" + 0.161*"hermione" + 0.103*"fire" + 0.096*"one" + 0.095*"mr" + 0.094*"goblet" + 0.088*"dumbledore" + 0.082*"around"'),
 (1,
  '0.400*"class" + 0.363*"data" + 0.236*"div" + 0.230*"flex" + 0.176*"true" + 0.173*"j" + 0.151*"id" + 0.150*"span" + 0.149*"path" + 0.143*"octicon"'),
 (2,
  '-0.415*"fire" + -0.408*"goblet" + 0.239*"azkaban" + 0.237*"prisoner" + -0.195*"mr" + 0.193*"lupin" + -0.175*"moody" + 0.160*"black" + -0.152*"crouch" + 0.141*"professor"'),
 (3,
  '-0.359*"chamber" + 0.341*"azkaban" + -0.325*"secret" + 0.317*"prisoner" + 0.263*"lupin" + 0.199*"hermione" + 0.196*"black" + -0.179*"lockhart" + -0.171*"stone" + -0.135*"philosopher"'),
 (4,
  '0.445*"stone" + 0.381*"philosopher" + -0.300*"chamber" + -0.285*"secret" + 0.244*"hagrid" + -0.159*"lockhart" + -0.157*"ron" + 0.124*"quirrell" + 0.124*"dudley" + -0.110*"dobby"'),
 (5,
  '0.319*"20blood" + 0.319*"20prince" + 0.319*"20half" + -0.235*"20the" + -0.232*"20phoenix" + -0.232*

## Questions

In [2]:
# questions = [
# "What is the name of the spell that causes an object to rise and move according to the caster's will?",
# "Who is the ghost that haunts the Hogwarts girls' bathroom?",
# "Which animal is the Patronus of Severus Snape?",
# "What is the name of the magical object that grants the possessor the power to control others?",
# "Who was the Dark wizard that Tom Riddle used to gain his knowledge of Horcruxes?",
# "What is the name of the game that involves catching a flying golden ball and throwing it through a hoop?",
# "What are the three Deathly Hallows and what powers do they possess?",
# "Who was the founder of Slytherin House and what was his signature characteristic?",
# "What is the name of the potion that allows the drinker to regain their strength and vitality?",
# "Who was the Defense Against the Dark Arts teacher in Harry's third year at Hogwarts?"
# ]