# LSI Implementation for Harry Potter Novels

In [1]:
import numpy as np
import pandas as pd
import requests
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Parses and stores all 7 texts for preprocessing

In [2]:
url_list = [
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%202%20-%20The%20Chamber%20of%20Secrets.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%203%20-%20The%20Prisoner%20of%20Azkaban.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%204%20-%20The%20Goblet%20of%20Fire.txt","https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%205%20-%20The%20Order%20of%20the%20Phoenix.txt",
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%206%20-%20The%20Half%20Blood%20Prince.txt",
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%207%20-%20The%20Deathly%20Hallows.txt"
    ]

text = []

for _,i in enumerate(url_list):
    req = requests.get(i)
    text.append(req.text)

## Preprocessing

In [3]:
def preprocessing(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)

    lower_filter = [w.lower() for w in text]
    filtered_text = []

    filtered_text = [i for i in filtered_text if not i.isdigit()]

    stop_words = stopwords.words('english') + [' j ','page','k','said','rowling','quot','back','mr','mrs']

    for words in lower_filter:
        if words not in stop_words:
            filtered_text.append(words)

    filtered_text = [i for i in filtered_text if not i.isdigit()]

    lem = WordNetLemmatizer()

    filtered_text = [lem.lemmatize(w) for w in filtered_text]

    return filtered_text

In [4]:
processed = [preprocessing(i) for _,i in enumerate(text)]

## Create Query Processor

In [5]:
def query_processing(text):
    tokenizer = RegexpTokenizer(r'\w+')
    processed_query = tokenizer.tokenize(text)

    processed_query = [w.lower() for w in processed_query]

    return processed_query

## Queries

In [6]:
queries = ["Who is the ghost that haunts the Hogwarts girls' bathroom?","Which animal is the Patronus of Severus Snape?","Who was the Defense Against the Dark Arts teacher in Harry's third year at Hogwarts","Who was the founder of Slytherin House and what was his gifts","What is the name of the spell that causes an object to rise and move according to the caster's will?"]

In [7]:
processed_queries = [query_processing(i) for _,i in enumerate(queries)]
processed_queries[0]

['who',
 'is',
 'the',
 'ghost',
 'that',
 'haunts',
 'the',
 'hogwarts',
 'girls',
 'bathroom']

## The Bag of Words representing the first book

In [8]:
processed[0][:10]

['boy',
 'lived',
 'dursley',
 'number',
 'four',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly']

## Prepare the Corpus and Indexing to form Doc-Term-Matrix

In [9]:
from gensim.corpora import Dictionary

X = Dictionary(processed)

doc_term_matrix = [X.doc2bow(doc) for doc in processed]

## Implementing the LSI Model

In [10]:
from gensim.models import LsiModel
lsi_model = LsiModel(corpus=doc_term_matrix,num_topics=7,id2word=X)

## 7 Key topics Corresponding with each book

In [11]:
lsi_model.print_topics()[0]

(0,
 '0.721*"harry" + 0.208*"ron" + 0.199*"potter" + 0.185*"hermione" + 0.159*"j" + 0.117*"dumbledore" + 0.098*"could" + 0.092*"know" + 0.090*"one" + 0.082*"like"')

## Convert the Query into LSI Space

In [12]:
indexed_query = [Dictionary(processed).doc2bow(j) for j in processed_queries]
vector_query = [lsi_model[a] for a in indexed_query]
print(vector_query[0])

[(0, 0.03491504088121333), (1, -0.026611678787936324), (2, 0.028591680955758127), (3, -0.021295612037096123), (4, 0.009907197298937059), (5, -0.06193307475850385), (6, -0.007823154196610602)]


## Matrix Similarity

In [13]:
from gensim import similarities
index = similarities.MatrixSimilarity(lsi_model[doc_term_matrix])

## Perform Ranking

In [281]:
def doc_ranking_score(similarities):
    score = []

    for i in similarities:
        t = np.argsort(i,-1)[-3:]
        sol = i[t]
        sol = (list(t+1)[::-1],list(sol)[::-1])
        print(f'The most relevant books and corresponding scores are {sol[0]}  {sol[1]}')
        score.append(sol[0])

    return score

In [276]:
similarities = index[vector_query]

rel = doc_ranking_score(similarities)

The most relevant books and corresponding scores are [2, 1, 4]  [0.6451366, 0.52452683, 0.47689116]
The most relevant books and corresponding scores are [3, 6, 1]  [0.53445834, 0.49757794, 0.4560766]
The most relevant books and corresponding scores are [2, 3, 4]  [0.9622794, 0.9406679, 0.9330468]
The most relevant books and corresponding scores are [2, 1, 4]  [0.5618626, 0.50057507, 0.3176321]
The most relevant books and corresponding scores are [7, 4, 1]  [0.61576915, 0.53625244, 0.51299]


## Calculate key Metrics

In [277]:
def score(rel):
    df = pd.read_csv('Harry_Potter_Query_Scores.csv')

    points_available = 6*len(rel)
    total = []

    for j in range(0,len(rel)):
        s = 0
        for i in rel[j]:
            s+=(df.iloc[j].loc[str(i)])

        total.append(s)

    return print(f'The total points scored for the LSI is {round(100*(sum(total)/points_available),0)}')

In [278]:
score(rel)

The total points scored for the LSI is 53.0


## Create a Simple BM25 Implementation as a Benchmark

In [274]:
from rank_bm25 import BM25Okapi

bm25 = BM25Okapi(processed)
doc_scores = [bm25.get_scores(i) for i in processed_queries] # Scores per document

[array([0.37036529, 0.38783173, 0.33256298, 0.37479915, 0.33736104,
        0.37829639, 1.67905578]),
 array([0.36118587, 0.34467349, 0.50992329, 0.4042604 , 0.40070315,
        0.45081909, 0.49288222]),
 array([0.88938352, 0.90023636, 0.91156764, 0.902524  , 0.91000815,
        0.90566553, 2.20186738]),
 array([0.26182281, 0.35084156, 0.25924586, 0.33936818, 0.3503959 ,
        0.35996471, 1.65165687]),
 array([0.64635467, 1.55034645, 0.65753762, 0.73481593, 1.3164433 ,
        1.81848214, 1.45619786])]

In [282]:
rel = doc_ranking_score(similarities = doc_scores)

The most relevant books and corresponding scores are [7, 2, 6]  [1.679055782916744, 0.3878317283090054, 0.37829638653364905]
The most relevant books and corresponding scores are [3, 7, 6]  [0.5099232898673618, 0.4928822244033498, 0.45081908700215223]
The most relevant books and corresponding scores are [7, 3, 5]  [2.201867375435711, 0.911567642711484, 0.9100081486408287]
The most relevant books and corresponding scores are [7, 6, 2]  [1.6516568701193062, 0.3599647096376914, 0.3508415599113474]
The most relevant books and corresponding scores are [6, 2, 7]  [1.81848214442309, 1.5503464451254871, 1.4561978565949074]


In [283]:
score(rel)

The total points scored for the LSI is 40.0
