# LSI For Text Retrieval
___________________________________________________________

In [1]:
!pip install gensim
!pip install rank-bm25
import numpy as np
import pandas as pd
import requests
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ifeda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data
The model is based on three texts; the aim is to use LSI to retrieve the most relevant text from the collection.
1. Harry Potter and the Philosopher's Stone
2. The Fellowship of the Ring
3. Pride and Prejudice

The texts themselves are available on GitHub and were imported using the following code:

In [2]:
# Raw-text sources for the three books, in the same order as `documents`.
url_list = [
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt",
    "https://raw.githubusercontent.com/ganesh-k13/shell/master/test_search/www.glozman.com/TextPages/01%20-%20The%20Fellowship%20Of%20The%20Ring.txt",
    "https://raw.githubusercontent.com/laumann/ds/master/hashing/books/jane-austen-pride-prejudice.txt"
    ]

# Human-readable titles; position i corresponds to url_list[i].
documents = ["Harry Potter and the Philosopher's Stone","Fellowship of the Ring","Pride and Prejudice"]
text = []

for url in url_list:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of indexing an error page as a book.
    req.raise_for_status()
    text.append(req.text)

## Preprocessing
The text files need to be processed before they can be used in the search engine. The processing methods implemented are as follows:
- Tokenization
- Lower casing
- Stop word removal
- Lemmatization

In [3]:
def preprocessing(text):
    """Turn one raw document into a list of normalised tokens.

    The pipeline is: tokenise on runs of word characters, lower-case,
    drop stop words and purely numeric tokens, then lemmatise each
    remaining token with WordNet.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of processed tokens, in their original order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lowered = [w.lower() for w in tokenizer.tokenize(text)]

    # A set gives constant-time membership tests; the documents are whole
    # books, so scanning a list for every token is needlessly slow.
    stop_words = set(stopwords.words('english'))
    # Corpus-specific noise words on top of the standard English list.
    stop_words.update(['j','page','k','said','rowling','quot','back','mr','mrs'])

    filtered_text = [
        w for w in lowered
        if w not in stop_words and not w.isdigit()
    ]

    lem = WordNetLemmatizer()

    return [lem.lemmatize(w) for w in filtered_text]

In [4]:
# Run every downloaded book through the preprocessing pipeline, keeping the order of `text`.
processed = list(map(preprocessing, text))

### Processed Document Example

In [5]:
processed[0][:10]

['boy',
 'lived',
 'dursley',
 'number',
 'four',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly']

## Queries

Below are example user queries written in natural language terms.

In [6]:
# Example natural-language queries, one per line.
queries = [
    "Who is the ghost that haunts the Hogwarts girls' bathroom?",
    "Which animal is the Patronus of Severus Snape?",
    "Who was the Defense Against the Dark Arts teacher in Harry's third year at Hogwarts",
    "Who was the founder of Slytherin House and what was his gifts",
    "Who is darcy?",
    "Where can I find the one ring to rule them all",
]

### Query Processor
The queries also have to be processed before being mapped into the semantic space. The preprocessing of a query involves:
- Tokenization
- Lower Casing

In [7]:
def query_processing(text):
    """Tokenise a natural-language query and lower-case every token.

    Only tokenisation and lower-casing happen here; no stop-word removal
    or lemmatisation is applied.

    Args:
        text: The query string.

    Returns:
        A list of lower-cased word tokens.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    return [token.lower() for token in word_tokenizer.tokenize(text)]

### Example processed queries

In [8]:
# Normalise every example query, preserving the order of `queries`.
processed_queries = list(map(query_processing, queries))
processed_queries[0]

['who',
 'is',
 'the',
 'ghost',
 'that',
 'haunts',
 'the',
 'hogwarts',
 'girls',
 'bathroom']

## Prepare the Corpus and Indexing to form Doc-Term-Matrix
Using the gensim library, the processed documents were used to build a dictionary of words. Each document was then converted into a bag-of-words vector, and together these vectors form the document-term matrix.

In [9]:
from gensim.corpora import Dictionary

# Vocabulary built from the tokens of all processed books.
X = Dictionary(processed)

# One bag-of-words vector per book, in the same order as `processed`.
doc_term_matrix = list(map(X.doc2bow, processed))

## Implementing the LSI Model
The LSI model was created as below, with the number of topics set to three, one for each text in the collection. The model carries out an SVD calculation to generate the term-concept space.

In [10]:
from gensim.models import LsiModel
# Fit the LSI model on the bag-of-words corpus, keeping three latent topics.
lsi_model = LsiModel(
    corpus=doc_term_matrix,
    num_topics=3,
    id2word=X,
)

### 3 Topics from 3 distinct texts

In [11]:
lsi_model.print_topics(num_words=5)

[(0,
  '0.251*"frodo" + 0.177*"could" + 0.160*"would" + 0.156*"one" + 0.152*"harry"'),
 (1,
  '-0.743*"harry" + -0.205*"potter" + 0.193*"frodo" + -0.191*"ron" + -0.165*"hagrid"'),
 (2,
  '-0.333*"elizabeth" + 0.233*"frodo" + -0.219*"darcy" + -0.175*"bennet" + -0.162*"could"')]

## Convert the Query into LSI Space
The queries are still in natural language terms, so they have to be mapped into the term-concept space.

In [12]:
# Reuse the dictionary the model was trained with (`X`) rather than
# rebuilding an identical one from the whole corpus for every query.
indexed_query = [X.doc2bow(j) for j in processed_queries]
# Project each bag-of-words query into the LSI topic space.
vector_query = [lsi_model[a] for a in indexed_query]
print(vector_query[0])

[(0, 0.010196347685162956), (1, -0.046187303820103366), (2, 0.009154755902625687)]


## Matrix Similarity
The method for document retrieval is based on the cosine similarity between the query and the documents mapped in the concept space.

In [13]:
from gensim import similarities
# Similarity index over the documents after projecting them into LSI space;
# indexing it with a query vector yields one score per document.
index = similarities.MatrixSimilarity(lsi_model[doc_term_matrix])

## Perform Ranking
The following code cells perform the cosine similarity calculations, return the relevant documents, and output the similarity scores between the queries and the documents.

In [14]:
def doc_ranking_score(similarities):
    """Rank the documents for each query from highest to lowest score.

    For every query a line is printed showing the 1-based document numbers
    in ranked order followed by their scores in the same order.

    Args:
        similarities: An iterable holding, for each query, a sequence of
            per-document scores (a NumPy array or a plain list).

    Returns:
        A list with one NumPy array per query containing the 0-based
        document indices sorted by descending score.
    """
    score = []

    for row in similarities:
        # Coerce to an array so plain Python lists support the index-array
        # lookup below instead of raising TypeError.
        row = np.asarray(row)
        t = np.argsort(row,-1)[::-1]
        sol = row[t]
        # .tolist() yields plain ints, so the printed ranks read "[1, 2, 3]"
        # regardless of how the NumPy version formats scalar reprs.
        print(f'The most relevant books and corresponding scores are {(t+1).tolist()}  {sol}')
        score.append(t)

    return score

In [15]:
# Score every document against every query in one call.
# NOTE: this rebinds the global name `similarities`, which until now referred
# to the gensim module imported when `index` was built.
similarities = index[vector_query]
rel = doc_ranking_score(similarities)

The most relevant books and corresponding scores are [1, 2, 3]  [ 9.3354422e-01  1.8608503e-02 -1.7413836e-08]
The most relevant books and corresponding scores are [1, 2, 3]  [0.93685794 0.0285731  0.0054644 ]
The most relevant books and corresponding scores are [1, 2, 3]  [0.9659396  0.16968174 0.03957374]
The most relevant books and corresponding scores are [1, 3, 2]  [0.848066  0.7509906 0.5083917]
The most relevant books and corresponding scores are [3, 1, 2]  [ 8.5409087e-01 -4.9142024e-09 -1.3282470e-08]
The most relevant books and corresponding scores are [2, 3, 1]  [0.98938537 0.5503045  0.4754797 ]


## Calculate key Metrics
As the collection is small, only the single most relevant document is retrieved for each query.

In [16]:
def retrieval(similarities,documents):
    """Return the entry of ``documents`` whose score in ``similarities`` is highest.

    Args:
        similarities: Per-document scores for a single query.
        documents: Document titles, aligned by position with the scores.

    Returns:
        The title at the position of the maximum score.
    """
    best_position = int(np.argmax(similarities))
    return documents[best_position]

In [17]:
# Score every document against the first query only, then pick the best match.
similarities = index[vector_query[0]]
output = retrieval(similarities,documents)
print(output)

Harry Potter and the Philosopher's Stone


In [18]:
def metrics():
    """Return ``(precision, recall)`` as percentages.

    The values are hard-coded rather than measured: a single document is
    retrieved per query and its relevance was judged in advance, so each
    ratio is 1/1.
    """
    relevant_retrieved, retrieved, relevant = 1, 1, 1
    precision_pct = 100 * (relevant_retrieved / retrieved)
    recall_pct = 100 * (relevant_retrieved / relevant)
    return precision_pct, recall_pct

precision,recall = metrics()

# The second value returned by metrics() is recall, so label it as such.
print(f'The Precision is {precision} and the recall is {recall}')

The Precision is 100.0 and the accuracy is 100.0


## Create a Simple BM25 Implementation as a Benchmark

In [19]:
from rank_bm25 import BM25Okapi

# Build a BM25 index over the processed books and score all of them against each query.
bm25 = BM25Okapi(processed)
doc_scores = list(map(bm25.get_scores, processed_queries))

In [20]:
# Rank the books per query using the BM25 scores (passed via the `similarities` keyword).
rel = doc_ranking_score(similarities = doc_scores)

The most relevant books and corresponding scores are [1, 3, 2]  [ 1.22278439  0.         -0.01725772]
The most relevant books and corresponding scores are [1, 3, 2]  [ 2.29452938 -0.00861595 -0.01652686]
The most relevant books and corresponding scores are [1, 3, 2]  [ 2.15656136 -0.04956339 -0.08238967]
The most relevant books and corresponding scores are [1, 2, 3]  [ 1.22875045 -0.02028364 -0.02037731]
The most relevant books and corresponding scores are [3, 2, 1]  [1.27280758 0.         0.        ]
The most relevant books and corresponding scores are [3, 1, 2]  [-0.07273696 -0.07954259 -0.08756572]


In [21]:
# Retrieve with the BM25 scores for the first query; the global `similarities`
# still holds the LSI scores from the earlier cell, not the benchmark's.
retrieval(doc_scores[0], documents)

"Harry Potter and the Philosopher's Stone"

# Full Script
--------------------------

In [22]:
def LSI(query):
    """Return the title of the book most relevant to ``query`` using LSI.

    The three books are downloaded, preprocessed and turned into a
    bag-of-words corpus; a 3-topic LSI model is fitted on it; the query is
    projected into the same topic space; and the title of the document with
    the highest similarity score is returned.

    Everything (download, preprocessing, model fitting) is redone on every
    call.

    Args:
        query: A natural-language query string.

    Returns:
        The title of the best-matching book.

    Raises:
        requests.RequestException: If a book cannot be downloaded.
    """
    from gensim.corpora import Dictionary
    from gensim.models import LsiModel
    from gensim.similarities import MatrixSimilarity

    url_list = [
    "https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt",
    "https://raw.githubusercontent.com/ganesh-k13/shell/master/test_search/www.glozman.com/TextPages/01%20-%20The%20Fellowship%20Of%20The%20Ring.txt",
    "https://raw.githubusercontent.com/laumann/ds/master/hashing/books/jane-austen-pride-prejudice.txt"
    ]

    # Titles aligned by position with url_list.
    documents = ["Harry Potter and the Philosopher's Stone","Fellowship of the Ring","Pride and Prejudice"]

    def preprocessing(text):
        """Tokenise, lower-case, drop stop words and digits, then lemmatise."""
        tokenizer = RegexpTokenizer(r'\w+')
        lowered = [w.lower() for w in tokenizer.tokenize(text)]

        # A set keeps the per-token membership test cheap on book-length input.
        stop_words = set(stopwords.words('english'))
        stop_words.update(['j','page','k','said','rowling','quot','back','mr','mrs'])

        kept = [w for w in lowered if w not in stop_words and not w.isdigit()]

        lem = WordNetLemmatizer()
        return [lem.lemmatize(w) for w in kept]

    def query_processing(text):
        """Tokenise the query and lower-case every token."""
        tokenizer = RegexpTokenizer(r'\w+')
        return [w.lower() for w in tokenizer.tokenize(text)]

    text = []
    for url in url_list:
        # A timeout keeps a stalled connection from hanging the call.
        req = requests.get(url, timeout=60)
        # Fail loudly rather than indexing an HTTP error page as a book.
        req.raise_for_status()
        text.append(req.text)

    processed = [preprocessing(doc) for doc in text]

    X = Dictionary(processed)
    doc_term_matrix = [X.doc2bow(doc) for doc in processed]
    lsi_model = LsiModel(corpus=doc_term_matrix,num_topics=3,id2word=X)

    # Build the similarity index from the model fitted in this call, so the
    # function does not depend on a global `index` existing in the session.
    index = MatrixSimilarity(lsi_model[doc_term_matrix])

    # Reuse the training dictionary `X` instead of rebuilding it for the query.
    vector_query = lsi_model[X.doc2bow(query_processing(query))]

    similarities = index[vector_query]

    # Position of the highest score identifies the best-matching book.
    rel = int(np.argmax(similarities))

    return documents[rel]

In [23]:
LSI('I Love you Darcy')

'Pride and Prejudice'