# Book Recommedation
### Topic Modeling using LDA
#### LDA - Unsupervised ML model

LDA predicts propotion of topics each document(input)

Idea is to make recommedation based on inequality of topic propotions predicted by model

In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
from tqdm import tqdm

### Load CMU book summary dataset for Training model
http://www.cs.cmu.edu/~dbamman/booksummaries.html

In [2]:
records = []

with open("booksummaries.txt", 'r',encoding="utf8") as f:
    data_reader = csv.reader(f, dialect='excel-tab')
    for row in tqdm(data_reader):
        records.append(row)

16559it [00:00, 30035.20it/s]


#### strip summary info as we only need summary for training

In [4]:
summary = []

#summary is present at index 6 according to CMU site
for i in tqdm(records):
    summary.append(i[6])

books = pd.DataFrame({'summary': summary})
books.head(2)

100%|███████████████████████████████████████████████████████████████████████| 16559/16559 [00:00<00:00, 1598505.83it/s]


Unnamed: 0,summary
0,"Old Major, the old boar on the Manor Farm, ca..."
1,"Alex, a teenager living in near-future Englan..."


remove stopwords and punctuations
and Lemmatize words into base form

In [5]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [6]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [8]:
doc_clean = [clean(doc).split() for doc in tqdm(books['summary'])]

100%|███████████████████████████████████████████████████████████████████████████| 16559/16559 [00:17<00:00, 954.23it/s]


Building Dictionary 

In [10]:
import gensim

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = gensim.corpora.Dictionary(doc_clean)

# convert our books_summaries into numerical data using doc to bag of words
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tqdm(doc_clean)]

100%|██████████████████████████████████████████████████████████████████████████| 16559/16559 [00:02<00:00, 6537.76it/s]


## Creation and training of LDA

In [11]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

Set parameters of LDA

In [13]:
num_topics = 5
passes = 5
eta = [0.01]*len(dictionary.keys())
alpha = [0.01]*num_topics

In [16]:
ldamodel = Lda(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=passes, alpha=alpha,eta=eta)

# Prediction using Trained Model

In [17]:
def predict(summary):
    clean_summary = clean(summary).split()
    doc_trm_matrix = dictionary.doc2bow(clean_summary)
    return ldamodel[doc_trm_matrix]

### 1984
#### George Orwell

In [18]:
book_1984 = """A man loses his identity while living under a repressive regime. \
            In a story based on George Orwell's classic novel, Winston Smith (John Hurt) is a government employee whose job involves the rewriting of history in a manner that casts his fictional country's leaders in a charitable light. \
            His trysts with Julia (Suzanna Hamilton) provide his only measure of enjoyment, but lawmakers frown on the relationship -- and in this closely monitored society, there is no escape from Big Brother."""

In [19]:
print('for Book : 1984')
print(predict(book_1984))

for Book : 1984
[(2, 0.2600291), (3, 0.73927027)]


### Harry Potter and the Sorcerer's Stone
#### J K Rowling

In [20]:
book_harry1 = """Adaptation of the first of J.K. Rowling's popular children's novels about Harry Potter, \
                a boy who learns on his eleventh birthday that he is the orphaned son of two powerful wizards\
                and possesses unique magical powers of his own. He is summoned from his life as an unwanted child to \
                become a student at Hogwarts, an English boarding school for wizards. There, \
                he meets several friends who become his closest allies \
                and help him discover the truth about his parents' mysterious deaths."""

In [22]:
print('for Book : Harry Potter and the Sorcerer\'s Stone')
print(predict(book_harry1))

for Book : Harry Potter and the Sorcerer's Stone
[(1, 0.3311144), (2, 0.08902878), (3, 0.16001026), (4, 0.41962406)]
