In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial
from sklearn.metrics.pairwise import linear_kernel

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/mark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stop = stopwords.words('english')

In [4]:
br_df= pd.read_csv("book_sum.csv")
br_df =br_df.iloc[:,1:]

In [5]:
br_df.head()

Unnamed: 0,book_id,freebase_id,book_title,author,pub_date,book_genre,summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"['Roman à clef', 'Satire', ""Children's literat...","Old Major, the old boar on the Manor Farm, ca..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"['Science Fiction', 'Novella', 'Speculative fi...","Alex, a teenager living in near-future Englan..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"['Existentialism', 'Fiction', 'Absurdist ficti...",The text of The Plague is divided into five p...
3,2152,/m/0x5g,All Quiet on the Western Front,Erich Maria Remarque,1929-01-29,"['War novel', 'Roman à clef']","The book tells the story of Paul Bäumer, a Ge..."
4,2890,/m/011zx,A Wizard of Earthsea,Ursula K. Le Guin,1968,"[""Children's literature"", 'Fantasy', 'Speculat...","Ged is a young boy on Gont, one of the larger..."


In [6]:
br_df['clean_summary']=br_df['summary'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [7]:
data = br_df.loc[:,['book_title','summary','clean_summary']].copy() 
data.reset_index(level = 0, inplace = True)

In [8]:
indices = pd.Series(data.index, index = data['book_title'])

In [9]:
#Converting the book title into vectors and used bigram
tf = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df = 1, stop_words='english')
tfidf_matrix = tf.fit_transform(data['clean_summary'])

In [10]:
# Calculating the similarity measures based on Cosine Similarity
sg = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
idx = indices['Animal Farm']

In [12]:
print(idx)

0


In [13]:
sig = list(enumerate(sg[idx]))

In [14]:
sig[1:5]

[(1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0)]

In [15]:
sig = sorted(sig, key=lambda x: x[1], reverse=True)

In [16]:
sig

[(0, 0.9999999999999982),
 (6746, 0.012022820634668495),
 (4925, 0.007644580052096597),
 (4553, 0.0072332902899644004),
 (3696, 0.00701044612234915),
 (7186, 0.006541530553021099),
 (887, 0.0065146652742705865),
 (8144, 0.006303073913110872),
 (2607, 0.005741409085895621),
 (4793, 0.0056915295177419455),
 (6691, 0.005590640397534954),
 (2352, 0.005304019685751537),
 (1058, 0.00493262182654255),
 (3867, 0.004734271174169498),
 (3310, 0.0046426365798176925),
 (7151, 0.004530280804518196),
 (3077, 0.004387702904995243),
 (1721, 0.004161870652354175),
 (3537, 0.0040028103290452),
 (5815, 0.004001810415039532),
 (8576, 0.003891220458451734),
 (4386, 0.0037889766256805186),
 (3963, 0.0036948494970735165),
 (7617, 0.0036641294482421018),
 (4988, 0.0035946621085176655),
 (6016, 0.003588874622262852),
 (1382, 0.003405200005926319),
 (3333, 0.003378373354417254),
 (6333, 0.0033666486853632135),
 (7575, 0.0033578238371303115),
 (2577, 0.0033109380798335944),
 (4167, 0.003236406577310393),
 (8745,

In [17]:
sig = sig[1:6]

In [18]:
book_indices= [i[0] for i in sig]

In [19]:
rec = data[['book_title', 'summary']].iloc[book_indices]

In [20]:
rec

Unnamed: 0,book_title,summary
6746,The Monster Bed,The introduction starts the book in the setti...
4925,Victory,"Through a business misadventure, the European..."
4553,My Friend Leonard,The novel begins with Frey's release from an ...
3696,Punk Farm,Krosoczka's book tells the story of five farm...
7186,Edgar Huntly,"Edgar Huntly, a young man who lives with his ..."


In [21]:
data.to_csv("recommed.csv",index=False)

In [22]:
data.columns

Index(['index', 'book_title', 'summary', 'clean_summary'], dtype='object')

In [23]:
rec = data[['book_title', 'summary']].iloc[book_indices]

In [24]:
rec.book_title.values

array(['The Monster Bed', 'Victory', 'My Friend Leonard', 'Punk Farm',
       'Edgar Huntly'], dtype=object)

In [25]:
br_df[br_df.book_title.isin(rec.book_title.values)].book_id

3696     4656302
4553     6118585
4925     7111635
6746    13685257
7186    16029050
Name: book_id, dtype: int64

In [26]:
indices.head()

book_title
Animal Farm                       0
A Clockwork Orange                1
The Plague                        2
All Quiet on the Western Front    3
A Wizard of Earthsea              4
dtype: int64