In [10]:
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [32]:
df = pd.read_csv('data_in/Books.tsv', sep='\t', error_bad_lines=False)

In [33]:
df.isnull().sum()

bookId                0
bookTitle            20
bookSeries        16663
bookAuthors          20
ratingValue          20
ratingCount          20
reviewCount          20
Plot                598
NumberofPages      1158
PublishingDate      788
Characters        20348
Settings          21919
url                   0
dtype: int64

In [34]:
df = df.dropna(subset=['bookTitle', 'bookId', 'url'])

In [5]:
def detect_stable(s):
    if type(s) == str:
        try:
            l = detect(s)
            return l
        except LangDetectException as e:
            return None
    else:
        return None

In [6]:
df['lang'] = df['Plot'].apply(detect_stable)

In [7]:
df = df[df['lang'] == 'en']

In [8]:
df.shape

(26572, 14)

In [9]:
df.to_csv('../data_in/Books_processed.csv')

In [9]:
from nltk.stem import SnowballStemmer
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [18]:
stem = SnowballStemmer(language='english')

In [12]:
df['Plot_nopunct'] = df['Plot'].apply(lambda x: x.translate(str.maketrans('', '', punctuation)))

In [13]:
df['tokens'] = df['Plot'].apply(word_tokenize)

In [14]:
df['tokens2'] = df['Plot_nopunct'].apply(word_tokenize)

In [15]:
stopwordset= set(stopwords.words())

In [16]:
df['tokens_nostopwords'] = df['tokens2'].apply(lambda x: [w.lower() for w in x if w.lower() not in stopwordset])

In [17]:
df['tokens_stemmed'] = df['tokens_nostopwords'].apply(lambda x: [stem.stem(w) for w in x])

In [18]:
vocabulary = {}

In [19]:
def index_builder(plot, book_id):
    for token in plot:
        i = vocabulary.get(token)
        if i is None:
            vocabulary[token] = [book_id]
        if i is not None:
            vocabulary[token].append(book_id)
    return True

In [20]:
isok = df.apply(lambda x: index_builder(x['tokens_stemmed'], x['bookId']), axis=1).all()

In [21]:
isok

True

In [22]:
len(vocabulary.keys())

87428

In [23]:
df.to_csv('../data_in/Books_processed.csv', index=False)

In [4]:
import json

In [7]:
#with open('../data_out/vocab1.json', 'w+') as f:
#    json.dump(vocabulary, f)
    
with open('data_out/vocab1.json') as f:
    vocabulary = json.load(f)

In [35]:
from IPython.core.display import display, HTML

query = input()

tokens = [stem.stem(w.lower()) for w in word_tokenize(query)]
first = True
for t in tokens:
    if first:
        results = set(vocabulary[t])
        first = False
    else:
        results = results.intersection(set(vocabulary[t]))

serp_at_5 = df[df['bookId'].isin(results)][['bookTitle', 'Plot', 'url']][:5]
for ind in range(len(serp_at_5)):
    s = serp_at_5.iloc[ind]
    display(HTML(f"""<h3><a href="{s["url"]}" target="_blank">{s["bookTitle"]}</a></h3>
                     <p>{s["Plot"]}</p>"""))

america gods


In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lozzi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [15]:
import nltk