In [2]:
import pandas as pd
import numpy as np
import requests
import json
import re
import tqdm
import csv
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [16]:
def get_book_data(url: str, api_key: str, write=False):
    try:
        res = requests.get(url=url, params={'api_key': api_key})
        out = json.loads(res.content)
        if write:
            with open('./out.json', 'w+') as f:
                json.dump(out, f, indent=4)
        books = out['items']
        if books and len(books) > 0:
            book_data = []
            for i in range(len(books)):
                book = books[i]
                book_out = {
                    'title': book['volumeInfo']['title'],
                    'author': ' '.join([a for a in book['volumeInfo']['authors']]),
                    'publishedDate': re.match(r'^\d{4}', book['volumeInfo']['publishedDate']).group(), # get year
                    'pageCount': book['volumeInfo']['pageCount'],
                    'categories': ' '.join([c for c in book['volumeInfo']['categories']]),
                    'textSnippet': book['searchInfo']['textSnippet']
                }
                
                book_out['description'] = book['volumeInfo'].get('description', '')
                book_out['ratingsCount'] = book['volumeInfo'].get('ratingsCount', 0)

                book_data.append(book_out)
            return book_data
        else:
            print('no book found for query')
    except requests.exceptions.RequestException as e:
        print(f'error: {e}')

In [17]:
url = 'https://www.googleapis.com/books/v1/volumes?q=intitle:hunger+games'
api_key = 'AIzaSyBl2Zq0HTWgk0tZ0eNj_X5PD85b_O9-lfA'
book_data = get_book_data(url, api_key)

for entry in book_data:
    print(entry)

{'title': 'The Hunger Games', 'author': 'Suzanne Collins', 'publishedDate': '2011', 'pageCount': 302, 'categories': 'Juvenile Fiction', 'textSnippet': '. . . I couldn&#39;t stop reading.&quot; Stephen King, Entertainment Weekly &quot;I was so obsessed with this book. . . . The Hunger Games is amazing.&quot;Stephenie Meyer, author of the Twilightsaga &quot;Brilliantly plotted and perfectly paced.', 'description': 'First in the ground-breaking HUNGER GAMES trilogy. In a vision of the near future, a terrifying reality TV show is taking place. Twelve boys and twelve girls are forced to appear in a live event called The Hunger Games. There is only one rule: kill or be killed. But Katniss has been close to death before. For her, survival is second nature.', 'ratingsCount': 52}
{'title': 'The Ballad of Songbirds and Snakes (A Hunger Games Novel)', 'author': 'Suzanne Collins', 'publishedDate': '2020', 'pageCount': 744, 'categories': 'Young Adult Fiction', 'textSnippet': 'Ambition will fuel him

- **Book features**: Title, author, summary, genres, keywords, theme tags.
- **Vectorization**: Use NLP techniques like TF-IDF or embeddings (e.g., BERT) on summaries or metadata.
- **Similarity search**: Use cosine similarity to recommend books similar to the input.
- **Sentiment analysis**: With Text Blob polarity

In [18]:
book_df = pd.DataFrame(book_data)
for col in ['pageCount', 'ratingsCount']:
    book_df[col] = pd.to_numeric(book_df[col], errors='coerce')

str_cols = book_df.select_dtypes(exclude=['float', 'int']).columns
str_cols = str_cols.drop(['textSnippet', 'description'])
print(str_cols)

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(book_df[str_cols]))
encoded_cols.index = book_df.index
book_df = pd.concat([book_df.drop(columns=str_cols), encoded_cols], axis=1)
book_df.columns = book_df.columns.astype(str)
print(book_df['textSnippet'])


Index(['title', 'author', 'publishedDate', 'categories'], dtype='object')
0    . . . I couldn&#39;t stop reading.&quot; Steph...
1                              Ambition will fuel him.
2    This 10th Anniversary Edition of THE HUNGER GA...
3    Even at the age of sixteen, Katniss Everdeen k...
4    This digital collection includes all three boo...
5    Welcome to Panem, the world of The Hunger Game...
6    The extraordinary, ground breaking New York Ti...
7    Set in a dark vision of the near future, a ter...
8    In a future North America, where the rulers of...
9    Instructors considering this book for use in a...
Name: textSnippet, dtype: object


In [19]:
def preprocess_text(text: str):
    text = text.lower().strip()
    text = re.sub(pattern=r'[^a-zA-Z]', repl=' ', string=text)
    tokens = word_tokenize(text, language='english', preserve_line=True)
    words = stopwords.words()
    tokens = [t for t in tokens if t not in words]
    text = ' '.join(tokens)
    return text

book_df['textSnippetProcessed'] = book_df['textSnippet'].apply(preprocess_text)
book_df['descriptionProcessed'] = book_df['description'].apply(preprocess_text)

Processing of separate dataset

In [20]:
cols = ['id', 'name', 'genre', 'summary']
books_df = pd.DataFrame(columns=cols)

with open('booksummaries.txt', 'r') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in tqdm.tqdm(reader):
        themes = []
        entries = row[5].split(',')
        for entry in entries:
            splits = entry.split('\"')
            themes.append(splits[len(splits)-2])
        books_df.loc[len(books_df)] = {cols[0]: row[0], cols[1]: row[2], cols[2]: ' '.join([t for t in themes]), cols[3]: row[6]}

16559it [00:14, 1172.33it/s]


In [21]:
books_df.head()

Unnamed: 0,id,name,genre,summary
0,620,Animal Farm,Roman \u00e0 clef Satire Children's literature...,"Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Science Fiction Novella Speculative fiction Ut...,"Alex, a teenager living in near-future Englan..."
2,986,The Plague,Existentialism Fiction Absurdist fiction Novel,The text of The Plague is divided into five p...
3,1756,An Enquiry Concerning Human Understanding,,The argument of the Enquiry proceeds by a ser...
4,2080,A Fire Upon the Deep,Hard science fiction Science Fiction Speculati...,The novel posits that space around the Milky ...


In [22]:
books_df['genre'] = books_df['genre'].apply(preprocess_text)
books_df['summary'] = books_df['summary'].apply(preprocess_text)

KeyboardInterrupt: 

In [None]:
print(books_df.columns)
books_df['book info'] = books_df['genre'] + ' ' + books_df['summary']
books_df.drop(columns=['genre', 'summary'], inplace=True)

Index(['id', 'name', 'book info'], dtype='object')


KeyError: 'genre'

In [3]:
def load_df(from_file = False):
    return pd.read_csv('./books_out.csv') if from_file else books_df
books_df = load_df(from_file = True)

tf = TfidfVectorizer(analyzer = "word", ngram_range = (1,2), min_df = 0.1, stop_words = 'english')

tfidf_matrix = tf.fit_transform(books_df['book info'])

cosine_sim =  cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.19737192 0.13840422 ... 0.16538832 0.15853538 0.13765088]
 [0.19737192 1.         0.27824983 ... 0.09571242 0.         0.2179447 ]
 [0.13840422 0.27824983 1.         ... 0.10575205 0.11310744 0.14513411]
 ...
 [0.16538832 0.09571242 0.10575205 ... 1.         0.         0.1640478 ]
 [0.15853538 0.         0.11310744 ... 0.         1.         0.        ]
 [0.13765088 0.2179447  0.14513411 ... 0.1640478  0.         1.        ]]


In [7]:
from gensim.models import Word2Vec

training_data = ' '.join([f'{books_df.loc[i, 'name']} {books_df.loc[i, 'book info']}' for i in range(len(books_df))])
model = Word2Vec(sentences = training_data, window=10)

In [8]:
def get_similarity_score(first: str, second: str):
    simplify = lambda txt: txt.lower().replace(' ', '')
    first = simplify(first)
    second = simplify(second)
    return model.wv.similarity(first, second)
    

# TODO implement similarity search depending on whether title is close to actual title
def recommend(title: str, top_n=10):
    indices = pd.Series(books_df['name'])
    titles = indices.values
    if title not in indices.values:
        return 'title not found in database'
    recs = []
    index = indices[indices == title].index[0]
    scores = pd.Series(cosine_sim[index]).sort_values(ascending=False)
    top_n_indices = list(scores.iloc[1:top_n].index)

    for i in top_n_indices:
        recs.append(list(books_df['name'])[i])
    return recs

get_similarity_score('hi friendsF', 'hie')

KeyError: "Key 'hifriendsf' not present"