In [1]:
import pandas as pd
import random
import numpy as np
import stopwords
import requests

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim
import gensim.downloader as api
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import WordEmbeddingSimilarityIndex

from glob import glob
from dateutil import parser
from tqdm import tqdm
from string import punctuation

import nltk
from nltk.tokenize import TreebankWordTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

print(f"gensim version: {gensim.__version__}")




def df_expl (df):
    desc = {"Columns": df.columns, "Missing": df.isna().sum(), "D_Types": df.dtypes, "Shape": df.shape}
    return desc


gensim version: 4.3.1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mucki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# "booksresults.xlsx" has infrmation from Google Books API replacing missing info (used in CCS2)
books = pd.read_excel("booksresults.xlsx")


In [3]:
# Convert 'generes' column to string of genres separated by comma
books['generes'] = books['generes'].astype(str)
books['generes'] = books['generes'].str.replace(',amp', '').replace('&amp', '').replace(' &', ',')
for i in range(len(books)):
    genres_list = books.loc[i, 'generes']
    if isinstance(genres_list, str):
        genres_list = genres_list.split(',')
        genres_list = [genre.lower().strip() for genre in genres_list]
        genres_str = ','.join(genres_list)
        books.at[i, 'generes'] = genres_str
        
#split each genre into multiple single genres, each in their own row (see "build_a_recommender" for reference)
s = books.apply(lambda x: pd.Series(x['generes'].split(',')), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'single_genre'
books = books.join(s)

# drop all unneccessary coulumns to reduce size
books.drop(['year', 'published_date', 'ISBN', 'page_count', 'publisher', 'language'], axis=1, inplace=True)

books.head()

Unnamed: 0.1,Unnamed: 0,title,author,rating,voters,price,currency,description,generes,single_genre
0,0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428.0,43.28,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,"comics,graphic novels",comics
0,0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428.0,43.28,SAR,NO SAFE PLACE LEFT At great cost to the Garris...,"comics,graphic novels",graphic novels
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23.0,26.15,SAR,Determined to make a new start in her quaint h...,"fiction,mystery &amp,detective,cozy,general",fiction
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23.0,26.15,SAR,Determined to make a new start in her quaint h...,"fiction,mystery &amp,detective,cozy,general",mystery &amp
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23.0,26.15,SAR,Determined to make a new start in her quaint h...,"fiction,mystery &amp,detective,cozy,general",detective


In [4]:
# Convert values to float and drop missing values 
books['rating'] = books['rating'].astype(float).round(1)
books['voters'] = books['voters'].astype(float).round(1)  
books.dropna(subset = ["voters", "rating", "description"], inplace = True)


In [5]:
# Filter by highest ranking/voter ratio

books = books[books['voters'] >= 10]
books['ratio'] = books['rating'] / books['voters']
    

In [6]:
df_expl(books)

{'Columns': Index(['Unnamed: 0', 'title', 'author', 'rating', 'voters', 'price',
        'currency', 'description', 'generes', 'single_genre', 'ratio'],
       dtype='object'),
 'Missing': Unnamed: 0      0
 title           0
 author          0
 rating          0
 voters          0
 price           0
 currency        0
 description     0
 generes         0
 single_genre    0
 ratio           0
 dtype: int64,
 'D_Types': Unnamed: 0        int64
 title            object
 author           object
 rating          float64
 voters          float64
 price           float64
 currency         object
 description      object
 generes          object
 single_genre     object
 ratio           float64
 dtype: object,
 'Shape': (2233, 11)}

In [7]:
print(f"The dataset currently contains {books['title'].nunique()} unique titles.")

The dataset currently contains 187 unique titles.


In [40]:
def genre_filter (df: pd.DataFrame):
    
    while True:
        user_genre=input(f"Enter you favorite genre (leave blank to skip). \n \n Please choose from this available set:\n \n {set(df['single_genre'])}").strip()

        if user_genre == "":
            print("No input. Skip to next step")
            genre_books=df
            
            break
        
        else:
            try:
                
                # Check if user_genre is included in the single_genre column of timed_books dataframe
                genre_books = df[df['single_genre'] == user_genre]
                print(f"You have chosen books from the genre: {genre_books['single_genre'].unique()}.")
                
                break
        
            except ValueError:
                print("Invalid input! Please choose from the available list.")

    genre_books.drop_duplicates(subset=['title', 'description'], inplace=True)
    genre_books.sort_values(by = 'ratio', inplace = True)
    genre_books.to_excel("genrebooks.xlsx")
    
    if len(genre_books) < 1:
            print("There are no books available in the specified genre! The whole dataset will be used.")
            genre_books = df
                              
    print(f"{len(genre_books['title'].unique())} are remaining in the filtered dataset. \n")

    return genre_books



In [41]:
# Call the function with 'fantasy' 
fantasy = genre_filter(books)

Enter you favorite genre (leave blank to skip). 
 
 Please choose from this available set:
 
 {'american', 'development', 'philosophy', 'traditional', 'fitness', 'self-help', 'organizational behavior', 'securities', 'new thought', 'dark fantasy', 'science fiction', 'psychology', 'promotion', 'detective', 'entrepreneurship', 'cognition', 'corporate &amp', 'physiological psychology', 'social themes', 'movements', 'medical', 'alternative history', 'communication &amp', 'brain', 'comics &amp', 'friendship', 'legends &amp', 'bombay (india)', 'legal', 'adventure', 'video &amp', 'social activists', 'online trading', 'fantasy fiction', 'business', 'cognitive psychology &amp', 'classics', 'general', 'popular culture', 'superheroes', 'manga', 'historical', 'horror', 'healthy living', 'body', 'industries', 'personal growth', 'humorous stories', 'coming of age', 'personnel management', 'mystery & detective', 'juvenile fiction', 'spirit', 'police procedural', 'medical (incl. patients)', 'electronic

You have chosen books from the genre: ['fantasy'].
40 are remaining in the filtered dataset. 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.sort_values(by = 'ratio', inplace = True)


In [42]:
# Call the function with 'horror'
horror = genre_filter(books)

Enter you favorite genre (leave blank to skip). 
 
 Please choose from this available set:
 
 {'american', 'development', 'philosophy', 'traditional', 'fitness', 'self-help', 'organizational behavior', 'securities', 'new thought', 'dark fantasy', 'science fiction', 'psychology', 'promotion', 'detective', 'entrepreneurship', 'cognition', 'corporate &amp', 'physiological psychology', 'social themes', 'movements', 'medical', 'alternative history', 'communication &amp', 'brain', 'comics &amp', 'friendship', 'legends &amp', 'bombay (india)', 'legal', 'adventure', 'video &amp', 'social activists', 'online trading', 'fantasy fiction', 'business', 'cognitive psychology &amp', 'classics', 'general', 'popular culture', 'superheroes', 'manga', 'historical', 'horror', 'healthy living', 'body', 'industries', 'personal growth', 'humorous stories', 'coming of age', 'personnel management', 'mystery & detective', 'juvenile fiction', 'spirit', 'police procedural', 'medical (incl. patients)', 'electronic

You have chosen books from the genre: ['horror'].
7 are remaining in the filtered dataset. 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.sort_values(by = 'ratio', inplace = True)


In [43]:
# Call the function with 'political'
political = genre_filter(books)

Enter you favorite genre (leave blank to skip). 
 
 Please choose from this available set:
 
 {'american', 'development', 'philosophy', 'traditional', 'fitness', 'self-help', 'organizational behavior', 'securities', 'new thought', 'dark fantasy', 'science fiction', 'psychology', 'promotion', 'detective', 'entrepreneurship', 'cognition', 'corporate &amp', 'physiological psychology', 'social themes', 'movements', 'medical', 'alternative history', 'communication &amp', 'brain', 'comics &amp', 'friendship', 'legends &amp', 'bombay (india)', 'legal', 'adventure', 'video &amp', 'social activists', 'online trading', 'fantasy fiction', 'business', 'cognitive psychology &amp', 'classics', 'general', 'popular culture', 'superheroes', 'manga', 'historical', 'horror', 'healthy living', 'body', 'industries', 'personal growth', 'humorous stories', 'coming of age', 'personnel management', 'mystery & detective', 'juvenile fiction', 'spirit', 'police procedural', 'medical (incl. patients)', 'electronic

You have chosen books from the genre: ['political'].
1 are remaining in the filtered dataset. 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_books.sort_values(by = 'ratio', inplace = True)


In [44]:
# Results (that will be improved ;D)
print(f"The top books per category are: \n\n Fantasy: {fantasy[:5]} \n\n Horror: {horror[:5]} \n\n Political {political[:5]}")

The top books per category are: 

 Fantasy:       Unnamed: 0                                              title   
69            69  A Game of Thrones: A Song of Ice and Fire: Boo...  \
117          117  The Name of the Wind: The Kingkiller Chronicle...   
285          285  Mistborn Trilogy: The Final Empire, The Well o...   
733          733  A Dance with Dragons: A Song of Ice and Fire: ...   
1190        1190  The Eye of the World: Book One of 'The Wheel o...   

                   author  rating   voters  price currency   
69    George R. R. Martin     4.6  10650.0  39.34      SAR  \
117      Patrick Rothfuss     4.3   4683.0  51.16      SAR   
285     Brandon Sanderson     4.7   2700.0  66.12      SAR   
733   George R. R. Martin     4.5   1799.0  39.34      SAR   
1190        Robert Jordan     4.7   1780.0  36.79      SAR   

                                            description   
69    NOW THE ACCLAIMED HBO SERIES GAME OF THRONES—T...  \
117   'I have stolen princesses back f