In [1]:
import pandas as pd
import random
import numpy as np
import stopwords
import requests

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import gensim
import gensim.downloader as api
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import WordEmbeddingSimilarityIndex

from glob import glob
from dateutil import parser
from tqdm import tqdm
from string import punctuation

import nltk
from nltk.tokenize import TreebankWordTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

print(f"gensim version: {gensim.__version__}")




def df_expl (df):
    desc = {"Columns": df.columns, "Missing": df.isna().sum(), "D_Types": df.dtypes, "Shape": df.shape}
    return desc


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mucki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


gensim version: 4.3.1


In [23]:
# "booksresults.xlsx" has infrmation from Google Books API replacing missing info (used in CCS2)
books = pd.read_excel("booksresults.xlsx")


# drop all unneccessary coulumns to reduce size
books.drop(['Unnamed: 0', 'price', 'currency',  'published_date', 'ISBN', 'page_count', 'publisher', 'language'], axis=1, inplace=True)

# Convert values to float and drop missing values/duplicates 
books['rating'] = books['rating'].astype(float).round(1)
books['voters'] = books['voters'].astype(float).round(1)  

books.dropna(subset = ["voters", "rating", "description"], inplace = True)
books.drop_duplicates(subset=['title', 'description'], inplace=True)

# Filter by highest ranking/voter ratio
#books = books[books['voters'] >= 10]
books['ratio'] = books['rating'] / books['voters']

books['generes'] = books['generes'].astype(str)
books["generes"] = books["generes"].str.replace(" &amp,", " & ")

genres = books["generes"].unique().tolist()
genres = [g.replace("&amp,", "& ") for g in genres]

books.head()

Index(['Unnamed: 0', 'title', 'author', 'rating', 'voters', 'price',
       'currency', 'description', 'publisher', 'page_count', 'generes', 'ISBN',
       'language', 'published_date', 'year'],
      dtype='object')


Unnamed: 0,title,author,rating,voters,description,generes,year,ratio
0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428.0,NO SAFE PLACE LEFT At great cost to the Garris...,"comics, graphic novels",2014.0,0.010748
1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23.0,Determined to make a new start in her quaint h...,"fiction,mystery & detective,cozy,general",2007.0,0.143478
2,The Art of Super Mario Odyssey,Nintendo,3.9,9.0,Take a globetrotting journey all over the worl...,"games & activities,video & electronic",2019.0,0.433333
3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,10.0,"With swollen feet and swelling belly, pregnant...",fiction,2009.0,0.4
4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,577.0,The stunning debut fantasy novel from author P...,"fiction,fantasy,dark fantasy",2009.0,0.007799


In [40]:
# Here's a list of the unique genres, useful to check spelling ;)
genres_j = ",".join(genres)
genres_sp = genres_j.split(",")
unique = set(genres_sp)

unique

{' autobiography',
 ' economics',
 ' graphic novels',
 ' mind',
 ' spirit',
 '20th century',
 'accounting',
 'action & adventure',
 'amateur sleuth',
 'assassination',
 'biography',
 'biography & autobiography',
 'body',
 'bombay (india)',
 'brain',
 'budgeting',
 'business',
 'business & economics',
 'classics',
 'comics',
 'comics & graphic novels',
 'communication & social skills',
 'computers',
 'contemporary',
 'corporate finance',
 'cozy',
 'crime',
 'dark fantasy',
 'dragons & mythical creatures',
 'drama',
 'drug couriers',
 'economics',
 'electronic commerce',
 'environmental conservation & protection',
 'epic',
 'fairy tales',
 'fantasy',
 'fantasy & magic',
 'fiction',
 'financial',
 'folk tales',
 'games & activities',
 'gay',
 'gender studies',
 'general',
 'hard-boiled',
 'history',
 'horror',
 'humorous stories',
 'industries',
 'juvenile fiction',
 'leadership',
 'legends & mythology',
 'lgbt',
 'literary',
 'literary figures',
 'management',
 'media tie-in',
 'medical 

In [25]:
print(f"The dataset currently contains {books['title'].nunique()} unique titles.\n")
df_expl(books)


The dataset currently contains 229 unique titles.



{'Columns': Index(['title', 'author', 'rating', 'voters', 'description', 'generes', 'year',
        'ratio'],
       dtype='object'),
 'Missing': title          0
 author         0
 rating         0
 voters         0
 description    0
 generes        0
 year           1
 ratio          0
 dtype: int64,
 'D_Types': title           object
 author          object
 rating         float64
 voters         float64
 description     object
 generes         object
 year           float64
 ratio          float64
 dtype: object,
 'Shape': (230, 8)}

In [46]:
def by_genre (genre: str, df: pd.DataFrame): # Input the genre (spelled correctly!) and respective DataFrame
    
    if genre in unique:
        subset = df["generes"].str.contains(genre, case=False, na=False) # Create a subset containing the genre
        filtered = df[subset]                                            # Create new dataframe based on subset
    
        filtered.drop_duplicates(subset=['title', 'description'], inplace=True)
        filtered.sort_values(by = 'ratio', inplace = True)               # Sort by ratio, results seem plausible 
    
        print(f"The top 3 books (out of {len(filtered)}) in {genre} are:\n\n{filtered.head(3)}\n\n")
        # Remove the line above if you wanna use the 'filtered'  DataFrame for another filter ;)
        
        return filtered

    else:
        print(f"The genre {genre} is not in the data base. Please pick from the following: \n {unique}")
        
by_genre("action & adventure", books)

The top 3 books (out of 8) in action & adventure are:

                                                 title               author   
239  A Dance with Dragons: A Song of Ice and Fire: ...  George R. R. Martin  \
29                                        Morning Star         Pierce Brown   
162                                 Thrawn (Star Wars)         Timothy Zahn   

     rating  voters                                        description   
239     4.5  1799.0  #1 NEW YORK TIMES BESTSELLER • THE BOOK BEHIND...  \
29      4.8   886.0  #1 NEW YORK TIMES BESTSELLER • Red Rising thri...   
162     4.7   588.0  NEW YORK TIMES BESTSELLER • In this definitive...   

                                        generes    year     ratio  
239                  fiction,action & adventure  2011.0  0.002501  
29   fiction,science fiction,action & adventure  2016.0  0.005418  
162                  fiction,action & adventure  2017.0  0.007993  




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.sort_values(by = 'ratio', inplace = True)               # Sort by ratio, results seem plausible


Unnamed: 0,title,author,rating,voters,description,generes,year,ratio
239,A Dance with Dragons: A Song of Ice and Fire: ...,George R. R. Martin,4.5,1799.0,#1 NEW YORK TIMES BESTSELLER • THE BOOK BEHIND...,"fiction,action & adventure",2011.0,0.002501
29,Morning Star,Pierce Brown,4.8,886.0,#1 NEW YORK TIMES BESTSELLER • Red Rising thri...,"fiction,science fiction,action & adventure",2016.0,0.005418
162,Thrawn (Star Wars),Timothy Zahn,4.7,588.0,NEW YORK TIMES BESTSELLER • In this definitive...,"fiction,action & adventure",2017.0,0.007993
264,The Shadow Rising: Book Four of 'The Wheel of ...,Robert Jordan,4.7,567.0,The Wheel of Time ® is a PBS Great American Re...,"fiction,fantasy,action & adventure",2010.0,0.008289
456,Wedge's Gamble: Star Wars Legends (X-Wing),Michael A. Stackpole,4.5,87.0,"Sleek, swift, and deadly, they are the X-wing ...","fiction,science fiction,action & adventure",2011.0,0.051724
847,We,Yevgeny Zamyatin,4.3,49.0,Yevgeny Zamyatin's We is set in an urban glass...,"fiction,science fiction,action & adventure",2013.0,0.087755
217,Moby Dick. Illustrated edition,Melville Herman,5.0,3.0,The story of the novel created by the famous A...,"fiction,action & adventure",2018.0,1.666667
36,Salvaged,Madeleine Roux,4.0,1.0,A WOMAN ON THE RUN. A CAPTAIN ADRIFT IN SPACE....,"fiction,science fiction,action & adventure",2019.0,4.0


In [47]:
'''
I looked over the dataset and those are the most popular genres with enough titles.
Feel free to explore other genres aswell tho ;D
'''

fantasy = by_genre("fantasy", books)
scie_fi = by_genre("science fiction", books)
myst_det = by_genre("mystery & detective", books)
bus_eco = by_genre("business & economics", books)
self_help = by_genre("self-help", books)

The top 3 books (out of 22) in fantasy are:

                                                 title               author   
69   A Game of Thrones: A Song of Ice and Fire: Boo...  George R. R. Martin  \
117  The Name of the Wind: The Kingkiller Chronicle...     Patrick Rothfuss   
285  Mistborn Trilogy: The Final Empire, The Well o...    Brandon Sanderson   

     rating   voters                                        description   
69      4.6  10650.0  NOW THE ACCLAIMED HBO SERIES GAME OF THRONES—T...  \
117     4.3   4683.0  'I have stolen princesses back from sleeping b...   
285     4.7   2700.0  This discounted ebundle includes: Mistborn: Th...   

                   generes    year     ratio  
69    fiction,fantasy,epic  2003.0  0.000432  
117  fiction,fantasy,urban  2010.0  0.000918  
285   fiction,fantasy,epic  2011.0  0.001741  


The top 3 books (out of 10) in science fiction are:

                                                title        author  rating   
29             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.sort_values(by = 'ratio', inplace = True)               # Sort by ratio, results seem plausible
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered.drop_duplicates(subset=['title', 'description'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.