In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import pipeline
import torch

In [2]:
# Read the cleaned books table; the relative path is resolved against the
# notebook's working directory.
books_path = "../data/books_cleaned.csv"
books = pd.read_csv(books_path)

In [3]:
# Number of books under each raw category label, most frequent first.
books.categories.value_counts()

categories
Fiction                      2075
Juvenile Fiction              377
Biography & Autobiography     302
History                       203
Literary Criticism            121
                             ... 
Egypt                           1
Conspiracies                    1
Brothers and sisters            1
Rock musicians                  1
Indic fiction (English)         1
Name: count, Length: 475, dtype: int64

In [4]:
# books[books.categories == "Juvenile Fiction"]

In [5]:
# books[books.categories == "Juvenile Nonfiction"]

In [6]:
# Build the label set: the most frequent raw categories plus a catch-all
# "Miscellaneous" label.
# The labels are read from the value_counts() index rather than from a column
# of reset_index(): the column names produced by reset_index() differ between
# pandas 1.x and 2.x, and on 1.x the "categories" column would hold the counts.
top_n_categories = 10
categories = books["categories"].value_counts().head(top_n_categories).index.tolist()
categories.append("Miscellaneous")
categories

['Fiction',
 'Juvenile Fiction',
 'Biography & Autobiography',
 'History',
 'Literary Criticism',
 'Comics & Graphic Novels',
 'Philosophy',
 'Religion',
 'Drama',
 'Science',
 'Miscellaneous']

In [7]:
# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint.
pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use mps:0


In [8]:
# Keep the raw category only where it is one of the target labels; every other
# row (including rows with no category) becomes NaN.
in_label_set = books["categories"].isin(categories)
books["simple_categories"] = books["categories"].where(in_label_set, np.nan)

In [9]:
def generate_predictions(sequence, categories):
    """Return the candidate label that the classifier scores highest.

    Args:
        sequence: Text handed to the module-level ``pipe`` classifier.
        categories: Candidate labels handed to ``pipe`` alongside ``sequence``.

    Returns:
        The entry of the result's ``"labels"`` found at the position of the
        largest value in the result's ``"scores"``.
    """
    result = pipe(sequence, categories)
    best_position = np.argmax(result["scores"])
    return result["labels"][best_position]

In [10]:
from tqdm import tqdm

# actual_cats = []
# predicted_cats = []

# for i in tqdm(range(1000)):
#     entry = books.loc[books.simple_categories.isin(categories)].iloc[i]
#     sequence = entry["description"]
#     predicted_cats.append(generate_predictions(sequence, categories))
#     actual_cats.append(entry.simple_categories)

# np.save("../data/actual_cats.npy", actual_cats)
# np.save("../data/predicted_cats.npy", predicted_cats)

In [11]:
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# report = classification_report(y_true=actual_cats, y_pred=predicted_cats)

# print(report)

In [12]:
# class_names = np.unique(actual_cats + ["Misc"])
# cm = confusion_matrix(y_true=actual_cats, y_pred=predicted_cats)
# disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
# disp.plot()
# plt.xticks(rotation=90)
# plt.show()

In [13]:
# Accumulators for the ISBNs and the predicted labels of the books that still
# lack a simple category.
isbns, predicted = [], []

# Rows without a simple category, reduced to the identifier and the text used
# for classification, re-indexed from zero.
needs_category = books["simple_categories"].isna()
missing_cats = books.loc[needs_category, ["isbn13", "description"]].reset_index(drop=True)

In [None]:
import os

# Classifying every uncategorised description is slow, so the results are
# cached on disk and reused on later runs. With the loop merely commented out,
# a fresh "Restart & Run All" left `isbns` / `predicted` empty, so no predicted
# category was ever merged back into `books`.
isbns_cache_path = "../data/isbns.npy"
predictions_cache_path = "../data/actual_predictions.npy"

if os.path.exists(isbns_cache_path) and os.path.exists(predictions_cache_path):
    isbns = np.load(isbns_cache_path).tolist()
    predicted = np.load(predictions_cache_path).tolist()
else:
    isbns, predicted = [], []

# The cache is only trusted when it lines up row-for-row with the books that
# currently need a category; otherwise the predictions are recomputed.
cache_is_valid = (
    len(isbns) == len(predicted)
    and isbns == missing_cats["isbn13"].tolist()
)

if not cache_is_valid:
    isbns, predicted = [], []
    for isbn, description in tqdm(
        zip(missing_cats["isbn13"], missing_cats["description"]),
        total=len(missing_cats),
    ):
        predicted.append(generate_predictions(description, categories))
        isbns.append(isbn)

    np.save(isbns_cache_path, isbns)
    np.save(predictions_cache_path, predicted)

100%|██████████| 1534/1534 [57:12<00:00,  2.24s/it] 


In [15]:
# One row per classified book: its ISBN-13 and the predicted category label.
prediction_columns = {"isbn13": isbns, "predicted_categories": predicted}
missing_predicted_df = pd.DataFrame(prediction_columns)

In [17]:
# Display the ISBN / predicted-category table as the cell output.
missing_predicted_df

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Philosophy
2,9780006280934,Religion
3,9780006380832,History
4,9780006470229,Fiction
...,...,...
1529,9788125026600,Philosophy
1530,9788171565641,Literary Criticism
1531,9788172235222,Drama
1532,9788173031014,Religion


In [18]:
# Attach the predicted labels by ISBN, use them wherever the simple category is
# still missing, then discard the helper column.
books = (
    books.merge(missing_predicted_df, on="isbn13", how="left")
    .assign(
        simple_categories=lambda merged: merged["simple_categories"].fillna(
            merged["predicted_categories"]
        )
    )
    .drop(columns=["predicted_categories"])
)

In [19]:
# Missing values remaining in each column.
books.isna().sum()

Unnamed: 0             0
isbn13                 0
authors               30
categories            28
description            0
published_year         0
average_rating         0
num_pages              0
ratings_count          0
title_and_subtitle     0
tagged_description     0
simple_categories      0
dtype: int64

In [22]:
# Write the table to CSV without the DataFrame index.
output_path = "../data/books_with_categories.csv"
books.to_csv(output_path, index=False)