In [1]:
import pandas as pd
import numpy as np

from transformers import pipeline
import torch

In [2]:
# Read the cleaned books table from disk into a DataFrame.
books = pd.read_csv(filepath_or_buffer="../data/books_cleaned.csv")

In [3]:
# Number of books under each raw category label, most frequent first.
books.categories.value_counts()

categories
Fiction                      2075
Juvenile Fiction              377
Biography & Autobiography     302
History                       203
Literary Criticism            121
                             ... 
Egypt                           1
Conspiracies                    1
Brothers and sisters            1
Rock musicians                  1
Indic fiction (English)         1
Name: count, Length: 475, dtype: int64

In [4]:
# books[books.categories == "Juvenile Fiction"]

In [5]:
# books[books.categories == "Juvenile Nonfiction"]

In [6]:
# Candidate labels for the classifier: the ten most frequent raw categories
# plus a catch-all bucket.
# The names are read from the index of `value_counts()` rather than from
# `reset_index()`, whose column naming changed in pandas 2.0 (before that the
# "categories" column held the counts, not the names).
categories = books["categories"].value_counts().index[:10].tolist()
categories.append("Miscellaneous")
categories

['Fiction',
 'Juvenile Fiction',
 'Biography & Autobiography',
 'History',
 'Literary Criticism',
 'Comics & Graphic Novels',
 'Philosophy',
 'Religion',
 'Drama',
 'Science',
 'Miscellaneous']

In [7]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use mps:0


In [8]:
# Keep the raw category only when it is one of the candidate labels; every
# other row becomes missing and is filled in by the classifier later.
# `Series.where` keeps this a pandas operation (index-aligned, pandas
# missing-value handling) instead of round-tripping through `np.where`, which
# relies on NumPy promoting strings and float NaN to a common dtype.
books["simple_categories"] = books["categories"].where(books["categories"].isin(categories))

In [9]:
def generate_predictions(sequence, categories):
    """Return the candidate label the zero-shot pipeline scores highest.

    Parameters
    ----------
    sequence
        Text passed to the module-level ``pipe`` as the input to classify.
    categories
        Candidate labels passed to ``pipe`` alongside the text.

    Returns
    -------
    The entry of the result's ``"labels"`` list at the position of the
    largest value in its ``"scores"`` list.
    """
    result = pipe(sequence, categories)
    best_position = np.argmax(result["scores"])
    return result["labels"][best_position]

In [None]:
from tqdm import tqdm

# Upper bound on how many already-labelled books are used to check the classifier.
N_EVAL_SAMPLES = 1000

# Build the labelled subset once instead of re-filtering the whole frame on
# every iteration. `head` also caps the sample at the number of rows that
# actually exist, so the loop cannot index past the end when fewer than
# N_EVAL_SAMPLES labelled books are available.
labelled_books = books.loc[books.simple_categories.isin(categories)].head(N_EVAL_SAMPLES)

actual_cats = []
predicted_cats = []

for description, known_category in tqdm(
    zip(labelled_books["description"], labelled_books["simple_categories"]),
    total=len(labelled_books),
):
    predicted_cats.append(generate_predictions(description, categories))
    actual_cats.append(known_category)

# Persist both label lists so the evaluation cells can be redone without
# repeating the slow inference loop.
np.save("../data/actual_cats.npy", actual_cats)
np.save("../data/predicted_cats.npy", predicted_cats)

 24%|██▎       | 237/1000 [14:14<32:06,  2.52s/it]  

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Per-label precision / recall / F1 of the zero-shot predictions against the
# known categories.
report = classification_report(y_true=actual_cats, y_pred=predicted_cats)
# The report is a pre-formatted multi-line string: print it so the table is
# rendered, rather than echoing its repr with literal "\n" escapes.
print(report)

In [None]:
# Fix the label order explicitly so the matrix rows/columns and the axis tick
# labels are guaranteed to agree. The sorted union of true and predicted
# labels is the same ordering `confusion_matrix` uses by default.
cm_labels = sorted(set(actual_cats) | set(predicted_cats))
cm = confusion_matrix(y_true=actual_cats, y_pred=predicted_cats, labels=cm_labels)
# Without `display_labels` the axes only show 0..n-1, which makes the plot
# unreadable; rotate the x tick labels so the long category names do not overlap.
disp = ConfusionMatrixDisplay(cm, display_labels=cm_labels)
disp.plot(xticks_rotation="vertical")
plt.show()

In [None]:
# Books without a simplified category: keep only the identifier and the text
# to classify, re-indexed from 0.
missing_cats = (
    books.loc[books["simple_categories"].isna(), ["isbn13", "description"]]
    .reset_index(drop=True)
)

# Accumulators filled by the prediction loop.
isbns, predicted = [], []

In [None]:
# Start from empty accumulators every time this cell runs, so re-running it
# cannot append a second copy of the predictions to lists left over from a
# previous run.
isbns = []
predicted = []

# Walk the two columns in lockstep instead of indexing by position.
for isbn, description in tqdm(
    zip(missing_cats["isbn13"], missing_cats["description"]),
    total=len(missing_cats),
):
    predicted.append(generate_predictions(description, categories))
    isbns.append(isbn)

# Persist the results so the slow inference loop does not have to be repeated.
np.save("../data/isbns.npy", isbns)
np.save("../data/actual_predictions.npy", predicted)

In [None]:
# One row per classified book: its ISBN-13 and the predicted category.
missing_predicted_df = pd.DataFrame(
    data={
        "isbn13": isbns,
        "predicted_categories": predicted,
    }
)