In [1]:
# We'll use a pre-trained LLM to tidy up the messy `categories` field by classifying each book entry as either fiction or nonfiction (for example).
# The technique we'll use is text classification.
# LLMs that have seen enough categorisation data are able to assign text to classes they were never explicitly trained on.
# This is known as zero-shot classification.

In [1]:
import pandas as pd
from zmq.sugar import device

# Load the cleaned book metadata from CSV.
books = pd.read_csv('../datasets/books_cleaned.csv')
# Count how many books carry each raw category label (value_counts sorts most frequent first)
# and turn the result into a two-column table for display.
books["categories"].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Aged women,1
475,Imperialism,1
476,Human-animal relationships,1
477,Amish,1


In [4]:
# Show only the raw categories that label at least 50 books.
(
    books["categories"]
    .value_counts()
    .reset_index()
    .loc[lambda counts: counts["count"] >= 50]
)

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Religion,117
6,Philosophy,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [5]:
# On closer inspection, the frequent raw categories each fall under either fiction or
# nonfiction, so we map them onto a new two-value column called "simple_categories".
# Raw categories that are not listed here have no entry in the mapping and come out as
# missing values in the new column.
category_mapping = {
    raw_category: simple_category
    for simple_category, raw_categories in (
        ("Fiction", (
            'Fiction',
            'Juvenile Fiction',
            'Comics & Graphic Novels',
            'Drama',
            'Poetry',
            'Literary Collections',
        )),
        ("Nonfiction", (
            'Biography & Autobiography',
            'History',
            'Literary Criticism',
            'Religion',
            'Philosophy',
            'Juvenile Nonfiction',
            'Science',
        )),
    )
    for raw_category in raw_categories
}
books["simple_categories"] = books["categories"].map(category_mapping)

In [6]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,199,Gilead,0002005883: A NOVEL THAT READERS and critics h...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,205,Spider's Web: A Novel,0002261987: A new 'Christie for Christmas' -- ...,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,57,Rage of angels,"0006178731: A memorable, mesmerizing heroine J...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,45,The Four Loves,0006280897: Lewis' work on the nature of love ...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,75,The Problem of Pain,"0006280935: ""In The Problem of Pain, C.S. Lewi...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,288,Mistaken Identity,8172235224: On A Train Journey Home To North I...,
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,63,Journey to the East,8173031010: This book tells the tale of a man ...,
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,117,The Monk Who Sold His Ferrari: A Fable About F...,817992162X: Wisdom to Create a Life of Passion...,
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,8185300534: This collection of the timeless te...,Nonfiction


In [7]:
# Rows whose simple category is already known (i.e. not missing).
# We have about 3,793 such entries, which is a pretty strong set of labelled examples
# to check the LLM-based classification against.
books[books["simple_categories"].notna()]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,199,Gilead,0002005883: A NOVEL THAT READERS and critics h...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,57,Rage of angels,"0006178731: A memorable, mesmerizing heroine J...",Fiction
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,136,Warhost of Vastmark,0006482074: Tricked once more by his wily half...,Fiction
30,9780006646006,000664600X,Ocean Star Express,Mark Haddon;Peter Sutton,Juvenile Fiction,http://books.google.com/books/content?id=I2QZA...,Joe and his parents are enjoying a summer holi...,2002.0,3.50,32.0,1.0,129,Ocean Star Express,000664600X: Joe and his parents are enjoying a...,Fiction
31,9780007105045,0007105045,Tree and Leaf,John Ronald Reuel Tolkien,Literary Collections,http://books.google.com/books/content?id=aPb_A...,"""The two works 'On fairy-stories' and 'Leaf by...",2001.0,4.09,176.0,2245.0,38,Tree and Leaf: The Homecoming of Beorhtnoth : ...,"0007105045: ""The two works 'On fairy-stories' ...",Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,9781933648279,1933648279,Night Has a Thousand Eyes,Cornell Woolrich,Fiction,http://books.google.com/books/content?id=3Gk6s...,"""Cornell Woolrich's novels define the essence ...",2007.0,3.77,344.0,680.0,100,Night Has a Thousand Eyes,"1933648279: ""Cornell Woolrich's novels define ...",Fiction
5188,9784770028969,4770028962,Coin Locker Babies,村上龍,Fiction,http://books.google.com/books/content?id=87DJw...,Rescued from the lockers in which they were le...,2002.0,3.75,393.0,5560.0,41,Coin Locker Babies,4770028962: Rescued from the lockers in which ...,Fiction
5189,9788122200850,8122200850,"Cry, the Peacock",Anita Desai,Fiction,http://books.google.com/books/content?id=_QKwV...,This book is the story of a young girl obsesse...,1980.0,3.22,218.0,134.0,33,"Cry, the Peacock",8122200850: This book is the story of a young ...,Fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,8185300534: This collection of the timeless te...,Nonfiction


In [2]:
from transformers import pipeline

import torch

# Candidate labels the zero-shot classifier chooses between.
classification_categories = ["Fiction", "Nonfiction"]

# Pipeline device convention: 0 is the first CUDA GPU, -1 is the CPU.
# Use the GPU only when PyTorch can actually see one, so the notebook also runs on
# CPU-only machines instead of depending on a hard-coded GPU index.
pipe = pipeline("zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1)

Device set to use cpu


In [9]:
# device=0 means "run the model on the first available GPU"; -1 means "use the CPU only".
# To check whether a GPU is visible to PyTorch, run:
import torch
print(torch.cuda.is_available())  # True if a CUDA GPU is available
print(torch.cuda.device_count())  # number of visible GPUs
if torch.cuda.is_available():
    # Only ask for the name when a GPU exists; the call raises on CPU-only machines.
    print(torch.cuda.get_device_name(0))  # name of the first GPU
# Also, PyTorch has to be installed with CUDA support, e.g.:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# In PyCharm, the Hugging Face tool window on the left side lists the Hugging Face models
# that have been downloaded and cached.

True
1
NVIDIA GeForce GTX 1650


In [15]:
# Let us try the classifier on entries whose category we already know:
# display the description of the first book labelled Fiction.
(
    books.loc[books['simple_categories'].eq("Fiction"), "description"]
    .reset_index(drop=True)[0]
)

'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world ha

In [3]:
# Description of the first book labelled Fiction, kept as the worked example input.
sequence_to_classify = (
    books.loc[books["simple_categories"].eq("Fiction"), "description"]
    .reset_index(drop=True)[0]
)

KeyError: 'simple_categories'

In [4]:
# Run the zero-shot pipeline on one description, passed here as a string literal.
# NOTE(review): the literal looks like the first Fiction description; the variable form of
# this call would be pipe(sequence_to_classify, classification_categories) - confirm.
# The 'scores' in the result are read as the probability assigned to each label.
pipe("A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst the world has to offer. At its heart is a tale of the sacred bonds between fathers and sons, pitch-perfect in style and story, set to dazzle critics and readers alike.", classification_categories)

{'sequence': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gilead is a song of celebration and acceptance of the best and the worst

In [18]:
import numpy as np

# Run the (expensive) model once and reuse its result, rather than invoking the pipeline
# separately for the scores and for the labels.
example_prediction = pipe(sequence_to_classify, classification_categories)
index = np.argmax(example_prediction['scores'])  # position of the highest score
label = example_prediction['labels'][index]      # label found at that position
label

'Fiction'

In [5]:
def classify_book(sequence, categories):
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text to classify (in this notebook, a book description).
        categories: Candidate labels handed to the pipeline.

    Returns:
        The entry of the result's "labels" list located at the position of the
        largest value in its "scores" list.
    """
    result = pipe(sequence, categories)
    scores, labels = result["scores"], result["labels"]
    return labels[np.argmax(scores)]

In [45]:
from tqdm import tqdm
actual_cat = []
predicted_cat = []

# Evaluate the classifier on the first 300 books already labelled Fiction.
# The filtered descriptions are selected once up front; rebuilding the filter inside the
# loop would rescan the whole DataFrame on every iteration. head(300) also copes with
# fewer than 300 matching rows instead of failing part-way through.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].head(300)

for sequence in tqdm(fiction_descriptions):
    actual_cat.append('Fiction')
    predicted_cat.append(classify_book(sequence, classification_categories))

100%|██████████| 300/300 [00:48<00:00,  6.20it/s]


In [48]:
# Same evaluation for the first 300 books already labelled Nonfiction. Results are appended
# to the actual_cat / predicted_cat lists created in the Fiction evaluation cell, so run
# that cell first (re-running only this cell appends the Nonfiction rows again).
# The filtered descriptions are selected once, outside the loop.
nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].head(300)

for sequence in tqdm(nonfiction_descriptions):
    actual_cat.append('Nonfiction')
    predicted_cat.append(classify_book(sequence, classification_categories))

100%|██████████| 300/300 [00:50<00:00,  5.89it/s]


In [51]:
# Side-by-side table of the known label and the predicted label for each evaluated book.
predictions_table = pd.DataFrame(
    data={
        'actual_cat': actual_cat,
        'predicted_cat': predicted_cat,
    },
)
predictions_table

Unnamed: 0,actual_cat,predicted_cat
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Fiction
3,Fiction,Fiction
4,Fiction,Nonfiction
...,...,...
595,Nonfiction,Nonfiction
596,Nonfiction,Nonfiction
597,Nonfiction,Nonfiction
598,Nonfiction,Nonfiction


In [53]:
# flag is 1 where the prediction matches the known label and 0 where it does not.
predictions_table['flag'] = (
    predictions_table['actual_cat']
    .eq(predictions_table['predicted_cat'])
    .astype(int)
)
predictions_table

Unnamed: 0,actual_cat,predicted_cat,flag
0,Fiction,Fiction,1
1,Fiction,Fiction,1
2,Fiction,Fiction,1
3,Fiction,Fiction,1
4,Fiction,Nonfiction,0
...,...,...,...
595,Nonfiction,Nonfiction,1
596,Nonfiction,Nonfiction,1
597,Nonfiction,Nonfiction,1
598,Nonfiction,Nonfiction,1


In [55]:
# Share of evaluated books whose predicted label matched the known label.
correct_count = predictions_table['flag'].sum()
correctness = correct_count / predictions_table.shape[0]
correctness

np.float64(0.7433333333333333)

In [56]:
#now let us predict all the categories that are missing
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,199,Gilead,0002005883: A NOVEL THAT READERS and critics h...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,205,Spider's Web: A Novel,0002261987: A new 'Christie for Christmas' -- ...,
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,57,Rage of angels,"0006178731: A memorable, mesmerizing heroine J...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,45,The Four Loves,0006280897: Lewis' work on the nature of love ...,
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,75,The Problem of Pain,"0006280935: ""In The Problem of Pain, C.S. Lewi...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,288,Mistaken Identity,8172235224: On A Train Journey Home To North I...,
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,63,Journey to the East,8173031010: This book tells the tale of a man ...,
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,117,The Monk Who Sold His Ferrari: A Fable About F...,817992162X: Wisdom to Create a Life of Passion...,
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,8185300534: This collection of the timeless te...,Nonfiction


In [76]:
# ISBN and description of every book whose simple category is still missing.
missing_categories = (
    books.loc[books["simple_categories"].isnull(), ["isbn10", "description"]]
    .reset_index(drop=True)
)
# Accumulators filled by the prediction loop in the next cell.
isbns, predictions = [], []
missing_categories

Unnamed: 0,isbn10,description
0,0002261987,A new 'Christie for Christmas' -- a full-lengt...
1,0006280897,Lewis' work on the nature of love divides love...
2,0006280935,"""In The Problem of Pain, C.S. Lewis, one of th..."
3,0006380832,Until Vasco da Gama discovered the sea-route t...
4,000647022X,A new-cover reissue of the fourth book in the ...
...,...,...
1399,8125026606,Not only does Nietzsche for Beginners delve in...
1400,8171565646,"Forster's lively, informed originality and wit..."
1401,8172235224,On A Train Journey Home To North India After L...
1402,8173031010,This book tells the tale of a man who goes on ...


In [77]:
# Predict a simple category for every book that is still missing one.
# The accumulators are reset here so that re-running this cell never appends a second
# copy of the predictions to lists left over from an earlier run.
isbns = []
predictions = []
for isbn, sequence in tqdm(
    zip(missing_categories["isbn10"], missing_categories["description"]),
    total=len(missing_categories),  # zip has no length, so give tqdm the total explicitly
):
    predictions.append(classify_book(sequence, classification_categories))
    isbns.append(isbn)


100%|██████████| 1404/1404 [03:42<00:00,  6.31it/s]


In [83]:
# Pair each ISBN with the simple category predicted for it.
missing_categories_df = pd.DataFrame(
    data={
        'isbn10': isbns,
        'predictions': predictions,
    },
)

In [84]:
missing_categories_df

Unnamed: 0,isbn10,predictions
0,0002261987,Fiction
1,0006280897,Nonfiction
2,0006280935,Nonfiction
3,0006380832,Nonfiction
4,000647022X,Fiction
...,...,...
1399,8125026606,Nonfiction
1400,8171565646,Fiction
1401,8172235224,Fiction
1402,8173031010,Nonfiction


In [85]:
# Fill only the still-missing simple categories with the model's predictions, looked up by ISBN.
# Mapping through a lookup Series keeps exactly one row per book (a left merge would
# multiply book rows if an isbn10 ever appeared more than once in the predictions) and
# avoids adding and then dropping a temporary "predictions" column.
predicted_by_isbn = (
    missing_categories_df
    .drop_duplicates(subset="isbn10")  # Series.map needs a uniquely indexed lookup
    .set_index("isbn10")["predictions"]
)
books["simple_categories"] = books["simple_categories"].fillna(
    books["isbn10"].map(predicted_by_isbn)
)

In [86]:
books

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,words_in_description,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,199,Gilead,0002005883: A NOVEL THAT READERS and critics h...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,205,Spider's Web: A Novel,0002261987: A new 'Christie for Christmas' -- ...,Fiction
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,57,Rage of angels,"0006178731: A memorable, mesmerizing heroine J...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,45,The Four Loves,0006280897: Lewis' work on the nature of love ...,Nonfiction
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,75,The Problem of Pain,"0006280935: ""In The Problem of Pain, C.S. Lewi...",Nonfiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,288,Mistaken Identity,8172235224: On A Train Journey Home To North I...,Fiction
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,63,Journey to the East,8173031010: This book tells the tale of a man ...,Nonfiction
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,117,The Monk Who Sold His Ferrari: A Fable About F...,817992162X: Wisdom to Create a Life of Passion...,Fiction
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,174,I Am that: Talks with Sri Nisargadatta Maharaj,8185300534: This collection of the timeless te...,Nonfiction


In [87]:
# Save the table, including the simple_categories column, to CSV; index=False leaves the
# row index out of the file.
# NOTE(review): this writes to the current working directory, whereas the input was read
# from ../datasets/ - confirm that is the intended location.
books.to_csv("books_with_categories.csv", index=False)