In [60]:
import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm import tqdm


In [61]:
# Load the book table produced by the categorisation step. Later cells read
# its "description" and "isbn13" columns.
books = pd.read_csv("../data/books_with_categories.csv")

In [62]:
# Emotion classifier. return_all_scores=True makes the pipeline return a score
# for every emotion label instead of only the top one.
# NOTE(review): newer transformers releases appear to deprecate
# `return_all_scores` in favour of `top_k=None`, which may order the results
# differently -- confirm against the installed version before switching.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", 
                      return_all_scores=True)

# Smoke test on a single sentence to inspect the output structure.
classifier("I love this!")

Device set to use mps:0


[[{'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'disgust', 'score': 0.001611992483958602},
  {'label': 'fear', 'score': 0.00041385198710486293},
  {'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'neutral', 'score': 0.00576459476724267},
  {'label': 'sadness', 'score': 0.002092391485348344},
  {'label': 'surprise', 'score': 0.00852868054062128}]]

In [63]:
# Classify the first book's description piece by piece: the text is split on
# "." and every resulting fragment is scored separately.
predictions = classifier(books['description'][0].split("."))

In [64]:
# ISBNs of the books that get scored, filled in by the scoring loop.
isbn = list()
# Label names, in the order the classifier reported them for the first fragment.
emotion_labels = [entry["label"] for entry in predictions[0]]

In [65]:
# Show the label order collected from the first prediction.
emotion_labels

['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

In [66]:
# One independent, initially empty score list per emotion label.
emotion_scores = dict((name, []) for name in emotion_labels)

In [67]:
def calculate_max_emotion_scores(prediction, labels=None):
    """Return the highest score each emotion reaches across several predictions.

    Parameters
    ----------
    prediction : list of list of dict
        One inner list per classified text. Every inner list holds
        ``{"label": ..., "score": ...}`` dicts, one per emotion.
    labels : sequence of str, optional
        Emotion labels to report, in the order the result should use.
        Defaults to the notebook-level ``emotion_labels``.

    Returns
    -------
    dict
        Maps each label to the maximum score found for it over all inner lists.

    Raises
    ------
    KeyError
        If an inner list has no entry for one of the requested labels.
    ValueError
        If ``prediction`` is empty (``np.max`` has nothing to reduce).
    """
    if labels is None:
        labels = emotion_labels
    per_emotion_scores = {label: [] for label in labels}
    # Iterate over the argument itself, not a notebook global, so the result
    # always describes the predictions that were actually passed in.
    for sentence_scores in prediction:
        # Look scores up by label rather than by sorted position, so the
        # pairing stays correct whatever order the labels arrive in.
        score_by_label = {entry["label"]: entry["score"] for entry in sentence_scores}
        for label in labels:
            per_emotion_scores[label].append(score_by_label[label])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}


In [68]:
# Start from empty accumulators on every run of this cell. Without the reset,
# re-running after an interruption would append to the partial results and
# leave duplicated rows.
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

# Walk both columns positionally so the loop does not depend on the frame
# having a default 0..n-1 index.
# NOTE(review): split(".") also yields empty fragments (e.g. after a trailing
# period) and those are scored too -- consider filtering them out.
for book_isbn, description in tqdm(zip(books["isbn13"], books["description"]), total=len(books)):
    isbn.append(book_isbn)
    sentences = description.split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

  0%|          | 3/5088 [00:00<17:18,  4.90it/s]


KeyboardInterrupt: 

In [None]:
# One row per scored book: the per-emotion maxima plus the ISBN as the last column.
emotions_df = pd.DataFrame(emotion_scores).assign(isbn13=isbn)

In [None]:
# Inspect the assembled emotion-score table.
emotions_df

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise,isbn13
0,0.064134,0.273590,0.928169,0.932798,0.646216,0.967158,0.729602,9780002005883
1,0.612619,0.348286,0.942528,0.704422,0.887940,0.111690,0.252544,9780002261982
2,0.064134,0.104007,0.972321,0.767237,0.549477,0.111690,0.078765,9780006178736
3,0.351483,0.150722,0.360707,0.251881,0.732687,0.111690,0.078765,9780006280897
4,0.081412,0.184495,0.095043,0.040564,0.884389,0.475881,0.078765,9780006280934
...,...,...,...,...,...,...,...,...
5083,0.148208,0.030643,0.919165,0.255170,0.853723,0.980877,0.030656,9788172235222
5084,0.064134,0.114383,0.051363,0.400263,0.883198,0.111690,0.227765,9788173031014
5085,0.009997,0.009929,0.339217,0.947779,0.375755,0.066685,0.057625,9788179921623
5086,0.064134,0.104007,0.459270,0.759455,0.951104,0.368110,0.078765,9788185300535


In [None]:
# Attach the emotion columns to the book table, matching rows on isbn13.
# Rebinds `books`, so this cell is meant to run once per fresh load.
books = books.merge(emotions_df, on="isbn13")

In [None]:
# Summary statistics for every numeric column (this includes isbn13).
emotions_df.describe()

Unnamed: 0,anger,disgust,fear,joy,neutral,sadness,surprise,isbn13
count,5088.0,5088.0,5088.0,5088.0,5088.0,5088.0,5088.0,5088.0
mean,0.166382,0.202044,0.311614,0.283588,0.762585,0.225002,0.176143,9780667000000.0
std,0.219698,0.213456,0.343393,0.3193,0.202771,0.2486,0.19016,597739600.0
min,0.000769,0.000821,0.000442,0.000556,0.000981,0.001251,0.000779,9780002000000.0
25%,0.064134,0.104007,0.051363,0.040564,0.549477,0.11169,0.078765,9780313000000.0
50%,0.064134,0.104007,0.097962,0.091727,0.840248,0.11169,0.078765,9780521000000.0
75%,0.141981,0.190982,0.592009,0.511456,0.936846,0.182212,0.204457,9780807000000.0
max,0.989582,0.989417,0.995326,0.992068,0.974344,0.989361,0.983455,9789028000000.0


In [78]:
# Persist the enriched book table; index=False keeps the row index out of the file.
books.to_csv("../data/books_with_emotions.csv", index=False)

In [79]:
# Preview the first rows of the enriched table.
books.head()

Unnamed: 0,isbn13,authors,categories,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description,simple_categories,anger,disgust,fear,joy,neutral,sadness,surprise
0,9780002005883,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,0.064134,0.27359,0.928169,0.932798,0.646216,0.967158,0.729602
1,9780002261982,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction,0.612619,0.348286,0.942528,0.704422,0.88794,0.11169,0.252544
2,9780006178736,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078765
3,9780006280897,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Philosophy,0.351483,0.150722,0.360707,0.251881,0.732687,0.11169,0.078765
4,9780006280934,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Religion,0.081412,0.184495,0.095043,0.040564,0.884389,0.475881,0.078765


In [2]:
import pandas as pd
# Original book table; the next cell takes its "thumbnail" column from here.
old_books = pd.read_csv("../data/books.csv")
# Table with emotion scores, as written by the export cell above.
new_books = pd.read_csv("../data/books_with_emotions.csv")

In [5]:
# Keep only the join key and the thumbnail column from the original table.
old_books2 = old_books[["isbn13", "thumbnail"]]
# Add the thumbnail to every enriched book, matching rows on isbn13.
new_books = new_books.merge(old_books2, on="isbn13")

In [7]:
# Overwrite the emotions file with the thumbnail-enriched table.
# index=False matches the earlier export of this file and stops the row index
# from being written as an extra unnamed column.
new_books.to_csv("../data/books_with_emotions.csv", index=False)