In [17]:
import pandas as pd

books = pd.read_csv('books_with_categories.csv')

In [18]:
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k = None)
classifier("I love this!")

Device set to use cpu


[[{'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'surprise', 'score': 0.008528684265911579},
  {'label': 'neutral', 'score': 0.005764586851000786},
  {'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'disgust', 'score': 0.0016119900392368436},
  {'label': 'fear', 'score': 0.0004138521908316761}]]

In [22]:
import numpy as np

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x["label"])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]["score"])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [23]:
for i in range(10):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

In [24]:
emotion_scores

{'anger': [np.float64(0.06413355469703674),
  np.float64(0.6126194000244141),
  np.float64(0.06413355469703674),
  np.float64(0.35148441791534424),
  np.float64(0.08141238987445831),
  np.float64(0.23222465813159943),
  np.float64(0.5381842255592346),
  np.float64(0.06413355469703674),
  np.float64(0.3006700873374939),
  np.float64(0.06413355469703674)],
 'disgust': [np.float64(0.27359139919281006),
  np.float64(0.3482847809791565),
  np.float64(0.10400658845901489),
  np.float64(0.1507224589586258),
  np.float64(0.18449527025222778),
  np.float64(0.7271749377250671),
  np.float64(0.155854731798172),
  np.float64(0.10400658845901489),
  np.float64(0.279481440782547),
  np.float64(0.17792704701423645)],
 'fear': [np.float64(0.9281681180000305),
  np.float64(0.9425276517868042),
  np.float64(0.9723208546638489),
  np.float64(0.36070606112480164),
  np.float64(0.09504339843988419),
  np.float64(0.05136274918913841),
  np.float64(0.7474274635314941),
  np.float64(0.40449756383895874),
  np

In [25]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
isbn = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(books))):
    isbn.append(books["isbn13"][i])
    sentences = books["description"][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 5230/5230 [37:16<00:00,  2.34it/s]  


In [27]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["isbn13"] = isbn

In [30]:
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,isbn13
0,0.064134,0.273591,0.928168,0.932798,0.646216,0.967158,0.729602,9.780000e+12
1,0.612619,0.348285,0.942528,0.704422,0.887940,0.111690,0.252546,9.780000e+12
2,0.064134,0.104007,0.972321,0.767238,0.549477,0.111690,0.078765,9.780010e+12
3,0.351484,0.150722,0.360706,0.251881,0.732685,0.111690,0.078765,9.780010e+12
4,0.081412,0.184495,0.095043,0.040564,0.884390,0.475880,0.078765,9.780010e+12
...,...,...,...,...,...,...,...,...
5225,0.148208,0.030643,0.919165,0.255172,0.853721,0.980877,0.030656,9.788170e+12
5226,0.064134,0.114383,0.051363,0.400263,0.883198,0.111690,0.227765,9.788170e+12
5227,0.009997,0.009929,0.339218,0.947779,0.375754,0.066685,0.057625,9.788180e+12
5228,0.064134,0.104007,0.459268,0.759456,0.951104,0.368111,0.078765,9.788190e+12
