In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from transformers import pipeline

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices and pandas
# copy/assignment warnings that later cells may raise — consider narrowing it.
warnings.filterwarnings("ignore")
# IPython magic: load the jupyter_black extension
# (presumably auto-formats executed cells with Black — confirm).
%load_ext jupyter_black

In [None]:
# Read the raw transcript CSV (path is relative to the notebook's directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/The-Office-Lines-V4.csv")
df.head(n=5)

In [None]:
# Show the column labels of the raw frame.
print(df.columns)
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so wrapping it in print() only appended a
# stray "None" line to the output.
df.info()

In [None]:
# Keep only the columns used by the analysis below.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells (df["emotions"], df["sarcasm"]) write to this frame directly and
# do not trigger pandas' SettingWithCopyWarning on a derived selection.
df = df[["season", "episode", "title", "scene", "speaker", "line"]].copy()
df.head()

In [None]:
# Print the text of the row labelled 0 as a sanity check of the "line" column.
print(df.loc[0, "line"])

In [None]:
# Number of lines per speaker; value_counts() returns them most-frequent first.
speaker_counts = (
    df["speaker"].value_counts().rename_axis("speaker").reset_index(name="line_count")
)

# Bar chart of the 15 speakers with the most lines, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(30, 20))
sns.barplot(
    data=speaker_counts.iloc[:15],
    x="speaker",
    y="line_count",
    palette="viridis",
    ax=ax,
)
# Rotate the speaker names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Top 15 Characters by Number of Lines")
ax.set_xlabel("Character")
ax.set_ylabel("Number of Lines")
fig.tight_layout()
plt.show()

In [None]:
pip install torch


In [None]:
from transformers import pipeline
import torch

# Text-classification pipeline used by get_top_emotions.
# top_k=None asks the pipeline to return a score for every label; it replaces
# the deprecated return_all_scores=True flag. For a single input string the
# call still returns a list containing one list of {"label", "score"} dicts.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

In [None]:
def get_top_emotions(text, top_k=2):
    """Return the labels of the highest-scoring emotions for ``text``.

    Args:
        text: The utterance to classify with the module-level
            ``emotion_classifier`` pipeline.
        top_k: How many labels to return, best first (default 2).

    Returns:
        A list of at most ``top_k`` label strings, ordered by descending score.
    """
    # truncation=True is forwarded to the tokenizer so that over-long text is
    # cut to the model's maximum sequence length instead of raising inside the
    # model; this mirrors the truncation already applied in detect_sarcasm.
    scores = emotion_classifier(text, truncation=True)[0]
    # Sort explicitly: the order of the returned scores is not relied upon.
    sorted_scores = sorted(scores, key=lambda x: x["score"], reverse=True)
    return [e["label"] for e in sorted_scores[:top_k]]

In [None]:
# Classify every line and store its top emotion labels
# (runs the model once per row, so this cell is slow).
df["emotions"] = df["line"].map(get_top_emotions)

In [None]:
# Preview the first ten rows, including the new "emotions" column. Leaving the
# frame as the cell's last expression uses the notebook's rich table rendering
# instead of the plain-text output that print() produces.
df.head(10)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import string

# Load the sarcasm model
# Model identifier passed to both from_pretrained calls below
# (presumably a Hugging Face Hub repository id — confirm).
MODEL_PATH = "helinivan/english-sarcasm-detector"
# The tokenizer and the sequence-classification model are loaded once at module
# level and reused by detect_sarcasm on every call.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)


def preprocess_data(text: str) -> str:
    """Normalise ``text`` for the sarcasm model.

    Lower-cases the input, deletes every character listed in
    ``string.punctuation`` and trims leading/trailing whitespace.
    """
    # Translation table mapping each ASCII punctuation code point to None,
    # which makes str.translate delete it.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = text.lower()
    return lowered.translate(drop_punctuation).strip()


def detect_sarcasm(text, threshold=0.5):
    """Label ``text`` as ``"sarcastic"`` or ``"not_sarcastic"``.

    The text is normalised with ``preprocess_data``, tokenised (truncated to
    256 tokens) and scored by the module-level ``model``.

    Args:
        text: The utterance to classify.
        threshold: Minimum probability of the sarcastic class required to
            return ``"sarcastic"``. Previously the threshold was compared with
            the winning class's probability, so values below 0.5 had no effect;
            it is now compared with the sarcastic-class probability directly.

    Returns:
        ``"sarcastic"`` if the sarcastic-class probability is at least
        ``threshold``, otherwise ``"not_sarcastic"``.
    """
    tokenized_text = tokenizer(
        [preprocess_data(text)],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**tokenized_text)
    # One input was passed, so take the first (only) row of class probabilities.
    probs = output.logits.softmax(dim=-1).tolist()[0]
    # Assumes class index 1 = sarcastic and 0 = not sarcastic, as the original
    # code did — confirm against model.config.id2label.
    sarcastic_prob = probs[1]
    return "sarcastic" if sarcastic_prob >= threshold else "not_sarcastic"

In [None]:
# Label every line as "sarcastic" / "not_sarcastic"
# (runs the model once per row, so this cell is slow).
df["sarcasm"] = df["line"].map(detect_sarcasm)

In [None]:
# Preview the first twenty rows with the new "sarcasm" column.
df.iloc[:20]

In [None]:
# Bar chart of how many lines received each sarcasm label. Bars follow the
# value_counts() order (most frequent first), so the colours are assigned by
# rank, not by label.
ax = df["sarcasm"].value_counts().plot(kind="bar", color=["blue", "orange"])
ax.set_title("Sarcasm Detection Results")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Predicted label")
ax.set_ylabel("Number of lines")
plt.tight_layout()
# Render the figure explicitly instead of ending the cell on a Text repr.
plt.show()