In [1]:
%pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━

In [11]:
import sklearn
import pandas as pd
import numpy as np
import random

seed = 42

np.random.seed(seed)
random.seed(seed)

In [9]:
from datasets import load_dataset
sem_eval_2018_task_1 = load_dataset('sem_eval_2018_task_1', 'subtask5.english', trust_remote_code=True)

In [None]:
print(sem_eval_2018_task_1['train'][0])
print(sem_eval_2018_task_1['validation'][0])
print(sem_eval_2018_task_1['test'][0])

{'ID': '2017-En-21441', 'Tweet': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'anger': False, 'anticipation': True, 'disgust': False, 'fear': False, 'joy': False, 'love': False, 'optimism': True, 'pessimism': False, 'sadness': False, 'surprise': False, 'trust': True}
{'ID': '2018-En-00866', 'Tweet': '@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.', 'anger': True, 'anticipation': False, 'disgust': True, 'fear': False, 'joy': False, 'love': False, 'optimism': False, 'pessimism': False, 'sadness': False, 'surprise': False, 'trust': False}
{'ID': '2018-En-01559', 'Tweet': '@Adnan__786__ @AsYouNotWish Dont worry Indian army is on its ways to dispatch all Terrorists to Hell', 'anger': True, 'anticipation': True, 'disgust': False, 'fear': False, 'joy': False, 'love': False, 'optimism': True, 'pessimism': False, 'sadness': False, 'surprise': False, 'trust': True}


In [None]:
# Example selection for the GPT-4o-mini model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_single_emotion_tweets(dataset, emotions, min_tokens=5):
    """
    This function takes a multi-label dataset and returns a dictionary with a tweet
    for each unique single emotion present in the dataset using cosine similarity,
    along with the average similarity scores.

    Parameters:
    dataset (Dataset): The loaded dataset.
    emotions (list): List of possible emotions.
    min_tokens (int): Minimum number of tokens a tweet must have to be considered.

    Returns:
    dict: A dictionary where keys are emotions and values are tuples of (tweet, average similarity score).
    """
    # Initialize dictionary to store tweets for each single emotion
    emotion_tweets = {emotion: [] for emotion in emotions}

    # Iterate through the dataset
    for example in dataset['train']:
        tweet = example['Tweet']

        # Check if the tweet meets the minimum length requirement
        if len(tweet.split()) < min_tokens:
            continue

        # Create a list of emotions that are labeled as 1 for the current tweet
        example_emotions = [emotion for emotion in emotions if example[emotion] == 1]

        # Check if the tweet has only one emotion label
        if len(example_emotions) == 1:
            emotion = example_emotions[0]
            emotion_tweets[emotion].append(tweet)

    # Remove emotions that do not have any single labeled tweets
    emotion_tweets = {k: v for k, v in emotion_tweets.items() if v}

    # Vectorize the tweets using TF-IDF
    all_tweets = [tweet for tweets in emotion_tweets.values() for tweet in tweets]
    vectorizer = TfidfVectorizer(stop_words='english')
    tweet_vectors = vectorizer.fit_transform(all_tweets)

    # Initialize dictionary to store the best example for each emotion and their average similarity scores
    best_examples = {}

    # Calculate cosine similarity and select the best example for each emotion
    for emotion, tweets in emotion_tweets.items():
        if len(tweets) > 1:  # Ensure there are multiple tweets to compare
            indices = [all_tweets.index(tweet) for tweet in tweets]
            vectors = tweet_vectors[indices]

            # Calculate cosine similarity matrix
            cosine_sim = cosine_similarity(vectors)

            # Average similarity score for each tweet
            avg_sim_scores = cosine_sim.mean(axis=1)

            # Get the index of the tweet with the highest average similarity
            best_idx = np.argmax(avg_sim_scores)
            best_tweet = tweets[best_idx]
            best_score = avg_sim_scores[best_idx]

            best_examples[emotion] = (best_tweet, best_score)
        else:
            # If there's only one tweet for this emotion, take that as the best example with a similarity score of 1
            best_examples[emotion] = (tweets[0], 1.0)

    return best_examples

# Define the list of emotions
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

# Get the best examples for each emotion
best_examples = get_single_emotion_tweets(sem_eval_2018_task_1, emotions, min_tokens=5)

# Output the best examples with their cosine similarity scores
for emotion, (tweet, score) in best_examples.items():
    print(f"Best example for {emotion}: {tweet} (Average Cosine Similarity: {score:.4f})")

Best example for anger: I need some to help with my anger (Average Cosine Similarity: 0.0294)
Best example for anticipation: Let's get drunk and tell each other things we're afraid to say when we're sober. (Average Cosine Similarity: 0.0290)
Best example for disgust: I don't like pineapple I only eat them on pizza, they lose the sting when they get cooked. (Average Cosine Similarity: 0.0327)
Best example for fear: I'm so over having anxiety (Average Cosine Similarity: 0.0381)
Best example for joy: Watch this amazing live.ly broadcast by @izzybuzy365 #lively #musically (Average Cosine Similarity: 0.0466)
Best example for love: @lizbon @anomalily @gerikkransky Sorry for the levity if at all inappropriate but this combined with my love of Charlotte reminds me rn... (Average Cosine Similarity: 0.3496)
Best example for optimism: The point of living, and being an optimist, is to be foolish enough to believe the best is yet to come' - Peter Ustinov #optimism #quote (Average Cosine Similarity:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

# Load the dataset
sem_eval_2018_task_1 = load_dataset('sem_eval_2018_task_1', 'subtask5.english', trust_remote_code=True)

def get_best_emotion_tweet(dataset, emotion):
    """
    This function takes a multi-label dataset and returns the best tweet for a specific emotion
    using cosine similarity, along with the average similarity score.

    Parameters:
    dataset (Dataset): The loaded dataset.
    emotion (str): The emotion to find the best tweet for.

    Returns:
    tuple: A tuple containing the best tweet and its average similarity score.
    """
    # Initialize list to store tweets for the specified emotion
    emotion_tweets = []

    # Iterate through the dataset
    for example in dataset['train']:
        tweet = example['Tweet']

        # Add tweet to the list if it is labeled with the specified emotion
        if example[emotion] == 1:
            emotion_tweets.append(tweet)

    if not emotion_tweets:
        return None, None

    # Vectorize the tweets using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tweet_vectors = vectorizer.fit_transform(emotion_tweets)

    # Calculate cosine similarity matrix
    cosine_sim = cosine_similarity(tweet_vectors)

    # Average similarity score for each tweet
    avg_sim_scores = cosine_sim.mean(axis=1)

    # Get the index of the tweet with the highest average similarity
    best_idx = np.argmax(avg_sim_scores)
    best_tweet = emotion_tweets[best_idx]
    best_score = avg_sim_scores[best_idx]

    return best_tweet, best_score

# Specify the emotion
emotion = 'trust'

# Get the best example for the specified emotion
best_tweet, best_score = get_best_emotion_tweet(sem_eval_2018_task_1, emotion)

# Output the best example with its cosine similarity score
if best_tweet:
    print(f"Best example for {emotion}: {best_tweet} (Average Cosine Similarity: {best_score:.4f})")
else:
    print(f"No tweets found for emotion: {emotion}")

Best example for trust: @ThatBritishDude no okay you don't not need to be going out of your way to make sure you are pleasing everyone do what makes you happy (Average Cosine Similarity: 0.0216)


In [None]:
# Average length of a tweet, longest tweet, shortest tweet (based on the number of tokens)

def analyze_tweet_tokens(dataset):
    """
    This function takes a dataset and returns the average number of tokens per tweet,
    the longest tweet based on token count, and the shortest tweet based on token count.

    Parameters:
    dataset (Dataset): The loaded dataset.

    Returns:
    dict: A dictionary with the average number of tokens, longest tweet, and shortest tweet.
    """
    # Initialize variables to store the longest and shortest tweets
    longest_tweet = ""
    shortest_tweet = None
    longest_tokens = 0
    shortest_tokens = float('inf')
    total_tokens = 0
    tweet_count = 0

    # Simple tokenization function
    def simple_tokenize(text):
        return text.split()

    # Iterate through the dataset to find tweet token counts
    for example in dataset['train']:
        tweet = example['Tweet']
        tokens = simple_tokenize(tweet)
        num_tokens = len(tokens)

        # Update total tokens and count for average calculation
        total_tokens += num_tokens
        tweet_count += 1

        # Check for longest tweet based on token count
        if num_tokens > longest_tokens:
            longest_tokens = num_tokens
            longest_tweet = tweet

        # Check for shortest tweet based on token count
        if num_tokens < shortest_tokens:
            shortest_tokens = num_tokens
            shortest_tweet = tweet

    # Calculate average number of tokens
    average_tokens = total_tokens / tweet_count if tweet_count > 0 else 0

    return {
        "average_tokens": average_tokens,
        "longest_tweet": longest_tweet,
        "longest_tweet_tokens": longest_tokens,
        "shortest_tweet": shortest_tweet,
        "shortest_tweet_tokens": shortest_tokens
    }

# Analyze tweet tokens
tweet_token_analysis = analyze_tweet_tokens(sem_eval_2018_task_1)

# Print the results
print(f"Average number of tokens per tweet: {tweet_token_analysis['average_tokens']:.2f}")
print(f"Longest tweet: {tweet_token_analysis['longest_tweet']} ({tweet_token_analysis['longest_tweet_tokens']} tokens)")
print(f"Shortest tweet: {tweet_token_analysis['shortest_tweet']} ({tweet_token_analysis['shortest_tweet_tokens']} tokens)")

Average number of tokens per tweet: 16.06
Longest tweet: I feel like a burden every day that I waste but I don't know how to get out of this bc I get so discouraged all I wanna do is lay around 🙃 (33 tokens)
Shortest tweet: testing  (1 tokens)


In [None]:
# Function to count emotions in a dataset split
def count_emotions(dataset_split):
    emotion_counts = {}

    # Iterate through examples in the dataset split
    for example in dataset_split:
        # Iterate through emotions in the example
        for emotion, value in example.items():
            # Check if the value is True (indicating the presence of the emotion)
            if value:
                # Increment the count for the emotion
                emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

    return emotion_counts

# Count emotions in the train split
train_emotion_counts = count_emotions(sem_eval_2018_task_1['train'])
print("Train Split Emotion Counts:")
print(train_emotion_counts)

# Count emotions in the validation split
val_emotion_counts = count_emotions(sem_eval_2018_task_1['validation'])
print("\nValidation Split Emotion Counts:")
print(val_emotion_counts)

# Count emotions in the test split
test_emotion_counts = count_emotions(sem_eval_2018_task_1['test'])
print("\nTest Split Emotion Counts:")
print(test_emotion_counts)

Train Split Emotion Counts:
{'ID': 6838, 'Tweet': 6838, 'anticipation': 978, 'optimism': 1984, 'trust': 357, 'joy': 2477, 'love': 700, 'anger': 2544, 'disgust': 2602, 'pessimism': 795, 'sadness': 2008, 'fear': 1242, 'surprise': 361}

Validation Split Emotion Counts:
{'ID': 886, 'Tweet': 886, 'anger': 315, 'disgust': 319, 'joy': 400, 'love': 132, 'optimism': 307, 'fear': 121, 'pessimism': 100, 'sadness': 265, 'surprise': 35, 'anticipation': 124, 'trust': 43}

Test Split Emotion Counts:
{'ID': 3259, 'Tweet': 3259, 'anger': 1101, 'anticipation': 425, 'optimism': 1143, 'trust': 153, 'disgust': 1099, 'sadness': 960, 'joy': 1442, 'fear': 485, 'pessimism': 375, 'love': 516, 'surprise': 170}


In [None]:
# Function to count occurrences of multiple emotions in a dataset split
def count_multiple_emotions(dataset_split):
    multiple_emotion_count = 0

    # Iterate through examples in the dataset split
    for example in dataset_split:
        # Count the number of True values in the example
        num_true_values = sum(1 for value in example.values() if value == True)

        # If there are more than 1 True values, increment the count
        if num_true_values > 1:
            multiple_emotion_count += 1

    return multiple_emotion_count

# Count occurrences of multiple emotions in the train split
train_multiple_emotion_count = count_multiple_emotions(sem_eval_2018_task_1['train'])
print("Number of cases with multiple emotions in train split:", train_multiple_emotion_count)

# Count occurrences of multiple emotions in the validation split
val_multiple_emotion_count = count_multiple_emotions(sem_eval_2018_task_1['validation'])
print("Number of cases with multiple emotions in validation split:", val_multiple_emotion_count)

# Count occurrences of multiple emotions in the test split
test_multiple_emotion_count = count_multiple_emotions(sem_eval_2018_task_1['test'])
print("Number of cases with multiple emotions in test split:", test_multiple_emotion_count)

Number of cases with multiple emotions in train split: 5652
Number of cases with multiple emotions in validation split: 755
Number of cases with multiple emotions in test split: 2802


In [None]:
# Function to find the maximum number of emotions in a single example in a dataset split
def max_emotions_in_example(dataset_split):
    max_emotion_count = 0
    max_emotion_example = None

    # Iterate through examples in the dataset split
    for example in dataset_split:
        # Count the number of True values in the example
        num_true_values = sum(1 for value in example.values() if value == True)

        # Update max_emotion_count if num_true_values is greater
        if num_true_values > max_emotion_count:
            max_emotion_count = num_true_values
            max_emotion_example = example

    return max_emotion_count, max_emotion_example

# Find the maximum number of emotions in a single example for the train split
train_max_emotion_count, train_max_emotion_example = max_emotions_in_example(sem_eval_2018_task_1['train'])
print("Maximum number of emotions in a single example in the train split:", train_max_emotion_count)
print("Example with maximum number of emotions in train split:")
print(train_max_emotion_example)

# Find the maximum number of emotions in a single example for the validation split
val_max_emotion_count, val_max_emotion_example = max_emotions_in_example(sem_eval_2018_task_1['validation'])
print("\nMaximum number of emotions in a single example in the validation split:", val_max_emotion_count)
print("Example with maximum number of emotions in validation split:")
print(val_max_emotion_example)

# Find the maximum number of emotions in a single example for the test split
test_max_emotion_count, test_max_emotion_example = max_emotions_in_example(sem_eval_2018_task_1['test'])
print("\nMaximum number of emotions in a single example in the test split:", test_max_emotion_count)
print("Example with maximum number of emotions in test split:")
print(test_max_emotion_example)

Maximum number of emotions in a single example in the train split: 6
Example with maximum number of emotions in train split:
{'ID': '2017-En-22055', 'Tweet': "I had really strange and awful dreams last night. I'd didn't even eat cheese before bed  #lovemysleep", 'anger': True, 'anticipation': False, 'disgust': True, 'fear': True, 'joy': True, 'love': True, 'optimism': False, 'pessimism': False, 'sadness': True, 'surprise': False, 'trust': False}

Maximum number of emotions in a single example in the validation split: 6
Example with maximum number of emotions in validation split:
{'ID': '2018-En-01944', 'Tweet': '@_elliegillxx @0liviarobertson Poor Lyn 😂😂😂 livv I hope she forgets yours this year in revenge', 'anger': True, 'anticipation': False, 'disgust': True, 'fear': False, 'joy': True, 'love': False, 'optimism': True, 'pessimism': True, 'sadness': True, 'surprise': False, 'trust': False}

Maximum number of emotions in a single example in the test split: 6
Example with maximum number

Preparing data for training and evaluation.
The TRAIN and VALIDATION data is combined and transformed with TfidfVectorizer + Scaler.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

# Converting the DatasetDict into a DataFrame
df_train = pd.DataFrame(sem_eval_2018_task_1['train'])
df_validation = pd.DataFrame(sem_eval_2018_task_1['validation'])
df = pd.concat([df_train, df_validation])

# Extract texts and labels for the train set
texts_train = df['Tweet']
labels_train = df.drop(columns=['Tweet', 'ID'])

# Extract texts and labels for the test set
df_test = pd.DataFrame(sem_eval_2018_task_1['test'])
texts_test = df_test['Tweet']
labels_test = df_test.drop(columns=['Tweet', 'ID'])

# Transform the text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Apply Standard Scaling
scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

OneVsRestClassifier + SVC
Seed

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix

# Define model and parameter grid
model = OneVsRestClassifier(SVC(random_state=42))
parameters = [
    {
        "estimator__C": [1, 2, 4],
        "estimator__kernel": ["poly", "rbf"],
        "estimator__degree": [1, 2],
    },
]

# Perform grid search with cross-validation
clf = GridSearchCV(model, parameters, scoring='accuracy', n_jobs=-1)
clf.fit(X_train, labels_train)

# Print best parameters and best score from grid search
print(clf.best_params_, clf.best_score_)

# Predict on test data using the best estimator
best_model = clf.best_estimator_
labels_pred = best_model.predict(X_test)

{'estimator__C': 1, 'estimator__degree': 1, 'estimator__kernel': 'poly'} 0.18202701343125913


In [None]:
from sklearn.metrics import f1_score, accuracy_score, multilabel_confusion_matrix

# Calculate test accuracy and F1 scores
accuracy_test = accuracy_score(labels_test, labels_pred) * 100
f1_macro_test = f1_score(labels_test, labels_pred, average='macro') * 100
f1_micro_test = f1_score(labels_test, labels_pred, average='micro') * 100

# Calculate confusion matrix
confusion_matrix_test = multilabel_confusion_matrix(labels_test, labels_pred)

# Print the results
print(f"Test Accuracy: {accuracy_test:.2f}")
print(f"Test F1-macro: {f1_macro_test:.2f}")
print(f"Test F1-micro: {f1_micro_test:.2f}")
print("Confusion Matrix:")
label_columns = ['anticipation', 'optimism', 'trust', 'joy', 'love', 'anger', 'disgust', 'pessimism', 'sadness', 'fear', 'surprise']
for label, matrix in zip(label_columns, confusion_matrix_test):
    print(f"Label: {label}")
    print(matrix)
    print()

Test Accuracy: 18.04
Test F1-macro: 37.94
Test F1-micro: 55.20
Confusion Matrix:
Label: anticipation
[[1928  230]
 [ 466  635]]

Label: optimism
[[2827    7]
 [ 420    5]]

Label: trust
[[1853  307]
 [ 525  574]]

Label: joy
[[2740   34]
 [ 266  219]]

Label: love
[[1638  179]
 [ 490  952]]

Label: anger
[[2702   41]
 [ 376  140]]

Label: disgust
[[1884  232]
 [ 611  532]]

Label: pessimism
[[2881    3]
 [ 366    9]]

Label: sadness
[[2205   94]
 [ 603  357]]

Label: fear
[[3088    1]
 [ 163    7]]

Label: surprise
[[3106    0]
 [ 153    0]]

