In [23]:
import pandas as pd
import random
import pandas as pd
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Beheerder\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Beheerder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# Fix utf-8 error
test_df = pd.read_csv("../datasets/group 2_url1.csv", encoding='ISO-8859-1')
train_df = pd.read_csv("../datasets/final_dataset.csv", encoding='ISO-8859-1')

# Keep only the necessary columns in test_df (Sentence, Emotion)
test_df = test_df[['Sentence', 'Emotion']]
# Keep only the necessary columns in train_df (Sentence, Core_Pipeline_Emotion)
train_df = train_df[['Sentence', 'Core_Pipeline_Emotion']]

# Remove rows with NaN values in the 'Emotion' column of test_df
test_df = test_df.dropna(subset=['Emotion'])

# Create mapping dictionary for emotions for test_df 
emotion_mapping = {
    'anger': ['disapproval', 'annoyance'],
    'fear': ['fear', 'nervousness'],
    'happiness': ['admiration', 'excitement', 'relief', 'amusement', 'optimism',
            'approval', 'gratitude', 'caring', 'joy', 'pride', 'desire', 'love'],
    'sadness': ['sadness', 'embarrassment', 'disappointment', 'remorse'],
    'surprise': ['curiosity', 'realization', 'confusion', 'surprise'],
    'neutral': ['nan', 'neutral'],
    'disgust': ['disgust']
}

# Apply the mapping to test_df
def map_emotion(row):
    for emotion, synonyms in emotion_mapping.items():
        if row['Emotion'] in synonyms:
            return emotion
test_df['Emotion'] = test_df.apply(map_emotion, axis=1)

In [25]:
print(test_df.head())
print(train_df.head())

# Print all the unique emotions in test_df
print("Unique emotions in test_df:", test_df['Emotion'].unique())
# Print all the unique emotions in train_df
print("Unique emotions in train_df:", train_df['Core_Pipeline_Emotion'].unique())

# Print the number of emotions for each unique emotions in test_df
print("Number of emotions in test_df:")
print(test_df['Emotion'].value_counts())
# Print the number of emotions for each unique emotions in train_df
print("Number of emotions in train_df:")
print(train_df['Core_Pipeline_Emotion'].value_counts())

                                            Sentence    Emotion
1               van jullie het eiland weer verlaten.    neutral
2  Maar zie het als een compliment, want eigenlij...  happiness
3  zien als de grootste bedreiging voor hun relatie.       fear
4                    OkÃ©, hier zijn ze, de koppels!  happiness
5  De koppels zien elkaar een laatste keer terug,...    sadness
                                            Sentence Core_Pipeline_Emotion
0        Heb je een vriend? Nee, ik heb geen vriend.               neutral
1  Wil je een single? Ja, ik ben al vier jaar sin...               neutral
2                            Oh. Maar vind je dat...               neutral
3  Bewust, niet bewust. Ik vind het goed dat het is.              surprise
4            Je gaat er ook niet naar opzoeken? Nee.               neutral
Unique emotions in test_df: ['neutral' 'happiness' 'fear' 'sadness' 'surprise' 'anger' 'disgust']
Unique emotions in train_df: ['neutral' 'surprise' 'sadness' 'happin

In [26]:
# Define underrepresented emotions with their desired augmentation counts
augmentation_targets = {
    'surprise': 3,
    'anger': 15,
    'fear': 6,
    'sadness': 3,
    'disgust': 8
}

# Function to replace words with Dutch synonyms
def augment_with_synonyms(sentence, max_replacements=2):
    words = sentence.split()
    augmented = words[:]
    indices = [i for i, word in enumerate(words) if wordnet.synsets(word, lang='nld')]

    if not indices:
        return sentence

    random.shuffle(indices)
    for i in indices[:min(max_replacements, len(indices))]:
        syns = wordnet.synsets(words[i], lang='nld')
        if syns:
            lemmas = syns[0].lemma_names(lang='nld')
            if lemmas:
                augmented[i] = random.choice(lemmas).replace('_', ' ')

    return ' '.join(augmented)

# Augment the underrepresented emotions
augmented_rows = []
for _, row in train_df.iterrows():
    emotion = row['Core_Pipeline_Emotion']
    sentence = row['Sentence']

    n_augments = augmentation_targets.get(emotion, 0)
    for _ in range(n_augments):
        augmented_rows.append({
            'Corrected Sentence': augment_with_synonyms(sentence),
            'Corrected_Emotion': emotion
        })

# Combine original and augmented data
df_aug = pd.DataFrame(augmented_rows)
df_combined = pd.concat([train_df, df_aug], ignore_index=True)

# Map emotion labels to indices
class_names = ['neutral', 'sadness', 'happiness', 'surprise', 'fear', 'anger', 'disgust']
df_combined = df_combined[df_combined['Corrected_Emotion'].isin(class_names)]
df_combined['label'] = df_combined['Corrected_Emotion'].map(lambda x: class_names.index(x))


# Create training/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined['Corrected Sentence'].tolist(),
    df_combined['label'].tolist(),
    test_size=0.1,
    random_state=42
)

In [27]:
# Make sure both dataframes have the same structure
df_combined['Sentence'] = df_combined['Corrected Sentence']
df_combined['Core_Pipeline_Emotion'] = df_combined['Corrected_Emotion']
df_combined = df_combined.drop(columns=['Corrected Sentence', 'Corrected_Emotion', 'label'])

# Let's assume df_original has columns 'Sentence' and 'Emotion'
train_df = train_df.rename(columns={'Emotion': 'Core_Pipeline_Emotion'})

# Combine both datasets
full_df = pd.concat([df_combined[['Sentence', 'Core_Pipeline_Emotion']],
                     train_df[['Sentence', 'Core_Pipeline_Emotion']]],
                    ignore_index=True)

# Print summary
print("Unique emotions in full_df:", full_df['Core_Pipeline_Emotion'].unique())
print("Number of emotions in full_df:")
print(full_df['Core_Pipeline_Emotion'].value_counts())


Unique emotions in full_df: ['surprise' 'sadness' 'disgust' 'fear' 'anger' 'neutral' 'happiness']
Number of emotions in full_df:
Core_Pipeline_Emotion
surprise     1700
neutral      1576
sadness      1480
fear         1463
happiness    1145
disgust       837
anger         592
Name: count, dtype: int64


In [28]:
# rename core pipeline emotion to emotion
full_df = full_df.rename(columns={'Core_Pipeline_Emotion': 'Emotion'})

full_df.head()

Unnamed: 0,Sentence,Emotion
0,"Bewust, kram bewust. ik vind het goed dat het is.",surprise
1,"Bewust, niet bewust. hooghartigheid vind het h...",surprise
2,"Bewust, kram bewust. hooghartigheid vind het g...",surprise
3,"Er staat wel houden open, bezwaar het is niet.",sadness
4,"Er staat wel houden open, maar het is niet.",sadness


In [29]:
# Save the df_combined DataFrame to a CSV file (Training_dataset.csv)
#full_df.to_csv('Training_dataset.csv', index=False, encoding='utf-8')


In [30]:
# Save the test_df DataFrame to a CSV file (Test_dataset.csv)
# test_df.to_csv('Test_dataset.csv', index=False, encoding='utf-8')

In [None]:
# Show duplicate sentences in df_combined and test_df
duplicates_combined = full_df[full_df.duplicated(subset=['Sentence'], keep=False)]
duplicates_test = test_df[test_df.duplicated(subset=['Sentence'], keep=False)]
print("Duplicate sentences in df_combined:")
print(duplicates_combined)
print("Duplicate sentences in test_df:")
print(duplicates_test)

# Show if there are any sentences both in df_combined and test_df
common_sentences = set(full_df['Sentence']).intersection(set(test_df['Sentence']))
if common_sentences:
    print("Common sentences found in both df_combined and test_df:")
    print(common_sentences)

# remove duplicates from full_df
full_df = full_df.drop_duplicates(subset=['Sentence'])
# remove duplicates from test_df
test_df = test_df.drop_duplicates(subset=['Sentence'])

#remove the common sentences from train_df

full_df = full_df[~full_df['Sentence'].isin(test_df['Sentence'])]

# Save the final DataFrames to CSV files
#full_df.to_csv('Training_dataset.csv', index=False, encoding='utf-8')
#test_df.to_csv('Test_dataset.csv', index=False, encoding='utf-8')

Duplicate sentences in df_combined:
                                               Sentence    Emotion
6                                          agrafe doen.    disgust
7                                          agrafe doen.    disgust
8                                            niet doen.    disgust
9                                          agrafe doen.    disgust
10                                           niet doen.    disgust
...                                                 ...        ...
8788                                         Top. Mooi.  happiness
8789  Jullie gaan naar de cocoonruimte. Dat is de gr...    neutral
8790          En jullie leren elkaar daar beter kennen.  happiness
8791              De barbie room. Hij heeft wel koffer.   surprise
8792                     Ja. Hebben jullie geen koffer?    sadness

[4141 rows x 2 columns]
Duplicate sentences in test_df:
                                              Sentence    Emotion
11                                   