In [1]:
# List of unique sentiment classes from all three datasets:
# anger
# boredom
# empty
# enthusiasm
# fear
# fun
# happiness
# hate
# joy
# love
# neutral ---> This is a standalone high level class
# relief
# sadness
# surprise
# worry
# ---------------------

#Positive leaning:

# enthusiasm
# fun
# happiness
# joy
# love
# relief
# surprise
# ----------

#Negative leaning:

# anger
# boredom
# empty
# fear
# hate
# sadness
# worry
# ---------------------

#Positive Spectrum:

# Mild / Low-Arousal Positive (1) = relief
# Moderate / Everyday Positive (2) = fun, happiness, surprise
# High / Energetic Positive (3) = enthusiasm and joy
# Peak / Deep Positive (4) = love
# ----------

#Negative Spectrum:

# Mild / Low-Arousal Negative (1) = boredom and worry
# Moderate / Internalized Negative (2) = sadness and empty
# High / Threat-Oriented Negative (3) = fear and anger
# Extreme / Hostile Negative (4) = hate
# ---------------------

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../data/final_dataset.csv')
df2 = pd.read_csv('../data/text.csv')
df3 = pd.read_csv('../data/tweet_emotions.csv')

In [4]:
df = df.rename(columns={'emotion': 'label'})

In [5]:
df.head()

Unnamed: 0,text,label
0,i feel rather funny ending with so many dupes ...,fun
1,i feel surprised by the result,surprise
2,i am officially feeling festive,neutral
3,i suddenly found myself standing before this w...,surprise
4,i look at the meager pile of food i purchased ...,enthusiasm


In [6]:
df['label'].value_counts()

label
fun           10000
surprise      10000
enthusiasm    10000
anger         10000
happiness     10000
hate          10000
love          10000
relief        10000
sadness        9999
neutral        9998
empty          6358
Name: count, dtype: int64

In [7]:
df2.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [8]:
df2 = df2.drop(columns=['Unnamed: 0'])

label_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

df2["label"] = df2["label"].map(label_map)

In [9]:
df2['label'].value_counts()

label
joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: count, dtype: int64

In [10]:
df2.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sadness
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sadness
4,i am a kindergarten teacher and i am thoroughl...,fear


In [11]:
df3 = df3.drop(columns = ['tweet_id'])

In [12]:
df3 = df3.rename(columns={'sentiment': 'label', 'content' : 'text'})

In [13]:
df3.head()

Unnamed: 0,label,text
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [14]:
df3 = df3[['text', 'label']]

In [15]:
df3['text'] = df3['text'].str.replace(r'@\w+', '', regex=True).str.strip()

In [16]:
df3['label'].value_counts()

label
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [17]:
df3.head()

Unnamed: 0,text,label
0,i know i was listenin to bad habit earlier an...,empty
1,Layin n bed with a headache ughhhh...waitin o...,sadness
2,Funeral ceremony...gloomy friday...,sadness
3,wants to hang out with friends SOON!,enthusiasm
4,We want to trade with someone who has Houston ...,neutral


In [18]:
# -------------------- #

In [19]:
#Positive sentiment label 1 relief

labels_POS_1 = ['relief']

df_POS_1 = (
    pd.concat(
        [
            df[df['label'].isin(labels_POS_1)],
            df2[df2['label'].isin(labels_POS_1)],
            df3[df3['label'].isin(labels_POS_1)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [20]:
#Positive sentiment label 2 fun, happiness, surprise

labels_POS_2 = ['fun', 'happiness', 'surprise']

df_POS_2 = (
    pd.concat(
        [
            df[df['label'].isin(labels_POS_2)],
            df2[df2['label'].isin(labels_POS_2)],
            df3[df3['label'].isin(labels_POS_2)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [21]:
#Positive sentiment label 3 enthusiasm and joy

labels_POS_3 = ['enthusiasm', 'joy']

df_POS_3 = (
    pd.concat(
        [
            df[df['label'].isin(labels_POS_3)],
            df2[df2['label'].isin(labels_POS_3)],
            df3[df3['label'].isin(labels_POS_3)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [22]:
#Positive sentiment label 4 love

labels_POS_4 = ['love']

df_POS_4 = (
    pd.concat(
        [
            df[df['label'].isin(labels_POS_4)],
            df2[df2['label'].isin(labels_POS_4)],
            df3[df3['label'].isin(labels_POS_4)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [23]:
# Now negative sentiment DFs --> We will randomly sample # of rows based on smallest df from positive, negative, and neutral

In [24]:
#Negative sentiment label 1 boredom and worry

labels_NEG_1 = ['boredom', 'worry']

df_NEG_1 = (
    pd.concat(
        [
            df[df['label'].isin(labels_NEG_1)],
            df2[df2['label'].isin(labels_NEG_1)],
            df3[df3['label'].isin(labels_NEG_1)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [25]:
#Negative sentiment label 2 sadness and empty

labels_NEG_2 = ['sadness', 'empty']

df_NEG_2 = (
    pd.concat(
        [
            df[df['label'].isin(labels_NEG_2)],
            df2[df2['label'].isin(labels_NEG_2)],
            df3[df3['label'].isin(labels_NEG_2)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [26]:
#Negative sentiment label 3 fear and anger

labels_NEG_3 = ['fear', 'anger']

df_NEG_3 = (
    pd.concat(
        [
            df[df['label'].isin(labels_NEG_3)],
            df2[df2['label'].isin(labels_NEG_3)],
            df3[df3['label'].isin(labels_NEG_3)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [27]:
#Negative sentiment label 4 hate

labels_NEG_4 = ['hate']

df_NEG_4 = (
    pd.concat(
        [
            df[df['label'].isin(labels_NEG_4)],
            df2[df2['label'].isin(labels_NEG_4)],
            df3[df3['label'].isin(labels_NEG_4)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [28]:
#Neutral sentiment df

labels_NEU = ['neutral']

df_NEU = (
    pd.concat(
        [
            df[df['label'].isin(labels_NEU)],
            df2[df2['label'].isin(labels_NEU)],
            df3[df3['label'].isin(labels_NEU)]
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [29]:
# Now going to sample 5000 rows randomly from each DF, combine, clean up, and export as csv

In [30]:
#Stratified random sampling to ensure equal distribution of classes when randomly sampling

def stratified_sample(df, n_samples, label_col='label', random_state=42):
    labels = df[label_col].unique()
    n_labels = len(labels)
    n_per_label = n_samples // n_labels
    
    sampled_dfs = [
        df[df[label_col] == label].sample(
            n=min(len(df[df[label_col] == label]), n_per_label),
            random_state=random_state
        )
        for label in labels
    ]
    return pd.concat(sampled_dfs).reset_index(drop=True)

In [31]:
df_POS_1_sampled = df_POS_1.sample(n=5000, random_state=42)

# df_POS_2_sampled = df_POS_2.sample(n=5000, random_state=42) #stratify
df_POS_2_sampled = stratified_sample(df_POS_2, 5000)

# df_POS_3_sampled = df_POS_3.sample(n=5000, random_state=42) #stratify
df_POS_3_sampled = stratified_sample(df_POS_3, 5000)

df_POS_4_sampled = df_POS_4.sample(n=5000, random_state=42)

df_NEG_1_sampled = df_NEG_1.sample(n=5000, random_state=42)

# df_NEG_2_sampled = df_NEG_2.sample(n=5000, random_state=42) #stratify
df_NEG_2_sampled = stratified_sample(df_NEG_2, 5000)

# df_NEG_3_sampled = df_NEG_3.sample(n=5000, random_state=42) #stratify
df_NEG_3_sampled = stratified_sample(df_NEG_3, 5000)

df_NEG_4_sampled = df_NEG_4.sample(n=5000, random_state=42)

df_NEU_sampled = df_NEU.sample(n=5000, random_state=42)

In [32]:
df_all = (
    pd.concat(
        [
            df_POS_1_sampled,
            df_POS_2_sampled,
            df_POS_3_sampled,
            df_POS_4_sampled,
            df_NEG_1_sampled,
            df_NEG_2_sampled,
            df_NEG_3_sampled,
            df_NEG_4_sampled,
            df_NEU_sampled,
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [33]:
all_mapping = {
    "neutral" : 0,
    "relief" : 1,
    "fun" : 2,
    "happiness" : 2,
    "surprise" : 2,
    "enthusiasm" : 3,
    "joy" : 3,
    "love" : 4,
    "boredom" : -1,
    "worry" : -1,
    "sadness" : -2,
    "empty" : -2,
    "fear" : -3,
    "anger" : -3,
    "hate" : -4
}

df_all["label"] = df_all["label"].map(all_mapping)

In [34]:
#Cleaning text
import re

CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "i've": "i have",
    "i'll": "i will",
    "i'd": "i would",
    "you're": "you are",
    "you've": "you have",
    "you'll": "you will",
    "you'd": "you would",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "that's": "that is",
    "there's": "there is",
    "what's": "what is",
}

def expand_contractions(text):
    pattern = re.compile(r'\b(' + '|'.join(CONTRACTIONS.keys()) + r')\b')
    return pattern.sub(lambda x: CONTRACTIONS[x.group()], text)

def clean_text(s):
    if pd.isna(s):
        return s

    s = s.lower()

    # expand contractions FIRST
    s = expand_contractions(s)

    # remove urls
    s = re.sub(r'http\S+|www\S+', ' ', s)

    # remove html tags
    s = re.sub(r'<.*?>', ' ', s)

    # remove punctuation / non-alphanumeric chars
    s = re.sub(r'[^a-z0-9\s]', ' ', s)

    # collapse multiple spaces
    s = re.sub(r'\s+', ' ', s)

    # strip leading/trailing whitespace
    return s.strip()

In [35]:
df_all['text'] = df_all['text'].apply(clean_text)

In [36]:
df_all

Unnamed: 0,text,label
0,im feeling a lil uncomfortable,1
1,i awoke pleased with the feeling but dismayed ...,1
2,i feel comfortable just thinking about them,1
3,i wear feels comfortable,1
4,definitely the first time i have felt alive ag...,1
...,...,...
44993,i got it,0
44994,i do get the feeling that even among their so ...,0
44995,just booked some tickets for jessicaaaa,0
44996,i went in with no idea what it was going to be...,0


In [37]:
df_all.to_csv('../data/clean_combined_dataset.csv', index=False)