## Prerequisites

In [1]:
import pandas as pd
import numpy as np

## Data preparation

In [2]:
# Load the training split: ';'-separated text with no header row, so the
# two column names are supplied here.
emotion_train = pd.read_csv(
    "../data/emotions/train.txt",
    sep=';',
    header=None,
    names=['Sentence', 'Label'],
)

In [3]:
# show first 5 rows (the default for head()) to sanity-check the parse
emotion_train.head()

Unnamed: 0,Sentence,Label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# data frame details: per-column dtype and non-null count, plus memory usage
emotion_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  16000 non-null  object
 1   Label     16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [5]:
# (rows, columns) of the training frame
emotion_train.shape

(16000, 2)

In [6]:
# distinct emotion labels present in the training data
emotion_train['Label'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

## Clean the emojis dataset

In [7]:
# read the second dataset: the emoji sentiment table (CSV with a header row)
emojis = pd.read_csv("../data/emojis/Emoji_Sentiment_Data_v1.0.csv")

In [8]:
emojis.head()
# Columns we need from this table:
# - the emoji itself: returned as the output
# - the name of the emoji: matched against the input message
# - the number of occurrences: the emoji with the highest count is suggested first
#   (TODO: ranking rule still undecided)

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


In [9]:
# Work on just the three per-sentiment count columns.
emoji_sentiment = emojis[['Negative', 'Neutral', 'Positive']]
# Assign each emoji exactly one sentiment: the name of the column holding the
# largest count in its row. idxmax(axis=1) is the vectorised form of a
# row-wise argmax lookup; on ties the first of the tied columns (in the order
# listed above) is chosen.
emojis['Sentiment'] = emoji_sentiment.idxmax(axis=1)

In [10]:
# before trimming: the frame still carries every original column plus 'Sentiment'
emojis.head() # we don't need all of these columns

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block,Sentiment
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons,Positive
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats,Positive
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols,Positive
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons,Positive
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons,Negative


In [11]:
# Trim the frame to the columns used from here on: the emoji, its occurrence
# count, its Unicode name and the derived sentiment.
emojis = emojis.loc[
    :,
    ['Emoji', 'Occurrences', 'Unicode name', 'Sentiment'],
]

In [12]:
# after trimming: only Emoji, Occurrences, Unicode name and Sentiment remain
emojis.head()

Unnamed: 0,Emoji,Occurrences,Unicode name,Sentiment
0,😂,14622,FACE WITH TEARS OF JOY,Positive
1,❤,8050,HEAVY BLACK HEART,Positive
2,♥,7144,BLACK HEART SUIT,Positive
3,😍,6359,SMILING FACE WITH HEART-SHAPED EYES,Positive
4,😭,5526,LOUDLY CRYING FACE,Negative


In [13]:
# Save the cleaned emoji table as CSV, without the index column.
# The path literal previously began with a stray double quote ('"../data/...'),
# which pointed at a directory literally named '"..' instead of the parent
# directory's data folder.
emojis.to_csv('../data/emojis_sentiment_cleaned.csv', index=False)

## Merge the two datasets

In [14]:
# emojis whose assigned sentiment is 'Neutral'
emojis.loc[emojis['Sentiment'].eq('Neutral')]

Unnamed: 0,Emoji,Occurrences,Unicode name,Sentiment
17,😏,1522,SMIRKING FACE,Neutral
31,☯,992,YIN YANG,Neutral
35,✨,848,SPARKLES,Neutral
38,★,828,BLACK STAR,Neutral
39,█,798,FULL BLOCK,Neutral
...,...,...,...,...
959,♊,1,GEMINI,Neutral
964,➛,1,DRAFTING POINT RIGHTWARDS ARROW,Neutral
965,♝,1,BLACK CHESS BISHOP,Neutral
966,❋,1,HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,Neutral


In [15]:
# how do i match these two arrays?
# ['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
# ['Negative', 'Neutral','Positive']

### option
# ['sadness', 'negative', 'neutral']
# ['anger', 'negative']
# ['love', 'positive', 'neutral']
# ['surprise', 'neutral']
# ['fear', 'negative']
# ['joy', 'positive', 'neutral']

In [16]:
# Group the emotion labels by the coarse sentiment they can express.
negative_emotions = ['sadness', 'anger', 'fear']
neutral_emotions = ['sadness', 'love', 'surprise', 'joy']  # overlaps the other two lists; not consulted below
positive_emotions = ['love', 'joy']

# Build a label -> sentiment lookup. Negative entries are written last so they
# take precedence, and any label found in neither list falls back to 'Neutral'.
sentiment_by_label = dict.fromkeys(positive_emotions, 'Positive')
sentiment_by_label.update(dict.fromkeys(negative_emotions, 'Negative'))

# Derive the new 'Sentiment' column from 'Label', one value per row.
emotion_train['Sentiment'] = [
    sentiment_by_label.get(label, 'Neutral')
    for label in emotion_train['Label']
]

In [17]:
# inspect the derived 'Sentiment' column next to the original labels
emotion_train.head(20)

Unnamed: 0,Sentence,Label,Sentiment
0,i didnt feel humiliated,sadness,Negative
1,i can go from feeling so hopeless to so damned...,sadness,Negative
2,im grabbing a minute to post i feel greedy wrong,anger,Negative
3,i am ever feeling nostalgic about the fireplac...,love,Positive
4,i am feeling grouchy,anger,Negative
5,ive been feeling a little burdened lately wasn...,sadness,Negative
6,ive been taking or milligrams or times recomme...,surprise,Neutral
7,i feel as confused about life as a teenager or...,fear,Negative
8,i have been with petronas for years i feel tha...,joy,Positive
9,i feel romantic too,love,Positive


In [18]:
# the emoji frame's 'Sentiment' column is the key shared with emotion_train
emojis.head(20)

Unnamed: 0,Emoji,Occurrences,Unicode name,Sentiment
0,😂,14622,FACE WITH TEARS OF JOY,Positive
1,❤,8050,HEAVY BLACK HEART,Positive
2,♥,7144,BLACK HEART SUIT,Positive
3,😍,6359,SMILING FACE WITH HEART-SHAPED EYES,Positive
4,😭,5526,LOUDLY CRYING FACE,Negative
5,😘,3648,FACE THROWING A KISS,Positive
6,😊,3186,SMILING FACE WITH SMILING EYES,Positive
7,👌,2925,OK HAND SIGN,Positive
8,💕,2400,TWO HEARTS,Positive
9,👏,2336,CLAPPING HANDS SIGN,Positive


In [19]:
# Merge the two tables on the 'Sentiment' column.
# The key and join type are spelled out rather than left to pandas' default of
# joining on every shared column name, so adding another common column to
# either frame cannot silently change the join.
# This is a many-to-many join: each sentence is paired with every emoji that
# has the same sentiment, so the result is far larger than either input.
emotion_emoji_merged = emotion_train.merge(emojis, on='Sentiment', how='inner')

In [23]:
# first rows of the merged frame: the same sentence repeats once per matching emoji
emotion_emoji_merged.head()

Unnamed: 0,Sentence,Label,Sentiment,Emoji,Occurrences,Unicode name
0,i didnt feel humiliated,sadness,Negative,😭,5526,LOUDLY CRYING FACE
1,i didnt feel humiliated,sadness,Negative,😩,1808,WEARY FACE
2,i didnt feel humiliated,sadness,Negative,😒,1385,UNAMUSED FACE
3,i didnt feel humiliated,sadness,Negative,😔,1205,PENSIVE FACE
4,i didnt feel humiliated,sadness,Negative,😡,756,POUTING FACE


In [20]:
# shape after merging (~4.37M rows, since every sentence is paired with each emoji sharing its sentiment)
emotion_emoji_merged.shape

(4370002, 6)

In [29]:
# missing values per column of the merged frame
emotion_emoji_merged.isna().sum()

Sentence        0
Label           0
Sentiment       0
Emoji           0
Occurrences     0
Unicode name    0
dtype: int64

In [32]:
# number of rows that exactly repeat an earlier row across all columns
emotion_emoji_merged.duplicated().sum()

481

In [26]:
# Distribution of merged rows per emotion label. These counts are inflated by
# the merge: each sentence appears once per emoji matched to its sentiment.
print(emotion_emoji_merged['Label'].value_counts())

joy         2579122
love         627224
sadness      503928
anger        233172
surprise     217360
fear         209196
Name: Label, dtype: int64


In [33]:
# save the merged dataset as csv
# NOTE(review): export left disabled; presumably because the merged frame has
# ~4.37M rows and the CSV would be large. Confirm before re-enabling.
# emotion_emoji_merged.to_csv('../data/sentence_emoji_sentiment_merged.csv', index=False)

## Explore merged dataset