In [219]:
import pandas as pd
import re 
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy as np


In [209]:
df = pd.read_csv("text.csv")

with open ("stopwords.txt", "r") as f:
    stopwords = set(f.read().splitlines())
if df.columns[0] == "Unnamed: 0":
    df.rename(columns={"Unnamed: 0": "index"}, inplace=True)
df.set_index("index", inplace= True)
print(df)

                                                     text  label
index                                                           
0           i just feel really helpless and heavy hearted      4
1       ive enjoyed being able to slouch about relax a...      0
2       i gave up my internship with the dmrg and am f...      4
3                              i dont know i feel so lost      0
4       i am a kindergarten teacher and i am thoroughl...      4
...                                                   ...    ...
416804  i feel like telling these horny devils to find...      2
416805  i began to realize that when i was feeling agi...      3
416806  i feel very curious be why previous early dawn...      5
416807  i feel that becuase of the tyranical nature of...      3
416808  i think that after i had spent some time inves...      5

[416809 rows x 2 columns]


In [210]:
sentences = df["text"].apply(lambda x: x.lower().split()).tolist()
tokens = []
vc = pd.Series(tokens).value_counts()
vc = vc.head(7000)
words = vc.index.tolist()
def clean_text(text):
    text = text.lower()
    row_tokens = text.split()
    row_tokens = [word for word in row_tokens if word not in stopwords]
    tokens.extend(row_tokens)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text
df['text'] = df['text'].apply(clean_text)


In [211]:
sentences

[['i', 'just', 'feel', 'really', 'helpless', 'and', 'heavy', 'hearted'],
 ['ive',
  'enjoyed',
  'being',
  'able',
  'to',
  'slouch',
  'about',
  'relax',
  'and',
  'unwind',
  'and',
  'frankly',
  'needed',
  'it',
  'after',
  'those',
  'last',
  'few',
  'weeks',
  'around',
  'the',
  'end',
  'of',
  'uni',
  'and',
  'the',
  'expo',
  'i',
  'have',
  'lately',
  'started',
  'to',
  'find',
  'myself',
  'feeling',
  'a',
  'bit',
  'listless',
  'which',
  'is',
  'never',
  'really',
  'a',
  'good',
  'thing'],
 ['i',
  'gave',
  'up',
  'my',
  'internship',
  'with',
  'the',
  'dmrg',
  'and',
  'am',
  'feeling',
  'distraught'],
 ['i', 'dont', 'know', 'i', 'feel', 'so', 'lost'],
 ['i',
  'am',
  'a',
  'kindergarten',
  'teacher',
  'and',
  'i',
  'am',
  'thoroughly',
  'weary',
  'of',
  'my',
  'job',
  'after',
  'having',
  'taken',
  'the',
  'university',
  'entrance',
  'exam',
  'i',
  'suffered',
  'from',
  'anxiety',
  'for',
  'weeks',
  'as',
  'i',

In [212]:
vc = pd.Series(tokens).value_counts()
vc = vc.head(7000)
words = vc.index.tolist()
print(vc)


feel         289939
feeling      134185
like          73972
im            61662
am            54541
              ...  
politely         33
recipient        33
driveway         33
exited           33
psycho           33
Name: count, Length: 7000, dtype: int64


In [214]:
word_to_index = {}
index = 1 

for sentence in sentences:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = index
            index += 1
print("Vocabulary:", word_to_index)




In [215]:
def sentence_to_sequence(sentence, word_to_index):
    return [word_to_index.get(word, word_to_index["<UNK>"]) for word in sentence]

sequences = [sentence_to_sequence(sent, word_to_index) for sent in sentences]
print("Sequences:", sequences)

KeyError: '<UNK>'

In [217]:
all_words = []
for sentence in df['text']:
    all_words.extend(clean_text(sentence))

word_freq = Counter(all_words)

word2idx = {word: idx+2 for idx, word in enumerate(all_words)}

word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

def text_to_sequence(text):
    return [word2idx.get(word, word2idx['<UNK>']) for word in clean_text(text)]


df['sequence'] = df['text'].apply(text_to_sequence)

In [218]:
df['sequence']

index
0         [40442288, 40442304, 40441585, 40442296, 40442...
1         [40442288, 40442222, 40442310, 40442304, 40442...
2         [40442288, 40442304, 40442259, 40442293, 40442...
3         [40442288, 40442304, 40442270, 40442307, 40442...
4         [40442288, 40442304, 40442293, 40442280, 40442...
                                ...                        
416804    [40442288, 40442304, 40442275, 40442310, 40442...
416805    [40442288, 40442304, 40442294, 40442310, 40442...
416806    [40442288, 40442304, 40442275, 40442310, 40442...
416807    [40442288, 40442304, 40442275, 40442310, 40442...
416808    [40442288, 40442304, 40442300, 40442301, 40442...
Name: sequence, Length: 416809, dtype: object

In [220]:

def pad_sequences(sequences, max_len):
    padded = np.zeros((len(sequences), max_len), dtype=int)
    for i, seq in enumerate(sequences):
        length = min(len(seq), max_len)
        padded[i, :length] = seq[:length]
    return padded

# Example: set max_len (you can use 50, 100, or based on your data)
max_len = 50  # try different values if needed

# Apply padding
padded_array = pad_sequences(df['sequence'], max_len)

In [221]:
padded_array

array([[40442288, 40442304, 40441585, ...,        0,        0,        0],
       [40442288, 40442222, 40442310, ..., 40442270, 40442304, 40442296],
       [40442288, 40442304, 40442259, ..., 40442310, 40442309, 40442288],
       ...,
       [40442288, 40442304, 40442275, ..., 40442288, 40442258, 40442304],
       [40442288, 40442304, 40442275, ..., 40442300, 40442301, 40442310],
       [40442288, 40442304, 40442300, ..., 40442288, 40442259, 40442293]])

In [148]:
list_sadness = []
list_joy = []
list_love = []
list_anger = []
list_fear = []
list_surprise = []
list_unlabeled = []
def categorize_emotions(row):
    if row['label'] == 0:
        list_sadness.append(row['text'])
    elif row['label'] == 1:
        list_joy.append(row['text'])
    elif row['label'] == 2:
        list_love.append(row['text'])
    elif row['label'] == 3:
        list_anger.append(row['text'])
    elif row['label'] == 4:
        list_fear.append(row['text'])
    elif row['label'] == 5:
        list_surprise.append(row['text'])
    return row
df = df.apply(categorize_emotions, axis=1)

In [149]:
sadness = pd.DataFrame({'text': list_sadness, 'label': 0})
joy = pd.DataFrame({'text': list_joy, 'label': 1})
love = pd.DataFrame({'text': list_love, 'label': 2})
anger = pd.DataFrame({'text': list_anger, 'label': 3})
fear = pd.DataFrame({'text': list_fear, 'label': 4})
surprise = pd.DataFrame({'text': list_surprise, 'label': 5})

In [150]:
sorted_df = pd.concat([sadness, joy, love, anger, fear, surprise], ignore_index=True)


In [None]:
sorted_df.head()

Unnamed: 0,text,label
0,ive enjoyed being able to slouch about relax a...,0
1,i dont know i feel so lost,0
2,i was beginning to feel quite disheartened,0
3,i can still lose the weight without feeling de...,0
4,im feeling a little like a damaged tree and th...,0


In [8]:
type(df)

pandas.core.frame.DataFrame

In [9]:
sadness_tokens  = []
joy_tokens      = []
love_tokens     = []
anger_tokens    = []
fear_tokens     = []
surprise_tokens = []

def tokenize_sadness(row):
    token = row['text'].split()
    sadness_tokens.extend(token)
    return row
def tokenize_joy(row):
    token = row['text'].split()
    joy_tokens.extend(token)
    return row
def tokenize_love(row):
    token = row['text'].split()
    love_tokens.extend(token)
    return row
def tokenize_anger(row):
    token = row['text'].split()
    anger_tokens.extend(token)
    return row
def tokenize_fear(row):
    token = row['text'].split()
    fear_tokens.extend(token)
    return row
def tokenize_surprise(row):
    token = row['text'].split()
    surprise_tokens.extend(token)
    return row
sadness = sadness.apply(tokenize_sadness, axis=1)
joy = joy.apply(tokenize_joy, axis=1)
love = love.apply(tokenize_love, axis=1)
anger = anger.apply(tokenize_anger, axis=1)
fear = fear.apply(tokenize_fear, axis=1)
surprise = surprise.apply(tokenize_surprise, axis=1)


In [10]:
sadness_words_vc  = pd.Series(sadness_tokens).value_counts()
joy_words_vc      = pd.Series(joy_tokens).value_counts()
love_words_vc     = pd.Series(love_tokens).value_counts()
anger_words_vc    = pd.Series(anger_tokens).value_counts()
fear_words_vc     = pd.Series(fear_tokens).value_counts()
surprise_words_vc = pd.Series(surprise_tokens).value_counts()

In [11]:
sadness_words_vc.head(200)

i           198644
feel         84184
and          70589
to           60555
the          56262
             ...  
unhappy       1505
terrible      1503
shitty        1494
off           1494
helpless      1491
Name: count, Length: 200, dtype: int64

In [12]:
joy_words_vc.head(200)


i          225154
feel       102709
and         86957
to          85554
the         79354
            ...  
glad         1756
sweet        1745
proud        1736
getting      1732
while        1726
Name: count, Length: 200, dtype: int64

In [13]:
love_words_vc.head(50)


i          56675
feel       24463
and        22462
the        21567
to         20979
a          15772
of         13461
that       11769
my         11453
feeling    10303
in          7866
like        7558
it          7210
for         6763
me          6043
have        5770
is          5313
so          5243
this        4895
you         4821
im          4775
but         4762
with        4724
be          4577
am          4399
was         4357
about       4333
not         4238
as          4144
on          3607
just        2891
more        2802
all         2723
when        2673
or          2600
can         2564
at          2504
love        2365
her         2300
very        2289
are         2274
really      2247
because     2202
do          2154
know        2077
how         2073
he          2033
what        1962
t           1960
if          1952
Name: count, dtype: int64

In [14]:

anger_words_vc.head(50)


i          93430
feel       37717
and        33570
to         30176
the        27873
a          21523
feeling    19236
that       17073
of         16617
my         14297
it         11598
in         10434
like       10367
im          9298
me          9236
so          8809
but         8584
was         8537
for         7998
with        7990
have        7591
is          6979
am          6944
not         6818
this        6744
when        6279
at          6110
be          6089
about       5836
just        5768
or          5681
on          5376
you         5283
because     4993
as          4893
all         4221
up          4146
by          4090
t           3899
do          3881
really      3771
myself      3766
can         3433
know        3399
out         3352
get         3340
if          3279
being       3203
more        3142
been        3087
Name: count, dtype: int64

In [15]:
fear_words_vc.head(200)


i        77495
feel     30913
to       28211
and      27600
the      23126
         ...  
left       687
might      684
take       672
while      670
away       669
Name: count, Length: 200, dtype: int64

In [16]:
surprise_words_vc.head(200)

i             24805
feel           9953
and            9101
the            8466
to             7631
              ...  
makes           188
someone         188
completely      188
sure            188
off             185
Name: count, Length: 200, dtype: int64

In [133]:
print("Sadness:",  len(sadness) / 416809 * 100)
print(" ")
print("joy:", len(joy) / 416809 * 100)
print(" ")
print("love:",  len(love) / 416809 * 100)
print(" ")
print("anger:",  len(anger) / 416809 * 100)
print(" ")
print("fear:",len(fear) / 416809 * 100)
print(" ")
print("surprise:",  len(surprise) / 416809 * 100)
print(" ")

Sadness: 29.07494799776396
 
joy: 33.84451871240784
 
love: 8.29012809224369
 
anger: 13.751382527728529
 
fear: 11.446969715145308
 
surprise: 3.5920529547106708
 


In [19]:
lemma = ""
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return list(synonyms)

def get_antonyms(word):
    antonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            for ant in lemma.antonyms():
                antonyms.add(ant.name().lower())
    return list(antonyms)

def get_hypernyms(word):
    hypernyms = set()
    for syn in wn.synsets(word):
        for hyper in syn.hypernyms():
            for lemma in hyper.lemmas():
                hypernyms.add(lemma.name().lower())
    return list(hypernyms)

def get_hyponyms(word):
    hyponyms = set()
    for syn in wn.synsets(word):
        for hypo in syn.hyponyms():
            for lemma in hypo.lemmas():
                hyponyms.add(lemma.name().lower())
    return list(hyponyms)

In [37]:
text = "I feel angry and frustrated"

new_words = text.split()
emotion_related_words = []

word = "angry"
emotion_related_words.extend(get_synonyms(word))
emotion_related_words.extend(get_antonyms(word))
emotion_related_words.extend(get_hyponyms(word))
emotion_related_words.extend(get_hypernyms(word))

print(set(get_synonyms(word)))

{'tempestuous', 'furious', 'wild', 'angry', 'raging'}
