In [16]:
import pandas as pd
import pandas as pd
import nltk
import re
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob

In [2]:
CATEGORIES = ['empathy', 'distress']

TRAIN_DATA_PATH_IN = "./datasets/WASSA23_essay_level_with_labels_train.tsv"
TRAIN_DATA_PATH_OUT = "datasets/train_essay_level_preproc_EMP.tsv"

DEV_DATA_PATH_IN = "./datasets/WASSA23_essay_level_dev_preproc.tsv"
DEV_DATA_PATH_OUT = "datasets/dev_essay_level_preproc_EMP.tsv"

TEST_DATA_PATH_IN = "datasets/WASSA23_essay_level_test_preproc.tsv"
TEST_DATA_PATH_OUT = "datasets/test_essay_level_preproc_EMP.tsv"

In [106]:

def read_lexicon_df():
    categories_dfs = {}
    for category in CATEGORIES:
        categories_dfs[category] = pd.read_csv(f"./lexicon/{category}_lexicon.txt", skiprows = [0],
                                               names=['word', category], sep=',', engine='python')
    lexicon = pd.DataFrame(columns=['word'])
    
    for category in CATEGORIES:
        lexicon = pd.merge(lexicon, categories_dfs[category], on='word', how='outer')
    
    lexicon.dropna(inplace=True) # row with empty string
    lexicon.sort_values(by='word', inplace=True, ignore_index=True)
    lexicon.set_index('word', inplace=True)
    return lexicon

In [107]:
lexicon = read_lexicon_df()

In [108]:
lexicon

Unnamed: 0_level_0,empathy,distress
word,Unnamed: 1_level_1,Unnamed: 2_level_1
!,4.368395,4.291805
!!,2.333994,2.238771
!!!,2.333994,2.238771
!!!!,2.333994,2.238771
!!!!!!!!!!!!!!,2.333994,2.238771
...,...,...
“,4.446715,4.351591
“...,2.333994,2.238771
”,4.648777,4.262829
”.,2.333994,2.238771


In [111]:
lexicon.dropna

<bound method DataFrame.dropna of                  empathy  distress
word                              
!               4.368395  4.291805
!!              2.333994  2.238771
!!!             2.333994  2.238771
!!!!            2.333994  2.238771
!!!!!!!!!!!!!!  2.333994  2.238771
...                  ...       ...
“               4.446715  4.351591
“...            2.333994  2.238771
”               4.648777  4.262829
”.              2.333994  2.238771
…               4.099375  3.411232

[9356 rows x 2 columns]>

In [110]:
lexicon['empathy'].astype(float)
lexicon['distress'].astype(float)

word
!                 4.291805
!!                2.238771
!!!               2.238771
!!!!              2.238771
!!!!!!!!!!!!!!    2.238771
                    ...   
“                 4.351591
“...              2.238771
”                 4.262829
”.                2.238771
…                 3.411232
Name: distress, Length: 9356, dtype: float64

In [112]:
def get_stemmed_lexicon(lexicon):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stemmed_lexicon = {}

    lexicon['stemma'] = ['' for _ in range(len(lexicon))]
    
    for word, _ in lexicon.iterrows():
  
        lemma = lemmatizer.lemmatize(word)
        stemma = stemmer.stem(lemma)
        lexicon.loc[word, 'stemma'] = stemma

        if stemma in stemmed_lexicon:
            l = [lexicon.loc[word, 'empathy']+stemmed_lexicon[stemma][0],
                  lexicon.loc[word, 'distress']+stemmed_lexicon[stemma][1], 
                  stemmed_lexicon[stemma][-1]+1]
            stemmed_lexicon[stemma] = l
            
        else:
            l = [ lexicon.loc[word, 'empathy'], lexicon.loc[word, 'distress'], 1]
            stemmed_lexicon[stemma] = l
        

    print(stemmed_lexicon)
    #col_names = ['word', 'empathy', 'distress', 'count']
    
    stemmed_lexicon =  pd.DataFrame(stemmed_lexicon).T
    stemmed_lexicon.rename(columns={0:'empathy', 1:'distress', 2:'count'}, inplace=True)
    print(stemmed_lexicon)
    stemmed_lexicon['empathy'] = stemmed_lexicon['empathy'].astype(float) / stemmed_lexicon['count']
    stemmed_lexicon['distress'] = stemmed_lexicon['distress'].astype(float) / stemmed_lexicon['count']
    #stemmed_lexicon[stemmed_lexicon > 0] = 1
    print(stemmed_lexicon)
    #stemmed_lexicon = stemmed_lexicon.loc[(stemmed_lexicon!=0).any(axis=1)]
    stemmed_lexicon.to_csv('./lexicon/stemmed_lexicon_EMP.csv')
    lexicon.to_csv('./lexicon/lexicon_EMP.csv')
    return stemmed_lexicon, lexicon

In [113]:
stemmed_lexicon, lexicon = get_stemmed_lexicon(lexicon)

{'!': [4.36839481048805, 4.29180536964034, 1], '!!': [2.33399398365416, 2.23877081014407, 1], '!!!': [2.33399398365416, 2.23877081014407, 1], '!!!!': [2.33399398365416, 2.23877081014407, 1], '!!!!!!!!!!!!!!': [2.33399398365416, 2.23877081014407, 1], "!'": [2.33399398365416, 2.23877081014407, 1], '!)': [2.33399398365416, 2.23877081014407, 1], '!,': [2.33399398365416, 2.23877081014407, 1], '!?': [2.33399398365416, 2.23877081014407, 1], '!?!': [2.33399398365416, 2.23877081014407, 1], '!”': [2.33399398365416, 2.23877081014407, 1], '"!"""': [2.33399398365416, 2.23877081014407, 1], '""""': [2.38476133350099, 3.18264289635284, 1], '""","': [2.33399398365416, 2.23877081014407, 1], '"""."': [2.33399398365416, 2.23877081014407, 1], '"""?"': [2.33399398365416, 2.23877081014407, 1], '","""': [2.33399398365416, 2.23877081014407, 1], '"."""': [2.33399398365416, 2.23877081014407, 1], '".\'"""': [2.33399398365416, 2.23877081014407, 1], '"?"""': [2.33399398365416, 2.23877081014407, 1], '#': [3.51459465

In [117]:
stemmed_lexicon[250:300]

Unnamed: 0,empathy,distress,count
accompani,4.016118,3.074063,1.0
accomplish,2.891621,2.291637,3.0
accord,3.229234,2.96697,3.0
accordingli,3.163577,3.197889,1.0
account,3.121783,3.940747,6.0
accumul,3.713865,3.593881,1.0
accur,4.455513,2.152636,1.0
accus,3.266171,4.889083,4.0
accustom,3.038972,2.747147,1.0
achiev,3.889223,3.07919,4.0


In [114]:
stemmed_lexicon['count'].sum()

9356.0

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorenzoleuzzi/nltk_data...


True