In [37]:
import pandas as pd

from src.settings import SENTIMENT_RES_DIR


SENTIWORDNET_PATH = SENTIMENT_RES_DIR.joinpath('SentiWordNet_3.0.0.txt')

with SENTIWORDNET_PATH.open(mode='r') as f:
    sentiwordnet_lines = f.readlines()


sentiwordnet_lines[25:]

['# POS\tID\tPosScore\tNegScore\tSynsetTerms\tGloss\n',
 'a\t00001740\t0.125\t0\table#1\t(usually followed by `to\') having the necessary means or skill or know-how or authority to do something; "able to swim"; "she was able to program her computer"; "we were at last able to buy a car"; "able to get a grant for the project"\n',
 'a\t00002098\t0\t0.75\tunable#1\t(usually followed by `to\') not having the necessary means or skill or know-how; "unable to get to town without a car"; "unable to obtain funds"\n',
 'a\t00002312\t0\t0\tdorsal#2 abaxial#1\tfacing away from the axis of an organ or organism; "the abaxial surface of a leaf is the underside or side facing away from the stem"\n',
 'a\t00002527\t0\t0\tventral#2 adaxial#1\tnearest to or facing toward the axis of an organ or organism; "the upper side of a leaf is known as the adaxial surface"\n',
 'a\t00002730\t0\t0\tacroscopic#1\tfacing or on the side toward the apex\n',
 'a\t00002843\t0\t0\tbasiscopic#1\tfacing or on the side toward 

In [45]:
HEADERS = ['pos', 'id', 'pos_score', 'neg_score', 'synset_terms','gloss']

df_sentiwordnet = pd.DataFrame(
    columns=HEADERS,
    data=[row.rstrip('\n').split('\t') for row in sentiwordnet_lines[26:117685]],
)

df_sentiwordnet

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss
0,a,00001740,0.125,0,able#1,(usually followed by `to') having the necessar...
1,a,00002098,0,0.75,unable#1,(usually followed by `to') not having the nece...
2,a,00002312,0,0,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...
3,a,00002527,0,0,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...
4,a,00002730,0,0,acroscopic#1,facing or on the side toward the apex
...,...,...,...,...,...,...
117654,v,02771756,0,0,run_dry#1 dry_out#2,"become empty of water; ""The river runs dry in ..."
117655,v,02771888,0,0.125,fog_up#1,"get foggy; ""The windshield fogged up"""
117656,v,02771997,0,0,coal#1 char#1,"burn to charcoal; ""Without a drenching rain, t..."
117657,v,02772202,0.125,0.25,haze#1,"become hazy, dull, or cloudy"


In [46]:
df_sentiwordnet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117659 entries, 0 to 117658
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pos           117659 non-null  object
 1   id            117659 non-null  object
 2   pos_score     117659 non-null  object
 3   neg_score     117659 non-null  object
 4   synset_terms  117659 non-null  object
 5   gloss         117659 non-null  object
dtypes: object(6)
memory usage: 5.4+ MB


In [47]:
df_sentiwordnet.describe()

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss
count,117659,117659,117659,117659,117659,117659
unique,4,117374,20,20,115251,117033
top,n,1740,0,0,upstage#1,a variety of aster
freq,82115,4,100644,99631,4,23


In [48]:
df_sentiwordnet[df_sentiwordnet['pos'].isna()]

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss


In [49]:
df_sentiwordnet[df_sentiwordnet['id'].isna()]

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss


In [50]:
df_sentiwordnet[df_sentiwordnet['pos'] == '']

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss


In [51]:
df_sentiwordnet[df_sentiwordnet['id'] == '']

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss


In [55]:
def get_wn_offset(wn_id: str, pos: str) -> str:
    try:
        wn_id = int(wn_id.lstrip('0'))
    except ValueError:
        print(wn_id)
    offset = f'{wn_id}{pos}'
    return offset

def get_sentiment_score(pos: float, neg: float) -> float:
    return pos - neg

df_sentiwordnet = df_sentiwordnet.astype({'pos_score': float, 'neg_score': float})
df_sentiwordnet['synset_offset'] = df_sentiwordnet[['id', 'pos']].apply(lambda x: get_wn_offset(*x), axis=1)
df_sentiwordnet['sent_score'] = df_sentiwordnet[['pos_score', 'neg_score']].apply(lambda x: get_sentiment_score(*x), axis=1)
df_sentiwordnet

Unnamed: 0,pos,id,pos_score,neg_score,synset_terms,gloss,synset_offset,sent_score
0,a,00001740,0.125,0.000,able#1,(usually followed by `to') having the necessar...,1740a,0.125
1,a,00002098,0.000,0.750,unable#1,(usually followed by `to') not having the nece...,2098a,-0.750
2,a,00002312,0.000,0.000,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...,2312a,0.000
3,a,00002527,0.000,0.000,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...,2527a,0.000
4,a,00002730,0.000,0.000,acroscopic#1,facing or on the side toward the apex,2730a,0.000
...,...,...,...,...,...,...,...,...
117654,v,02771756,0.000,0.000,run_dry#1 dry_out#2,"become empty of water; ""The river runs dry in ...",2771756v,0.000
117655,v,02771888,0.000,0.125,fog_up#1,"get foggy; ""The windshield fogged up""",2771888v,-0.125
117656,v,02771997,0.000,0.000,coal#1 char#1,"burn to charcoal; ""Without a drenching rain, t...",2771997v,0.000
117657,v,02772202,0.125,0.250,haze#1,"become hazy, dull, or cloudy",2772202v,-0.125


In [56]:
len(df_sentiwordnet.synset_offset.unique())

117659

In [57]:
pwn_sentiment_mapping = dict(
    zip(
        df_sentiwordnet['synset_offset'].values,
        df_sentiwordnet['sent_score'].values,
    )
)

pwn_sentiment_mapping

{'1740a': 0.125,
 '2098a': -0.75,
 '2312a': 0.0,
 '2527a': 0.0,
 '2730a': 0.0,
 '2843a': 0.0,
 '2956a': 0.0,
 '3131a': 0.0,
 '3356a': 0.0,
 '3553a': 0.0,
 '3700a': 0.25,
 '3829a': 0.25,
 '3939a': 0.0,
 '4171a': 0.0,
 '4296a': 0.0,
 '4413a': 0.0,
 '4615a': 0.0,
 '4723a': 0.0,
 '4817a': 0.0,
 '4980a': 0.0,
 '5107a': 0.5,
 '5205a': 0.5,
 '5473a': 0.75,
 '5599a': 0.0,
 '5718a': 0.125,
 '5839a': 0.375,
 '6032a': -0.25,
 '6245a': 0.0,
 '6336a': 0.0,
 '6777a': 0.375,
 '6885a': -0.75,
 '7096a': 0.0,
 '7208a': -0.125,
 '7331a': 0.0,
 '7516a': 0.0,
 '7697a': 0.0,
 '7813a': -0.5,
 '7990a': -0.5,
 '8206a': 0.0,
 '8443a': -0.25,
 '8595a': -0.25,
 '8734a': 0.5,
 '8877a': 0.25,
 '9046a': 0.0,
 '9346a': -0.625,
 '9618a': 0.25,
 '9978a': 0.0,
 '10385a': 0.0,
 '10537a': -0.5,
 '10726a': 0.0,
 '11160a': 0.0,
 '11327a': -0.125,
 '11665a': -0.25,
 '11757a': 0.0,
 '12071a': 0.375,
 '12362a': 0.125,
 '12689a': 0.0,
 '12932a': 0.0,
 '13160a': 0.375,
 '13442a': 0.0,
 '13662a': 0.0,
 '13887a': -0.25,
 '14358a':