# NRC Preprocess

## Lexicon

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
lexicon_file = '../NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

# With pandas' default NA handling, a term spelled like an NA marker (e.g. the
# word "null") is parsed as NaN. Turning the default markers off keeps every
# term exactly as written in the file, so no frame-wide fillna is needed
# afterwards (which would also have papered over genuinely missing flags).
lexicon_df = pd.read_table(lexicon_file, keep_default_na=False)

len(lexicon_df)

141820

In [3]:
# Peek at the first rows: the table is in long format, one row per
# (term, AffectCategory) pair with its AssociationFlag.
lexicon_df.head(11)

Unnamed: 0,term,AffectCategory,AssociationFlag
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
5,aback,negative,0
6,aback,positive,0
7,aback,sadness,0
8,aback,surprise,0
9,aback,trust,0


In [4]:
# Take every 10th entry of the 'term' column, one per complete group of 10
# rows. Assumes each term occupies a contiguous run of 10 rows in the file;
# a trailing partial group is ignored.
term_column = lexicon_df['term'].tolist()
n_complete_groups = len(term_column) // 10
terms = term_column[:n_complete_groups * 10:10]

len(terms)

14182

In [5]:
# Category labels, read from the rows of the first term and sorted alphabetically.
first_term_rows = lexicon_df['term'] == terms[0]
categories = sorted(lexicon_df.loc[first_term_rows, 'AffectCategory'])

In [6]:
# Display the category labels; their order defines the slot order of each term vector.
categories

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'negative',
 'positive',
 'sadness',
 'surprise',
 'trust']

In [7]:
# Flat list of association flags, one per row of the lexicon table, in file order.
flag_column = lexicon_df['AssociationFlag']
flags = flag_column.tolist()

len(flags)

141820

In [8]:
# Map each term to its vector of association flags, ordered like `categories`.
# Pivoting on the category label (rather than slicing the flat flag list into
# fixed chunks of 10) ties every flag to its category by name, so the vectors
# stay aligned with `categories` even if the file's row order differs from the
# sorted label order. pivot() also raises if a (term, category) pair repeats.
flag_matrix = (
    lexicon_df
    .pivot(index='term', columns='AffectCategory', values='AssociationFlag')
    .reindex(index=terms, columns=categories)  # keep term order and slot order explicit
)

# Copy each row so every term owns an independent array.
term2arr = {
    term: row.copy()
    for term, row in zip(flag_matrix.index, flag_matrix.to_numpy())
}

In [9]:
# Sanity check: number of terms that received a vector.
len(term2arr)

14182

In [10]:
# Preview only the first entries; echoing the whole dictionary prints one
# array per term and floods the notebook output.
dict(list(term2arr.items())[:20])

{'aback': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abacus': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 'abandon': array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0]),
 'abandoned': array([1, 0, 0, 1, 0, 1, 0, 1, 0, 0]),
 'abandonment': array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0]),
 'abate': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abatement': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abba': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'abbot': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
 'abbreviate': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abbreviation': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abdomen': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abdominal': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abduction': array([0, 0, 0, 1, 0, 1, 0, 1, 1, 0]),
 'aberrant': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 'aberration': array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0]),
 'abeyance': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'abhor': array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0]),
 'abhorrent': array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0]),
 'abide': array(

In [11]:
# Spot-check the vector of a single term.
term2arr['cheap']

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [12]:
# Persist the category labels together with the term -> flag-vector mapping
# as a single pickled tuple.
with open('preprocess-lexicon.pkl', 'wb') as pickle_out:
    pickle.dump((categories, term2arr), pickle_out)

## Intensity

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
intensity_file = '../NRC-Emotion-Intensity-Lexicon-v1.txt'

# keep_default_na=False stops pandas from turning words that look like NA
# markers (e.g. "null", "nan") into NaN, which would otherwise end up as a
# NaN key in the word -> vector mapping built from this frame.
intensify_df = pd.read_table(intensity_file, keep_default_na=False)
len(intensify_df)

9921

In [3]:
# Compare the row count with the number of distinct words: a word appears
# once per emotion it has a score for.
words = intensify_df['word'].tolist()
n_rows, n_distinct = len(words), len(set(words))
n_rows, n_distinct

(9921, 5975)

In [4]:
# Summary of the 'word' column: total count, distinct count and most frequent word.
intensify_df['word'].describe()

count      9921
unique     5975
top       treat
freq          8
Name: word, dtype: object

In [5]:
# De-duplicate while keeping first-appearance order. list(set(...)) returns
# the same elements but in an arbitrary order that can change between kernel
# restarts, which makes re-runs non-reproducible.
terms = list(dict.fromkeys(words))
len(terms)

5975

In [6]:
# Distinct emotion labels found in the file, sorted alphabetically.
categories = sorted(set(intensify_df['emotion'].tolist()))

categories

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise',
 'trust']

In [7]:
# Position of each emotion label inside a score vector.
categories2index = {label: position for position, label in enumerate(categories)}
categories2index

{'anger': 0,
 'anticipation': 1,
 'disgust': 2,
 'fear': 3,
 'joy': 4,
 'sadness': 5,
 'surprise': 6,
 'trust': 7}

In [8]:
# Peek at the first rows: one row per (word, emotion) pair with its intensity score.
intensify_df.head()

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.94
4,terrorize,anger,0.939


In [9]:
terms = intensify_df['word'].tolist()
emotions = intensify_df['emotion'].tolist()
scores = intensify_df['emotion-intensity-score'].tolist()

# Build one score vector per word. Slot k holds the score for categories[k];
# slots for emotions the file never lists for that word stay at 0.
terms2arr = dict()
n_categories = len(categories)

for word, emotion, score in zip(terms, emotions, scores):
    slot = categories2index[emotion]
    if word not in terms2arr:
        # First row seen for this word: start from an all-zero vector.
        terms2arr[word] = np.zeros(n_categories)
    terms2arr[word][slot] = score

In [10]:
# Preview only the first entries; echoing the whole dictionary prints one
# array per word and floods the notebook output.
dict(list(terms2arr.items())[:20])

{'outraged': array([0.964, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'brutality': array([0.959, 0.   , 0.   , 0.922, 0.   , 0.   , 0.   , 0.   ]),
 'hatred': array([0.953, 0.   , 0.68 , 0.703, 0.   , 0.641, 0.   , 0.   ]),
 'hateful': array([0.94 , 0.   , 0.703, 0.578, 0.   , 0.575, 0.   , 0.   ]),
 'terrorize': array([0.939, 0.   , 0.   , 0.922, 0.   , 0.781, 0.   , 0.   ]),
 'infuriated': array([0.938, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'violently': array([0.938, 0.   , 0.609, 0.828, 0.   , 0.719, 0.   , 0.   ]),
 'furious': array([0.929, 0.   , 0.469, 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'enraged': array([0.927, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'furiously': array([0.927, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'screwyou': array([0.924, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]),
 'murderer': array([0.922, 0.   , 0.82 , 0.953, 0.   , 0.877, 0.   , 0.   ]),
 'fury': array([0.922, 0.   , 0.   , 0.672, 0.   , 0.406, 0.   

In [11]:
# Spot-check: list every row recorded for a single word.
intensify_df[intensify_df['word']=='beauty']

Unnamed: 0,word,emotion,emotion-intensity-score
5600,beauty,joy,0.621


In [13]:
# Persist the emotion labels together with the word -> score-vector mapping
# as a single pickled tuple.
with open('preprocess-intensity.pkl', 'wb') as pickle_out:
    pickle.dump((categories, terms2arr), pickle_out)