In [1]:
import spacy
from itertools import chain, combinations, repeat
import pandas as pd
import numpy as np
import string
import json
import re


# Spacy language model setting
nlp = spacy.load('en_core_web_sm')

# Target directory
DIR = "OtherLists/"

# Load from:
aod_raw_file = DIR + 'adverbs_of_degree_raw.txt'
all_raw_file = DIR + 'all_emotions_no_classification_raw.txt'
btc_raw_file = DIR + 'big_theme_classification_raw.txt'
str_raw_file = DIR + 'strong_words_raw.txt'
ei3_raw_file = DIR + 'words_of_emotions_3_intensivities_raw.txt'

# Save to:
aod_clean_file = DIR + 'adverbs_of_degree_clean.json'
all_clean_file = DIR + 'all_emotions_no_classification_clean.json'
btc_clean_file = DIR + 'big_theme_classification_clean.json'
str_clean_file = DIR + 'strong_words_clean.json'
ei3_clean_file = DIR + 'words_of_emotions_3_intensivities_clean.json'

# Group save as CSV matrice for adnotations and JSON for easy generator access:
deep_feelining_matrix_save_file = DIR + 'deep_feeling.csv'
deep_feelining_access_save_file = DIR + 'deep_feeling.json'

def save_as_json(path, data):
    with open(path, "w") as f:
        json.dump(data, f, indent=4)

In [2]:
# 1. I need to prepare Textfile of adverbs of degree 
# ----------------------------------------------------------
# it should be usefull for generating texts with action vibe 
# (source: https://www.englishclub.com/vocabulary/adverbs-degree.htm)


# Load file
lines_degree = open(aod_raw_file, "r").readlines()    

# Actual preparation... Removed '\n', '*', few lines, and showed em way to dict...
aod_dict = {'AOD' : [line.split('\n')[0].split('*')[0] for line in lines_degree[3:]]}

save_as_json(aod_clean_file, aod_dict)


aod_dict['AOD'][:10]

['almost',
 'absolutely',
 'awfully',
 'badly',
 'barely',
 'completely',
 'decidedly',
 'deeply',
 'enough',
 'enormously']

In [3]:
# 2. Lots of unclassified emotional state related words
# -------------------------------------------------------------------------------
# (source: https://www.verywellfamily.com/feelings-words-from-a-to-z-2086647 )


lines_emotions = open(all_raw_file, "r").readlines() 

# Cleaned up and chained
emotions_all = [line.lower().split('\n')[0].split(', ') for line in lines_emotions if not len(line) < 5]
alle = { 'AllE' : list(chain.from_iterable(emotions_all)),}

save_as_json(all_clean_file, alle)


alle['AllE'][:10]

['angry',
 'annoyed',
 'afraid',
 'awkward',
 'affectionate',
 'anxious',
 'alarmed',
 'awed',
 'aggravated',
 'amazed']

In [4]:
# 3. I need to map this big messy ctr-copy/paste txt from website
# ---------------------------------------------------------------
# I need it to generate specific types of emotions indicators in texts 
# for NLP project. I need to map categories, subcategories, paragraphs 
# and keywords right. I dont think i need to use scraper for this one
# (source: http://www.derose.net/steve/resources/emotionwords/ewords.html)



btc = open(btc_raw_file, "r").readlines()


# get & clear lines
btc_lines = [line.split('\n')[0] for line in btc if len(line) > 1]

# get main and paragraph titles
titles_and_lines = [ [l,"#",btc_lines[n+1] ] for n,l in enumerate(btc_lines) if l.split()[0].istitle()]

# Main titles items & only main titles
main_titles_items = [wlist for wlist in titles_and_lines if len(wlist[2]) < 24]
main_titles_only = [wlist[0] for wlist in titles_and_lines if len(wlist[2]) < 24]


si = []
n = -1
for fragment in titles_and_lines:
    # Tactic: prepare moving index to join the main titles 
    # with correspon titles and with paragraphs
    n+=1
    if fragment[0] in main_titles_only:
        si.append(n)
moving_index = list(zip(si[:], (si[1:]+[100])))


bte = {}
for i1 ,i2 in moving_index:
    # Now most the importan part, to get text sequences right
    # and map titles, sections, keywords together with no mistake
    section = titles_and_lines[i1:i2]
    for i, paragraph in enumerate(section):
        if i is 0:
            
            # seting key names as [:4] of each word in title
            title_section = "".join(paragraph[0].title().split(' ')).split('/')
            title_section = "".join([w[:4] for w in title_section])
            
            # Here comes the main container
            bte[title_section] = {}
            
        else:
            # Remove one one unneeded category i catched by appending idx 100 to moving_index (quick guess)
            if paragraph[0] in ['Unsorted']:
                continue
            
            # I might need to shorten-up acapit names also in future.
            acapit_title = "".join(paragraph[0].title().split(' '))
            
            # Preprocess some words with accidental comas, dots
            pattern = re.compile('[\W_]+')
            acapit_text = [pattern.sub('', word) for word in paragraph[2].split(', ')]
            
            bte[title_section][acapit_title] = acapit_text


save_as_json(btc_clean_file, bte)

display(bte['DireFocu'])
display(bte['SafeSecu']['Fearless'][:4])

{'Derailed': ['derailed', 'disjointed', 'disoriented', 'torn'],
 'Focused': ['committed', 'complacent', 'determined', 'focused', 'inthezone'],
 'Lost': ['baffled', 'bewildered', 'confused', 'lost', 'unfocussed'],
 'Obsessed': ['compelled', 'consumed', 'obsessed']}

['audacious', 'bold', 'brave', 'certain']

In [5]:
# 4. Here is a 'strong words' list. Looks good for building for intensivity
# ----------------------------------------------------
# I just got them from a website by copy pasta again, there seem to be 
# very easy pattern (each second line or so) (checked :D yup)
# (source: https://www.vocabulary.com/lists/152158 )


strong_words = open(str_raw_file, "r").readlines()

str_definitions = [re.compile('[\W_]+').sub(' ', word).strip() for word in strong_words[1::2]]

sw = {  'StrW'  : [re.compile('[\W_]+').sub('', word) for word in strong_words[::2]],
        'explnr': {},}

for a, b in list(zip(sw['StrW'], str_definitions)):
    sw['explnr'][a] = b


save_as_json(str_clean_file, sw)

# Ready
#display(sw['StrW'][125:130])
for each in sw['StrW'][125:130]:
    print("{:<15}{}".format(each, sw['explnr'][each]))


peevish        easily irritated or annoyed
splenetic      of or relating to the spleen
cautious       showing careful forethought
discreet       marked by prudence or modesty and wise self restraint
provident      giving something useful for the future


In [2]:
# 5. Emotion words divided by 3 levels of intensiviy, This should be preety good
# ------------------------------------------------------------------------------

ei = open(ei3_raw_file, "r").readlines()
            
lines = [item.split('\n')[0] for item in ei if len(item) > 2]

ei3 = {}
test1 = [] # For validation
for i, line in enumerate(lines):
    if not line[-1:].isalpha():
        category = line
        words = lines[i+1]
        ei3[category] = [word.strip() for word in words.lower().split(' ~ ')]

        # For testing purposes:
        test1.append((category, len(ei3[category]))) # category name / nr of items in category 


# Sanity check: are words-in & out of equal amounts?
test2 = [  (len("".join(l.split('\n')[0]).split(' ~ ')),  # words num check 1
           l.count(' ~ ')+1)                              # words num check 2
           for l in ei if l.count(' ~ ') > 0]
# Running 3 all tests at once, also in respect to the previous:   num check 3
for t1, t2 in list(zip(test2,test1)): 
    assert t1[0] == t1[0] == t2[1]  


save_as_json(ei3_clean_file, ei3)

ei3['Happiness3'][:10]

['awe-filled',
 'blissful',
 'ecstatic',
 'egocentric',
 'elated',
 'enthralled',
 'euphoric',
 'exhilarated',
 'giddy',
 'jubilant']

In [182]:
# Deep feeling JSON/dict
# ----------------------
# For quick use during generation

DFD = {**sw, **aod_dict, **alle, **ei3}
for k, v in bte.items():
    DFD[k] = v
    for kk, vv in bte[k].items():
        DFD["_"+kk] = vv

save_as_json(deep_feelining_access_save_file, DFD)


# Deep feeling CSV/pd matrice
# ---------------------------
# For matching in relations


DF_map = {
#   source      dict      specific category (column) name(s) 
#   ------      ----      ----------------------------------
    'AllE' :    (alle,     []),
    'StrW' :    (sw,       []),
    'AOD'  :    (aod_dict, []),
    'EI3'  :    (ei3,      ['IntensityGrp', 'IntensityRank']),
    'BTE'  :    (bte,      ['PolarGrp', 'PolarType']),}

base_cols = ["Entry", "Source", 'IsEmotion', 'IsStrong', 'DegreeType']
columns_order = [ 'Entry', 'lemma', 'pos', 'tag', 'Source','IsEmotion', 'IsStrong', 'DegreeType'
                 , 'PolarGrp', 'PolarType','IntensityGrp', 'IntensityRank' ]

DATAFRAMES = []
for name, data in DF_map.items():
    if name is 'BTE':
        category_words = []
        for k in data[0].keys():
            for kk, words in data[0][k].items():
                category_words.append( zip([w for w in words], repeat(name), repeat(1), repeat(0), repeat(0), repeat(k), repeat("_"+kk)) )
        rows = list( chain.from_iterable( category_words))
    
    elif name is 'EI3':
        rows = list( chain.from_iterable( [ zip([w for w in data[0][k]], repeat(name), repeat(1), repeat(0), repeat(0),repeat(k), repeat(k[-1])) 
                                     for k in data[0].keys()] ))
    elif name is 'AllE':
        rows = list(zip( data[0][name], repeat(name), repeat(1), repeat(0), repeat(0) ))
    
    elif name is 'StrW':
        rows = list(zip( data[0][name], repeat(name), repeat(0), repeat(1), repeat(0) ))
    
    elif name is 'AOD':
        rows = list(zip( data[0][name], repeat(name), repeat(0), repeat(0), repeat(1) ))
    print(name)
    
    DF = pd.DataFrame(rows, columns=[*base_cols, *data[1]])
    DATAFRAMES.append(DF)

# Unification
aggregation_functions = { 'IsEmotion': 'sum', 'IsStrong': 'sum', 'DegreeType': 'sum',
                          'PolarGrp':'any', 'PolarType':'any', 
                          'IntensityGrp':'any', 'IntensityRank':'any'}
type_cols = ['IsEmotion','IsStrong','DegreeType']
grp_cols = ['PolarGrp', 'PolarType','IntensityGrp']

DFDF = pd.concat(DATAFRAMES)
DFDF = DFDF.groupby(DFDF['Entry']).agg(aggregation_functions)
DFDF[type_cols] = DFDF[type_cols].applymap(lambda x: 1 if x > 0 else 0)
DFDF[grp_cols] = DFDF[grp_cols].applymap(lambda x: x if x else '')
DFDF['IntensityRank'] = DFDF.loc[:,'IntensityRank'].apply(lambda x: x if x else 0)
DFDF['Source'] = 'elists'
DFDF['Entry'] = DFDF.index.values.tolist()

# Tags
lemma = []
pos = []
tag = []
for doc in nlp.pipe(DFDF.Entry.values, batch_size=50, n_threads=4):
    if doc.is_parsed:
        lemma.append(" ".join([n.lemma_ for n in doc]))
        pos.append(" ".join([n.tag_ for n in doc]))
        tag.append(" ".join([n.pos_ for n in doc]))
    else:
        lemma.append(None)
        pos.append(None)
        tag.append(None)
DFDF['lemma'] = lemma
DFDF['pos'] = pos
DFDF['tag'] = tag
DFDF = DFDF[columns_order]
DFDF.index = list(range(len(DFDF)))
# Check
display(DFDF[:5])

display(DFDF.shape)
display(DFDF.describe())

#DFDF[DFDF['Entry'].str.isalpha()!=True]

# Save
DFDF.to_csv(deep_feelining_matrix_save_file, header=columns_order)

AllE
StrW
AOD
EI3
BTE


Unnamed: 0,Entry,lemma,pos,tag,Source,IsEmotion,PolarGrp,PolarType,IsStrong,DegreeType,IntensityGrp,IntensityRank
0,abandoned,abandon,VBN,VERB,elists,1,Atta,_Hated,0,0,,0
1,abashed,abash,VBD,VERB,elists,1,,,0,0,Shame1,1
2,abhorrence,abhorrence,NN,NOUN,elists,0,,,1,0,,0
3,abject,abject,JJ,ADJ,elists,0,,,1,0,,0
4,able,able,JJ,ADJ,elists,1,,,0,0,,0


(1376, 12)

Unnamed: 0,IsEmotion,IsStrong,DegreeType
count,1376.0,1376.0,1376.0
mean,0.594477,0.412791,0.037064
std,0.491172,0.492515,0.188987
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,1.0,0.0,0.0
75%,1.0,1.0,0.0
max,1.0,1.0,1.0
