In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import wordnet

## Synonyms with Thesaurus API

In [2]:
# Sector names that are looked up through the dictionary API.
keys = [
    "energy",
    "materials",
    "industrials",
    "health care",
    "financials",
    "information technology",
    "utilities",
    "real estate",
]
# Sector names that are deliberately left out of the API lookup.
no_keys = [
    "consumer discretionary",
    "consumer staples",
    "communication services",
]

In [3]:
# Fetch the Merriam-Webster entry list for every sector keyword and store it
# under that keyword in `master_list`.
#
# The API key is read from the environment so that it is not committed with the
# notebook; a key that was previously hard-coded here should be treated as
# leaked and rotated.
# NOTE(review): the section title mentions the Thesaurus API, but this URL
# targets the Collegiate dictionary reference — confirm which one is intended.
API_URL = "https://www.dictionaryapi.com/api/v3/references/collegiate/json/{word}"
api_key = os.environ.get("DICTIONARYAPI_KEY")
if not api_key:
    raise RuntimeError(
        "Set the DICTIONARYAPI_KEY environment variable to your Merriam-Webster API key"
    )

master_list = {}
for word in keys:
    # `params` lets requests build and escape the query string; the timeout
    # keeps a stalled connection from hanging the whole notebook run.
    response = requests.get(API_URL.format(word=word), params={"key": api_key}, timeout=10)
    # Fail loudly on HTTP errors instead of storing an error payload.
    response.raise_for_status()
    master_list[word] = response.json()

master_list.keys()

dict_keys(['energy', 'materials', 'industrials', 'health care', 'financials', 'information technology', 'utilities', 'real estate'])

### Problems with some words

The terms *consumer discretionary*, *consumer staples*, and *communication services* **do not work**:

-> they do not exist in the queried reference, so the JSON response is a list of suggested words instead of a list of entries.

Definition of Consumer Discretionary:
- goods that are non-essential but desirable when consumers' income is sufficient to purchase them

Definition of Consumer Staples:
- goods that are essential

Definition of Communication Services:
- telecommunications services, cable services, video services, or information services

-> Solution for now: leave these three terms out of `keys` (they are listed in `no_keys` instead)

In [9]:
# Inspect the first entry returned for "real estate".
# (consumer discretionary, consumer staples and communication services are not
# among the keys of master_list.)
master_list["real estate"][0]

{'meta': {'id': 'real estate',
  'uuid': '4480a3e1-34ea-427a-9b28-6a59bece7c65',
  'sort': '180749000',
  'src': 'collegiate',
  'section': 'alpha',
  'stems': ['real estate', 'real estates'],
  'offensive': False},
 'hwi': {'hw': 'real estate'},
 'fl': 'noun',
 'def': [{'sseq': [[['sense',
      {'sn': '1', 'dt': [['text', '{bc}property in buildings and land']]}]],
    [['sense',
      {'sn': '2',
       'dt': [['text', '{bc}{sx|space||}, {sx|capacity||} '],
        ['vis',
         [{'t': 'her desktop {it}real estate{/it}'},
          {'t': 'the limited {it}real estate{/it} on hard drives',
           'aq': {'auth': 'Leonard Wiener'}}]]]}]]]}],
 'date': 'circa 1642{ds||1||}',
 'shortdef': ['property in buildings and land', 'space, capacity']}

In [11]:
# check that each word within master_list contains the "syns"
# For every keyword, collect the field names of its first API entry so that
# the presence of "syns" can be checked by eye in the output.
keys_within_keys = [entries[0].keys() for entries in master_list.values()]
keys_within_keys

[dict_keys(['meta', 'hwi', 'fl', 'ins', 'def', 'syns', 'et', 'date', 'shortdef']),
 dict_keys(['meta', 'hom', 'hwi', 'fl', 'def', 'et', 'date', 'shortdef']),
 dict_keys(['meta', 'hom', 'hwi', 'fl', 'def', 'date', 'shortdef']),
 dict_keys(['meta', 'hwi', 'vrs', 'fl', 'def', 'date', 'shortdef']),
 dict_keys(['meta', 'hwi', 'fl', 'def', 'date', 'shortdef']),
 dict_keys(['meta', 'hwi', 'fl', 'def', 'date', 'shortdef']),
 dict_keys(['meta', 'hom', 'hwi', 'fl', 'ins', 'def', 'et', 'date', 'shortdef']),
 dict_keys(['meta', 'hwi', 'fl', 'def', 'date', 'shortdef'])]

## Synonyms with NLP

In [None]:
def get_synonyms(phr):
    """Return `phr` together with its WordNet synonyms.

    Parameters
    ----------
    phr : str
        A word or a space-separated multi-word phrase.

    Returns
    -------
    list of str
        Sorted, de-duplicated, lower-cased phrases. The lower-cased input is
        always included, even when WordNet has no entry for it. Multi-word
        lemmas are returned with spaces instead of WordNet's underscores so
        they can be compared against ordinary text.
    """
    normalized = phr.lower()
    # Seed the result with the phrase itself so it is never lost.
    synonyms = {normalized}
    # WordNet stores multi-word lemmas with underscores ("real_estate"), so the
    # query is converted to that form and the lemma names are converted back.
    for syn in wordnet.synsets(normalized.replace(" ", "_")):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower().replace("_", " "))

    # Sorting makes the output reproducible between runs.
    return sorted(synonyms)

In [None]:
# Quick manual check of the helper on a single word.
get_synonyms("agree")

## Algorithm

In [None]:
# Load the CSV file (presumably a Scopus export, judging by the file name)
# and list its columns.
df = pd.read_csv("scopus_2k.csv")  # on_bad_lines='skip' is left disabled here
df.columns

In [None]:
# Lower-case the four free-text columns, one after another.
for text_col in ('Abstract', 'Title', 'Author Keywords', 'Index Keywords'):
    df[text_col] = df[text_col].str.lower()

In [None]:
# Summary of the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Replace every missing value, in every column, with the literal string "blank".
# NOTE(review): "blank" then becomes part of any text assembled from these
# columns — confirm that this sentinel is intended rather than an empty string.
df = df.fillna("blank")

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
# Remove ';', ',', ')' and '(' from each text column, chaining the four
# replacements inside a single pass over the values.
for col in ['Title', 'Abstract', 'Author Keywords', 'Index Keywords']:
    df[col] = df[col].map(
        lambda text: text.replace(';', '').replace(',', '').replace(')', '').replace('(', '')
    )

In [None]:
# Combine the four text fields into one searchable string per row.
# The fields are joined with a single space: without a separator the last word
# of one field is fused with the first word of the next and neither can be
# found once the text is split into tokens.
df['content_sum'] = (
    df['Abstract'] + ' ' + df['Title'] + ' ' + df['Author Keywords'] + ' ' + df['Index Keywords']
)
# Show the tokens of the first row as a sanity check.
df['content_sum'].values[0].split()

In [None]:
# Keep a copy of the frame as it is at this point, before further columns are added to df.
df_copy = df.copy()

In [None]:
# Full list of sector keywords to count in the text, one result column per entry.
key_words = [
    "energy",
    "materials",
    "industrials",
    "consumer discretionary",
    "consumer staples",
    "health care",
    "financials",
    "information technology",
    "communication services",
    "utilities",
    "real estate",
]

In [None]:
# For every row, count how many distinct phrases (the sector keyword itself
# plus its synonyms) occur in the row's combined text; one column per keyword.
#
# Whitespace is normalised once per row and the text is padded with spaces, so
# " phrase " matches only on whole-word boundaries. For single words this is
# the same test as membership in text.split(), and it also lets multi-word
# phrases such as "health care" match.
padded_text = df['content_sum'].map(lambda text: ' ' + ' '.join(text.split()) + ' ')

for key in key_words:
    # WordNet lemma names use underscores for multi-word phrases while the
    # text uses spaces; the keyword itself is always part of the search set.
    phrases = {key} | {phr.replace('_', ' ') for phr in get_synonyms(key)}
    # Assign the whole column at once. The previous element-wise chained
    # assignment (df[key][i] += 1) writes to a temporary object and is not
    # guaranteed to update df (it does not under pandas Copy-on-Write).
    df[key] = padded_text.map(lambda text: sum(f" {phr} " in text for phr in phrases))

In [None]:
# Display the frame, now including the per-keyword count columns.
df