In [1]:
import requests
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import wordnet

## Synonyms with Thesaurus API

In [2]:
def get_synonyms_api(key_word):
    url = "https://www.dictionaryapi.com/api/v3/references/thesaurus/json/"
    key= "?key=bc125025-b3b8-4b13-b8e8-c1ef37845bad"
    syn_json = requests.get(url+key_word.lower()+key).json()
    syn_list = syn_json[0]['meta']['syns']
    if len(syn_list) > 1:
        master = sum(syn_list, [])
    elif len(syn_list) == 1:
        master = syn_list[0]
    master.append(key_word)
    return master

### Problems with some words

consumer discretionary, consumer staples, communication services (DO NOT WORK)

    -> they do not exist within the Thesaurus, therefore the JSON returns a list with possible words
Definition of Consumer Discretionary:
- goods that are non-essential but desirable if their income is sufficient to purchase them

Definition of Consumer Staples:
- goods that are essential

Definition of Communication Services:
- elecommunications Services, Cable Services, Video Services, or Information Services

-> Solution for now: remove those words from here

In [3]:
df = pd.read_csv("master_taxonomy.csv", index_col=0)#, on_bad_lines='skip')
df.columns

Index(['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords'],
      dtype='object')

In [4]:
df['Abstract'] = df['Abstract'].str.lower()
df['Title'] = df['Title'].str.lower()
df['Author Keywords'] = df['Author Keywords'].str.lower()
df['Index Keywords'] = df['Index Keywords'].str.lower()

In [5]:
df

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords
0,"Abadía J.J.P., Fritz H., Dadoulis G., Dragos K...",automated decision making in structural health...,,https://www.scopus.com/inward/record.uri?eid=2...,the need for processing large amounts of data ...,,artificial intelligence; damage detection; dec...
1,"Abbass H.A., Hunjet R.A.",smart shepherding: towards transparent artific...,10.1007/978-3-030-60898-9_1,https://www.scopus.com/inward/record.uri?eid=2...,the aim of this chapter is to uncover the beau...,explainable artificial intelligence; interpret...,
2,"Abdollahi A., Pradhan B.",urban vegetation mapping from aerial imagery u...,10.3390/s21144738,https://www.scopus.com/inward/record.uri?eid=2...,urban vegetation mapping is critical in many a...,deep neural network; remote sensing; shap; veg...,aerial photography; antennas; biodiversity; de...
3,"Abdul A., Von Der Weth C., Kankanhalli M., Lim...",cogam: measuring and moderating cognitive load...,10.1145/3313831.3376615,https://www.scopus.com/inward/record.uri?eid=2...,interpretable machine learning models trade -o...,cognitive load; explainable artificial intelli...,computation theory; economic and social effect...
4,"Abe T., Furukawa R., Iwasaki Y., Ikemura T.",time-series trend of pandemic sars-cov-2 varia...,10.5334/dsj-2021-029,https://www.scopus.com/inward/record.uri?eid=2...,to confront the global threat of coronavirus d...,batch-learning self-organizing map (blsom); co...,conformal mapping; diseases; genes; machine le...
...,...,...,...,...,...,...,...
1414,[No author name available],icmlsc 2021 - proceedings of the 2021 5th inte...,,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 32 papers. the topics ...,,
1415,[No author name available],2021 ieee 29th international conference on net...,,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 75 papers. the topics ...,,
1416,[No author name available],10th international conference on computational...,,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 33 papers. the special...,,
1421,[No author name available],proceedings of the 2nd international conferenc...,,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 21 papers. the topics ...,,


In [6]:
df = df.fillna("blank")

In [7]:
non_alp = [';', ',', '(', ')', '[', ']', '.']
for i in non_alp:
    for col in ['Title','Abstract', 'Author Keywords', 'Index Keywords']:
        df[col] = df[col].map(lambda x: x.replace(i,''))

In [8]:
df['content_sum'] = df['Abstract'] + df['Title'] + df['Author Keywords'] + df['Index Keywords']
df["content_sum"] = df["content_sum"].str.lower()
#df['content_sum'].values[0].split()

In [17]:
keys = ["energy", "materials", "industrial", "financial",  "utilities"]
no_keys = [ "consumer discretionary", "consumer staples", "information technology", "communication services", "healthcare", "real estate"]

In [19]:
# Saving into a dictionary all of the syns
keys_dict = {}
for i in keys:
    keys_dict[i] = get_synonyms_api(i)
 
#keys_dict

In [20]:
keys_dict["consumer discretionary"] = ["non-essential goods", "non essential goods", "non-essential products", "non essential products", "unrestricted goods", "unrestricted products", "nonobligatory goods", "nonobligatory products"]
keys_dict["consumer staples"] = ["essential goods", "essential goods", "essential products", "essential products", "restricted goods", "restricted products", "obligatory goods", "obligatory products"]
keys_dict["information technology"] = ["products", "heloo"]
keys_dict["communication services"] = ["products", "heloo"]
keys_dict["healthcare"] = ["products", "heloo"]
keys_dict["real estate"] = ["products", "heloo"]

In [21]:
keys_dict

{'energy': ['aura',
  'chi',
  'ki',
  'vibe(s)',
  'vibration(s)',
  'beans',
  'bounce',
  'brio',
  'dash',
  'drive',
  'dynamism',
  'esprit',
  'gas',
  'get-up-and-go',
  'ginger',
  'go',
  'gusto',
  'hardihood',
  'juice',
  'life',
  'moxie',
  'oomph',
  'pep',
  'punch',
  'sap',
  'snap',
  'starch',
  'verve',
  'vigor',
  'vim',
  'vinegar',
  'vitality',
  'zing',
  'zip',
  'fuel',
  'power',
  'firepower',
  'force',
  'horsepower',
  'might',
  'muscle',
  'potence',
  'potency',
  'power',
  'puissance',
  'sinew',
  'strength',
  'vigor',
  'energy'],
 'materials': ['accoutrements',
  'apparatus',
  'equipment',
  'gear',
  'hardware',
  'kit',
  'matériel',
  'outfit',
  'paraphernalia',
  'stuff',
  'tackle',
  'materials'],
 'industrial': ['mechanical',
  'fabricated',
  'manufactured',
  'cultivated',
  'processed',
  'refined',
  'artificial',
  'man-made',
  'nonnatural',
  'synthetic',
  'ersatz',
  'faux',
  'imitation',
  'industrial'],
 'financial': ['do

In [None]:
for key in key_words:
    df[key] = 0
    for phr in get_synonyms_api(key):
        for i in range(len(df)): 
            if phr in (df['content_sum'].values[i].split()):
                df[key][i] += 1
    #df.loc[key, i] = count
                
df

In [33]:
x = df["content_sum"].values[1].split()
x#1]

['the',
 'aim',
 'of',
 'this',
 'chapter',
 'is',
 'to',
 'uncover',
 'the',
 'beauty',
 'and',
 'complexity',
 'in',
 'the',
 'world',
 'of',
 'shepherding',
 'as',
 'we',
 'view',
 'it',
 'through',
 'the',
 'lens',
 'of',
 'artificial',
 'intelligence',
 'ai',
 'and',
 'autonomous',
 'systems',
 'as',
 'in',
 'the',
 'pursuit',
 'of',
 'imitating',
 'human',
 'intelligence',
 'ai',
 'researchers',
 'have',
 'made',
 'significant',
 'and',
 'vast',
 'contributions',
 'over',
 'decades',
 'yet',
 'even',
 'with',
 'such',
 'interest',
 'and',
 'activity',
 'from',
 'within',
 'industry',
 'and',
 'the',
 'academic',
 'community',
 'general',
 'ai',
 'remains',
 'out',
 'of',
 'our',
 'reach',
 'by',
 'comparison',
 'this',
 'book',
 'aims',
 'for',
 'a',
 'less',
 'ambitious',
 'goal',
 'in',
 'trying',
 'to',
 'recreate',
 'the',
 'intelligence',
 'of',
 'a',
 'sheepdog',
 'as',
 'our',
 'efforts',
 'display',
 'even',
 'with',
 'this',
 'seemingly',
 'modest',
 'goal',
 'there',
 '

In [34]:
count1 = 0
count2 = 1

master_list = []

while count2 < (len(df) - 1):
    val = df["content_sum"].values[count1].split()
    for i in range(len(val)):
        print(count1)
        x = val[count1]
        y = val[count2]
        master_list.append(x + " " + y) #+ " " + val[count2])
        count1 += 1
        count2 += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204


IndexError: list index out of range

In [46]:
for m in range(len(df)):
    df['master_content_sum'] = " "
    for i in df['content_sum'].values[m].split():
        for j in df['content_sum'].values[m].split():
            df.iloc[m, -1] = [i + " " + j]
            #df['master_content_sum'][m].append()

In [None]:
for key in key_words:
    df[key] = 0
    for phr in get_synonyms(key):
        for i in range(len(df)): 
            if phr in (df['content_sum'].values[i].split()):
                df[key][i] += 1
    #df.loc[key, i] = count