In [1]:
import requests
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kalebalemayehu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Synonyms with Thesaurus API

In [2]:
def get_synonyms_api(key_word):
    url = "https://www.dictionaryapi.com/api/v3/references/thesaurus/json/"
    key= "?key=bc125025-b3b8-4b13-b8e8-c1ef37845bad"
    syn_json = requests.get(url+key_word.lower()+key).json()
    syn_list = syn_json[0]['meta']['syns']
    if len(syn_list) > 1:
        master = sum(syn_list, [])
    elif len(syn_list) == 1:
        master = syn_list[0]
    master.append(key_word)
    return master

### Problems with some words

consumer discretionary, consumer staples, communication services (DO NOT WORK)

    -> they do not exist within the Thesaurus, therefore the JSON returns a list with possible words
Definition of Consumer Discretionary:
- goods that are non-essential but desirable if their income is sufficient to purchase them

Definition of Consumer Staples:
- goods that are essential

Definition of Communication Services:
- elecommunications Services, Cable Services, Video Services, or Information Services

-> Solution for now: remove those words from here

In [3]:
df = pd.read_csv("master_taxonomy.csv", index_col=0)#, on_bad_lines='skip')
df = df.reset_index(drop=True)
df.columns

Index(['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords'],
      dtype='object')

In [4]:
df['Abstract'] = df['Abstract'].str.lower()
df['Title'] = df['Title'].str.lower()
df['Author Keywords'] = df['Author Keywords'].str.lower()
df['Index Keywords'] = df['Index Keywords'].str.lower()

In [5]:
df = df.fillna("blank")

In [6]:
non_alp = [';', ',', '(', ')', '[', ']', '.']
for i in non_alp:
    for col in ['Title','Abstract', 'Author Keywords', 'Index Keywords']:
        df[col] = df[col].map(lambda x: x.replace(i,''))

In [26]:
#df['content_sum'] = df['Abstract'] + df['Title'] + df['Author Keywords'] + df['Index Keywords']
#df["content_sum"] = df["content_sum"].str.lower()
stop = stopwords.words('english')
for i in [df['Abstract'],df['Title'], df['Author Keywords'], df['Index Keywords']]:
df["stripped_content"] = df['content_sum'].apply(lambda x: " ".join([word for word in x.split() if word not in (stop)]))

In [27]:
keys = ["energy", "materials", "industrial", "financial",  "utilities"]
no_keys = [ "consumer discretionary", "consumer staples", "information technology", "communication services", "healthcare", "real estate"]

In [30]:
# Saving into a dictionary all of the syns
keys_dict = {}
for i in keys:
    keys_dict[i] = get_synonyms_api(i)
 
#keys_dict

In [31]:

keys_dict["consumer discretionary"] = ["consumer discretionary","non-essential goods", "non essential goods", "non-essential products", "non essential products", "unrestricted goods", "unrestricted products", "nonobligatory goods", "nonobligatory products"]
keys_dict["consumer staples"] = ["essential goods", "essential goods", "essential products", "essential products", "restricted goods", "restricted products", "obligatory goods", "obligatory products"]
keys_dict["information technology"] = ["IT", "information techonology", "i.t", "communication science", "communication sciences", "computer science", "info tech", "cybernetics"]
keys_dict["communication services"] = ["call service", "communication department",  "communication facilities", "communication service","communications department","communications facilities", "communications service","communications services", "facilities-based", "telecommunications service","media service","reporting services", "telecom service", "telecom services", "telecommunication service","telecommunication services"]
keys_dict["healthcare"] = ["medical care","healthcare","medical treatment","medical assistance","medical attention","medical aid","health protection","health maintenance","medical attendance","medical help","medical management","medical service","preventive medicine","wellness program","health management","curative care","public health","maintenance of health","health care service","primary care","medical services","care insurance","health insurance","medical coverage","medical insurance"]
keys_dict["real estate"] = ["building","buildings","housing","immovable","land","motionless","premises","Properties","Property","realtor"]

In [32]:
#for key in keys_dict_simple.keys():
#    df[key] = 0
#    print(key)
#    for phr in keys_dict_simple[key]:
#        #print(phr)
#        for i in range(len(df)):
#            x = df['stripped_content'].values[i].split()
#            if phr in (x):
#                df[key][i] += 1

                
#df

In [33]:
df["stripped_content"].str.count("explainable artificial intelligence")

0       2
1       1
2       0
3       1
4       0
       ..
4586    0
4587    0
4588    0
4589    0
4590    0
Name: stripped_content, Length: 4591, dtype: int64

In [34]:
for key in keys_dict.keys():
    df[key] = 0
    for phr in keys_dict[key]:
        df[key] += df["stripped_content"].str.count(phr)


In [35]:
df[df["materials"] > 8]

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords,content_sum,stripped_content,energy,materials,industrial,financial,utilities,consumer discretionary,consumer staples,information technology,communication services,healthcare,real estate
3720,"Hoffmann N., Cerqueira T.F.T., Schmidt J., Mar...",superconductivity in antiperovskites,10.1038/s41524-022-00817-4,https://www.scopus.com/inward/record.uri?eid=2...,we present a comprehensive theoretical study o...,blank,machine learning perturbation techniques super...,we present a comprehensive theoretical study o...,present comprehensive theoretical study conven...,12,12,0,0,0,0,0,4,0,0,0
3838,"Liu C., Meerten Y., Declercq K., Gryllias K.",vibration-based gear continuous generating gri...,10.1016/j.jmapro.2022.04.068,https://www.scopus.com/inward/record.uri?eid=2...,continuous generating grinding plays an essent...,convolutional neural network deep learning gea...,chemical activation convolutional neural netwo...,continuous generating grinding plays an essent...,continuous generating grinding plays essential...,8,9,2,0,1,0,0,5,0,0,0
4192,"Zhang L., Hu W., He M., Xu K., Pan Z.",interpretable machine learning for investigati...,10.1021/acs.jpcc.2c00859,https://www.scopus.com/inward/record.uri?eid=2...,halide perovskite materials exhibit poor optoe...,blank,chemical analysis machine learning perovskite ...,halide perovskite materials exhibit poor optoe...,halide perovskite materials exhibit poor optoe...,18,9,0,0,0,0,0,10,0,0,0
4402,"Liu K., Sadoune N., Rao N., Greitemann J., Pol...",revealing the phase diagram of kitaev material...,10.1103/PhysRevResearch.3.023016,https://www.scopus.com/inward/record.uri?eid=2...,kitaev materials are promising materials for h...,blank,liquids magnets phase diagrams support vector ...,kitaev materials are promising materials for h...,kitaev materials promising materials hosting q...,16,10,0,0,1,0,0,5,0,0,0
4512,"Velasco L., Castillo J.S., Kante M.V., Olaya J...",phase–property diagrams for multicomponent oxi...,10.1002/adma.202102301,https://www.scopus.com/inward/record.uri?eid=2...,exploring the vast compositional space offered...,high entropy materials high-throughput techniq...,data handling digital libraries energy gap ent...,exploring the vast compositional space offered...,exploring vast compositional space offered mul...,7,11,1,0,0,0,0,7,0,0,0
4566,"Zhang S., Lu T., Xu P., Tao Q., Li M., Lua W.",predicting the formability of hybrid organic-i...,10.1021/acs.jpclett.1c01939,https://www.scopus.com/inward/record.uri?eid=2...,predicting the formability of perovskite struc...,blank,forecasting formability machine learning perov...,predicting the formability of perovskite struc...,predicting formability perovskite structure hy...,11,9,0,0,0,0,0,11,0,0,0


In [36]:
sum_dic = {}
for i in keys_dict.keys():
    sum_dic[i] = df[i].sum()
 
sum_dic

{'energy': 28734,
 'materials': 585,
 'industrial': 7592,
 'financial': 295,
 'utilities': 9396,
 'consumer discretionary': 0,
 'consumer staples': 0,
 'information technology': 44385,
 'communication services': 19,
 'healthcare': 787,
 'real estate': 1801}

In [75]:
df.columns

Index(['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords', 'content_sum', 'stripped_content', 'energy',
       'materials', 'industrial', 'financial', 'utilities',
       'consumer discretionary', 'consumer staples', 'information technology',
       'communication services', 'healthcare', 'real estate'],
      dtype='object')

In [37]:
df['Taxonomy'] = df[['energy','materials', 'industrial', 'financial', 'utilities',
                    'consumer discretionary', 'consumer staples', 'information technology',
                    'communication services', 'healthcare', 'real estate']].idxmax(axis=1)

In [38]:
final_df = df[['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords', "Taxonomy"]]

In [86]:
final_df

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords,Taxonomy
0,"Abadía J.J.P., Fritz H., Dadoulis G., Dragos K...",automated decision making in structural health...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the need for processing large amounts of data ...,blank,artificial intelligence damage detection decis...,energy
1,"Abbass H.A., Hunjet R.A.",smart shepherding: towards transparent artific...,10.1007/978-3-030-60898-9_1,https://www.scopus.com/inward/record.uri?eid=2...,the aim of this chapter is to uncover the beau...,explainable artificial intelligence interpreta...,blank,information technology
2,"Abdollahi A., Pradhan B.",urban vegetation mapping from aerial imagery u...,10.3390/s21144738,https://www.scopus.com/inward/record.uri?eid=2...,urban vegetation mapping is critical in many a...,deep neural network remote sensing shap vegeta...,aerial photography antennas biodiversity deep ...,information technology
3,"Abdul A., Von Der Weth C., Kankanhalli M., Lim...",cogam: measuring and moderating cognitive load...,10.1145/3313831.3376615,https://www.scopus.com/inward/record.uri?eid=2...,interpretable machine learning models trade -o...,cognitive load explainable artificial intellig...,computation theory economic and social effects...,energy
4,"Abe T., Furukawa R., Iwasaki Y., Ikemura T.",time-series trend of pandemic sars-cov-2 varia...,10.5334/dsj-2021-029,https://www.scopus.com/inward/record.uri?eid=2...,to confront the global threat of coronavirus d...,batch-learning self-organizing map blsom covid...,conformal mapping diseases genes machine learn...,energy
...,...,...,...,...,...,...,...,...
4586,[No author name available],icmlsc 2021 - proceedings of the 2021 5th inte...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 32 papers the topics d...,blank,blank,energy
4587,[No author name available],2021 ieee 29th international conference on net...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 75 papers the topics d...,blank,blank,information technology
4588,[No author name available],10th international conference on computational...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 33 papers the special ...,blank,blank,energy
4589,[No author name available],proceedings of the 2nd international conferenc...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 21 papers the topics d...,blank,blank,energy
