In [1]:
import requests
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kalebalemayehu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Synonyms with Thesaurus API

In [11]:
def get_synonyms_api(key_word):
    url = "https://www.dictionaryapi.com/api/v3/references/thesaurus/json/"
    key= "?key=bc125025-b3b8-4b13-b8e8-c1ef37845bad"
    syn_json = requests.get(url+key_word.lower()+key).json()
    syn_list = syn_json[0]['meta']['syns']
    if len(syn_list) > 1:
        master = sum(syn_list, [])
    elif len(syn_list) == 1:
        master = syn_list[0]
    master.append(key_word)
    return master

### Problems with some words

consumer discretionary, consumer staples, communication services (DO NOT WORK)

    -> they do not exist within the Thesaurus, therefore the JSON returns a list with possible words
Definition of Consumer Discretionary:
- goods that are non-essential but desirable if their income is sufficient to purchase them

Definition of Consumer Staples:
- goods that are essential

Definition of Communication Services:
- elecommunications Services, Cable Services, Video Services, or Information Services

-> Solution for now: remove those words from here

In [2]:
df = pd.read_csv("master_taxonomy.csv", index_col=0)#, on_bad_lines='skip')
df = df.reset_index(drop=True)
df.columns

Index(['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords'],
      dtype='object')

In [3]:
df['Abstract'] = df['Abstract'].str.lower()
df['Title'] = df['Title'].str.lower()
df['Author Keywords'] = df['Author Keywords'].str.lower()
df['Index Keywords'] = df['Index Keywords'].str.lower()

In [4]:
df = df.fillna("blank")

In [5]:
non_alp = [';', ',', '(', ')', '[', ']', '.']
for i in non_alp:
    for col in ['Title','Abstract', 'Author Keywords', 'Index Keywords']:
        df[col] = df[col].map(lambda x: x.replace(i,''))

In [6]:
#df['content_sum'] = df['Abstract'] + df['Title'] + df['Author Keywords'] + df['Index Keywords']
#df["content_sum"] = df["content_sum"].str.lower()
stop = stopwords.words('english')
for i in [df['Abstract'],df['Title'], df['Author Keywords'], df['Index Keywords']]:
    i = i.apply(lambda x: " ".join([word for word in x.split() if word not in (stop)]))

In [7]:
keys = ["energy", "materials", "industrial", "financial",  "utilities"]
no_keys = [ "consumer discretionary", "consumer staples", "information technology", "communication services", "healthcare", "real estate"]

In [12]:
# Saving into a dictionary all of the syns
keys_dict = {}
for i in keys:
    keys_dict[i] = get_synonyms_api(i)
 
#keys_dict

In [13]:

keys_dict["consumer discretionary"] = ["consumer discretionary","non-essential goods", "non essential goods", "non-essential products", "non essential products", "unrestricted goods", "unrestricted products", "nonobligatory goods", "nonobligatory products"]
keys_dict["consumer staples"] = ["essential goods", "essential goods", "essential products", "essential products", "restricted goods", "restricted products", "obligatory goods", "obligatory products"]
keys_dict["information technology"] = ["IT", "information techonology", "i.t", "communication science", "communication sciences", "computer science", "info tech", "cybernetics"]
keys_dict["communication services"] = ["call service", "communication department",  "communication facilities", "communication service","communications department","communications facilities", "communications service","communications services", "facilities-based", "telecommunications service","media service","reporting services", "telecom service", "telecom services", "telecommunication service","telecommunication services"]
keys_dict["healthcare"] = ["medical care","healthcare","medical treatment","medical assistance","medical attention","medical aid","health protection","health maintenance","medical attendance","medical help","medical management","medical service","preventive medicine","wellness program","health management","curative care","public health","maintenance of health","health care service","primary care","medical services","care insurance","health insurance","medical coverage","medical insurance"]
keys_dict["real estate"] = ["building","buildings","housing","immovable","land","motionless","premises","Properties","Property","realtor"]

In [14]:
keys_dict = {'energy': [
  'fuel',
  'power',
  'horsepower', 'electricity', 'power output', 'power generation', 'electricity generation', 'dynamic power', 'energy'],
 'materials': ['accoutrements', 'consumer products', 'accesories',  'accesories production',
  'apparatus',
  'equipment',
  'materials'],
 'industrial': ['mechanical',
  'fabricated',
  'manufactured',
  'cultivated',
  'processed',
  'refined',
  'artificial',
  'man-made',
  'nonnatural',
  'synthetic',
  'industrial'],
 'financial': ['dollars-and-cents',
  'fiscal',
  'monetary',
  'pecuniary',
  'pocket',
  'financial'],
 'utilities': ['account',
  'avail',
  'mileage',
  'service',
  'serviceability',
  'serviceableness',
  'use',
  'usefulness',
  'utilities'],
 'consumer discretionary': ['consumer discretionary',
  'non-essential goods',
  'non essential goods',
  'non-essential products',
  'non essential products',
  'unrestricted goods',
  'unrestricted products',
  'nonobligatory goods',
  'nonobligatory products'],
 'consumer staples': ['essential goods',
  'essential goods',
  'essential products',
  'essential products',
  'restricted goods',
  'restricted products',
  'obligatory goods',
  'obligatory products'],
 'information technology': ['IT',
  'information techonology',
  'i.t',
  'communication science',
  'communication sciences',
  'computer science',
  'info tech',
  'cybernetics'],
 'communication services': ['call service',
  'communication department',
  'communication facilities',
  'communication service',
  'communications department',
  'communications facilities',
  'communications service',
  'communications services',
  'facilities-based',
  'telecommunications service',
  'media service',
  'reporting services',
  'telecom service',
  'telecom services',
  'telecommunication service',
  'telecommunication services'],
 'healthcare': ['medical care',
  'healthcare',
  'medical treatment',
  'medical assistance',
  'medical attention',
  'medical aid',
  'health protection',
  'health maintenance',
  'medical attendance',
  'medical help',
  'medical management',
  'medical service',
  'preventive medicine',
  'wellness program',
  'health management',
  'curative care',
  'public health',
  'maintenance of health',
  'health care service',
  'primary care',
  'medical services',
  'care insurance',
  'health insurance',
  'medical coverage',
  'medical insurance'],
 'real estate': ['building',
  'buildings',
  'housing',
  'immovable',
  'land',
  'motionless',
  'premises',
  'Properties',
  'Property',
  'realtor']}

In [18]:
weights = {'Title': 0.4, 'Author Keywords': 0.3, 'Index Keywords': 0.2, 'Abstract':0.1}

# Create a function to count the number of times the words appear in a row
def count_words(row):
    counts = {}
    for key in list(keys_dict.keys()):
        counts[key] = 0
        for word in keys_dict[key]:
            for col in row.index:
                if col not in ['Authors', 'DOI', 'Link'] and word in row[col]:
                    count = counts.get(key, 0)
                    counts[key] = count + weights.get(col, 1) if count > 0 else weights.get(col, 1)
    max_word = max(counts, key=counts.get) if any(counts.values()) else np.nan                    
    return pd.Series(counts), max_word

# Apply the function to each row of the dataframe
word_counts, max_words = zip(*df.apply(count_words, axis=1))

# Create a new dataframe with the word counts as columns
words_count_df = pd.DataFrame(list(word_counts), columns=list(keys_dict.keys()))

# Add the max_word column to the original dataframe
df['FSC_stage'] = max_words

# Concatenate the two dataframes
result_df = pd.concat([df, words_count_df], axis=1)
result_df

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords,FSC_stage,energy,materials,industrial,financial,utilities,consumer discretionary,consumer staples,information technology,communication services,healthcare,real estate
0,"Abadía J.J.P., Fritz H., Dadoulis G., Dragos K...",automated decision making in structural health...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the need for processing large amounts of data ...,blank,artificial intelligence damage detection decis...,industrial,0.0,0.0,0.7,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
1,"Abbass H.A., Hunjet R.A.",smart shepherding: towards transparent artific...,10.1007/978-3-030-60898-9_1,https://www.scopus.com/inward/record.uri?eid=2...,the aim of this chapter is to uncover the beau...,explainable artificial intelligence interpreta...,blank,industrial,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
2,"Abdollahi A., Pradhan B.",urban vegetation mapping from aerial imagery u...,10.3390/s21144738,https://www.scopus.com/inward/record.uri?eid=2...,urban vegetation mapping is critical in many a...,deep neural network remote sensing shap vegeta...,aerial photography antennas biodiversity deep ...,real estate,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.4
3,"Abdul A., Von Der Weth C., Kankanhalli M., Lim...",cogam: measuring and moderating cognitive load...,10.1145/3313831.3376615,https://www.scopus.com/inward/record.uri?eid=2...,interpretable machine learning models trade -o...,cognitive load explainable artificial intellig...,computation theory economic and social effects...,industrial,0.0,0.0,0.3,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
4,"Abe T., Furukawa R., Iwasaki Y., Ikemura T.",time-series trend of pandemic sars-cov-2 varia...,10.5334/dsj-2021-029,https://www.scopus.com/inward/record.uri?eid=2...,to confront the global threat of coronavirus d...,batch-learning self-organizing map blsom covid...,conformal mapping diseases genes machine learn...,utilities,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4586,[No author name available],icmlsc 2021 - proceedings of the 2021 5th inte...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 32 papers the topics d...,blank,blank,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4587,[No author name available],2021 ieee 29th international conference on net...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 75 papers the topics d...,blank,blank,energy,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4588,[No author name available],10th international conference on computational...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 33 papers the special ...,blank,blank,utilities,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
4589,[No author name available],proceedings of the 2nd international conferenc...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 21 papers the topics d...,blank,blank,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
ddf = result_df[result_df["FSC_stage"].notna()]
ddf

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords,FSC_stage,energy,materials,industrial,financial,utilities,consumer discretionary,consumer staples,information technology,communication services,healthcare,real estate
0,"Abadía J.J.P., Fritz H., Dadoulis G., Dragos K...",automated decision making in structural health...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the need for processing large amounts of data ...,blank,artificial intelligence damage detection decis...,industrial,0.0,0.0,0.7,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
1,"Abbass H.A., Hunjet R.A.",smart shepherding: towards transparent artific...,10.1007/978-3-030-60898-9_1,https://www.scopus.com/inward/record.uri?eid=2...,the aim of this chapter is to uncover the beau...,explainable artificial intelligence interpreta...,blank,industrial,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
2,"Abdollahi A., Pradhan B.",urban vegetation mapping from aerial imagery u...,10.3390/s21144738,https://www.scopus.com/inward/record.uri?eid=2...,urban vegetation mapping is critical in many a...,deep neural network remote sensing shap vegeta...,aerial photography antennas biodiversity deep ...,real estate,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.4
3,"Abdul A., Von Der Weth C., Kankanhalli M., Lim...",cogam: measuring and moderating cognitive load...,10.1145/3313831.3376615,https://www.scopus.com/inward/record.uri?eid=2...,interpretable machine learning models trade -o...,cognitive load explainable artificial intellig...,computation theory economic and social effects...,industrial,0.0,0.0,0.3,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
4,"Abe T., Furukawa R., Iwasaki Y., Ikemura T.",time-series trend of pandemic sars-cov-2 varia...,10.5334/dsj-2021-029,https://www.scopus.com/inward/record.uri?eid=2...,to confront the global threat of coronavirus d...,batch-learning self-organizing map blsom covid...,conformal mapping diseases genes machine learn...,utilities,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4584,"Zinemanas P., Rocamora M., Miron M., Font F., ...",an interpretable deep learning model for autom...,10.3390/electronics10070850,https://www.scopus.com/inward/record.uri?eid=2...,deep learning models have improved cutting-edg...,deep learning explainability interpretability ...,blank,real estate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
4585,"Zumwald M., Baumberger C., Bresch D.N., Knutti R.",assessing the representational accuracy of dat...,10.1016/j.envsoft.2021.105048,https://www.scopus.com/inward/record.uri?eid=2...,data-driven modelling with machine learning ml...,data-driven modelling interpretable machine le...,uncertainty analysis background knowledge case...,utilities,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
4587,[No author name available],2021 ieee 29th international conference on net...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 75 papers the topics d...,blank,blank,energy,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4588,[No author name available],10th international conference on computational...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 33 papers the special ...,blank,blank,utilities,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
result_df["FSC_stage"].value_counts()

industrial                2264
utilities                 1405
energy                     252
real estate                166
healthcare                 106
materials                   68
financial                   48
information technology       6
Name: FSC_stage, dtype: int64

In [36]:
sum_dic = {}
for i in keys_dict.keys():
    sum_dic[i] = df[i].sum()
 
sum_dic

{'energy': 28734,
 'materials': 585,
 'industrial': 7592,
 'financial': 295,
 'utilities': 9396,
 'consumer discretionary': 0,
 'consumer staples': 0,
 'information technology': 44385,
 'communication services': 19,
 'healthcare': 787,
 'real estate': 1801}

In [75]:
df.columns

Index(['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords', 'content_sum', 'stripped_content', 'energy',
       'materials', 'industrial', 'financial', 'utilities',
       'consumer discretionary', 'consumer staples', 'information technology',
       'communication services', 'healthcare', 'real estate'],
      dtype='object')

In [37]:
df['Taxonomy'] = df[['energy','materials', 'industrial', 'financial', 'utilities',
                    'consumer discretionary', 'consumer staples', 'information technology',
                    'communication services', 'healthcare', 'real estate']].idxmax(axis=1)

In [38]:
final_df = df[['Authors', 'Title', 'DOI', 'Link', 'Abstract', 'Author Keywords',
       'Index Keywords', "Taxonomy"]]

In [86]:
final_df

Unnamed: 0,Authors,Title,DOI,Link,Abstract,Author Keywords,Index Keywords,Taxonomy
0,"Abadía J.J.P., Fritz H., Dadoulis G., Dragos K...",automated decision making in structural health...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the need for processing large amounts of data ...,blank,artificial intelligence damage detection decis...,energy
1,"Abbass H.A., Hunjet R.A.",smart shepherding: towards transparent artific...,10.1007/978-3-030-60898-9_1,https://www.scopus.com/inward/record.uri?eid=2...,the aim of this chapter is to uncover the beau...,explainable artificial intelligence interpreta...,blank,information technology
2,"Abdollahi A., Pradhan B.",urban vegetation mapping from aerial imagery u...,10.3390/s21144738,https://www.scopus.com/inward/record.uri?eid=2...,urban vegetation mapping is critical in many a...,deep neural network remote sensing shap vegeta...,aerial photography antennas biodiversity deep ...,information technology
3,"Abdul A., Von Der Weth C., Kankanhalli M., Lim...",cogam: measuring and moderating cognitive load...,10.1145/3313831.3376615,https://www.scopus.com/inward/record.uri?eid=2...,interpretable machine learning models trade -o...,cognitive load explainable artificial intellig...,computation theory economic and social effects...,energy
4,"Abe T., Furukawa R., Iwasaki Y., Ikemura T.",time-series trend of pandemic sars-cov-2 varia...,10.5334/dsj-2021-029,https://www.scopus.com/inward/record.uri?eid=2...,to confront the global threat of coronavirus d...,batch-learning self-organizing map blsom covid...,conformal mapping diseases genes machine learn...,energy
...,...,...,...,...,...,...,...,...
4586,[No author name available],icmlsc 2021 - proceedings of the 2021 5th inte...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 32 papers the topics d...,blank,blank,energy
4587,[No author name available],2021 ieee 29th international conference on net...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 75 papers the topics d...,blank,blank,information technology
4588,[No author name available],10th international conference on computational...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 33 papers the special ...,blank,blank,energy
4589,[No author name available],proceedings of the 2nd international conferenc...,blank,https://www.scopus.com/inward/record.uri?eid=2...,the proceedings contain 21 papers the topics d...,blank,blank,energy
