In [None]:
import requests
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
pd.options.mode.chained_assignment = None  # default='warn'

## Synonyms with Thesaurus API

In [None]:
def get_synonyms_api(key_word, api_key=None):
    """Return thesaurus synonyms for ``key_word``, followed by ``key_word`` itself.

    The Merriam-Webster thesaurus endpoint is queried with the lower-cased word.
    Only the first entry of the response is used; all of its synonym groups
    (``meta.syns``) are flattened into a single list.

    Parameters
    ----------
    key_word : str
        Word or phrase to look up. It is appended unchanged as the last element.
    api_key : str, optional
        API key to send. When omitted, the ``MW_THESAURUS_API_KEY`` environment
        variable is used, falling back to the key that shipped with the notebook.

    Returns
    -------
    list of str
        Flattened synonyms plus ``key_word``. When the thesaurus has no entry
        (the response is empty or a list of suggestion strings) or the entry has
        no synonyms, the result is just ``[key_word]``.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    import os

    url = "https://www.dictionaryapi.com/api/v3/references/thesaurus/json/"
    if api_key is None:
        # SECURITY: the literal fallback is the key that was committed with this
        # notebook. It is kept only so existing runs keep working; rotate it and
        # supply the key through the environment instead.
        api_key = os.environ.get(
            "MW_THESAURUS_API_KEY", "bc125025-b3b8-4b13-b8e8-c1ef37845bad"
        )

    response = requests.get(url + key_word.lower(), params={"key": api_key}, timeout=30)
    response.raise_for_status()
    syn_json = response.json()

    master = []
    # Unknown words come back as a list of suggestion *strings* rather than entry
    # dicts; treat that (and an empty response) as "no synonyms" instead of crashing.
    if syn_json and isinstance(syn_json[0], dict):
        for group in syn_json[0].get("meta", {}).get("syns", []):
            master.extend(group)
    master.append(key_word)
    return master

### Problems with some words

consumer discretionary, consumer staples, communication services (DO NOT WORK)

    -> These phrases have no entry in the thesaurus, so the API returns a JSON list of suggested words instead of a synonym entry
Definition of Consumer Discretionary:
- goods that are non-essential but desirable when the consumer's income is sufficient to purchase them

Definition of Consumer Staples:
- goods that are essential

Definition of Communication Services:
- Telecommunications services, cable services, video services, or information services

-> Solution for now: leave those words out of the thesaurus lookup

In [None]:
# Read the corpus, using the first CSV column as a throw-away index that is
# immediately replaced by a fresh 0..n-1 range index.
df = pd.read_csv("master_taxonomy.csv", index_col=0).reset_index(drop=True)
df.columns

In [None]:
# Lower-case every free-text column so later phrase counting is case-insensitive
# with respect to the text (the phrases themselves must be lower-case too).
for text_col in ('Abstract', 'Title', 'Author Keywords', 'Index Keywords'):
    df[text_col] = df[text_col].str.lower()

In [None]:
# Replace every missing value (in all columns) with the placeholder string so the
# string operations in the following cells never meet NaN.
df = df.fillna(value="blank")

In [None]:
# Characters deleted from the text columns before phrase matching.
non_alp = [';', ',', '(', ')', '[', ']', '.']
# One translation table removes all of them in a single pass per cell value.
punct_table = str.maketrans('', '', ''.join(non_alp))
for col in ['Title','Abstract', 'Author Keywords', 'Index Keywords']:
    df[col] = df[col].map(lambda text: text.translate(punct_table))

In [None]:
# Remove English stop words from every text column.
#
# The previous loop iterated over the Series objects and rebound the loop
# variable, so the cleaned result was thrown away and `df` was never changed.
# Iterating over column names and assigning back to `df[col]` makes the step
# take effect.
#
# NOTE(review): once stop words are really removed, multi-word search phrases
# that contain one (e.g. "maintenance of health") can no longer match the text
# unless they are stripped of stop words as well.
stop = stopwords.words('english')
stop_lookup = set(stop)  # set membership is O(1); the list scan was O(len(stop)) per word
for col in ['Abstract', 'Title', 'Author Keywords', 'Index Keywords']:
    df[col] = df[col].apply(
        lambda x: " ".join(word for word in x.split() if word not in stop_lookup)
    )

In [None]:
# Sector names that are looked up through the thesaurus API.
keys = [
    "energy",
    "materials",
    "industrial",
    "financial",
    "utilities",
]
# Sector names kept out of the API lookup.
no_keys = [
    "consumer discretionary",
    "consumer staples",
    "information technology",
    "communication services",
    "healthcare",
    "real estate",
]

In [None]:
# Map every API-resolvable sector name to its synonym list (one request per name).
keys_dict = {sector: get_synonyms_api(sector) for sector in keys}

In [None]:
## Creating, by hand, the rest of the synonyms
# Every phrase is counted against text that has been lower-cased and stripped of
# ; , ( ) [ ] . so the phrases here must be lower-case and free of those
# characters, otherwise they can never match.
#
# Changes relative to the earlier lists:
#   * duplicated entries removed (each duplicate was counted twice),
#   * capitalised entries lower-cased,
#   * the canonical sector name added where no listed phrase would match it,
#     mirroring get_synonyms_api, which appends the looked-up word itself
#     ("communication services" is already covered by "communication service"),
#   * "information techonology" typo corrected,
#   * "IT" and "i.t" dropped: upper-case "IT" cannot occur in lower-cased text,
#     "i.t" is treated as a regex by Series.str.count (the dot matches any
#     character, e.g. "int", "ist"), and a lower-case "it" would match the
#     pronoun and countless substrings.
keys_dict["consumer discretionary"] = [
    "consumer discretionary", "non-essential goods", "non essential goods",
    "non-essential products", "non essential products", "unrestricted goods",
    "unrestricted products", "nonobligatory goods", "nonobligatory products",
]
keys_dict["consumer staples"] = [
    "consumer staples", "essential goods", "essential products",
    "restricted goods", "restricted products", "obligatory goods",
    "obligatory products",
]
keys_dict["information technology"] = [
    "information technology", "communication science", "communication sciences",
    "computer science", "info tech", "cybernetics",
]
keys_dict["communication services"] = [
    "call service", "communication department", "communication facilities",
    "communication service", "communications department",
    "communications facilities", "communications service",
    "communications services", "facilities-based", "telecommunications service",
    "media service", "reporting services", "telecom service", "telecom services",
    "telecommunication service", "telecommunication services",
]
keys_dict["healthcare"] = [
    "medical care", "healthcare", "medical treatment", "medical assistance",
    "medical attention", "medical aid", "health protection", "health maintenance",
    "medical attendance", "medical help", "medical management", "medical service",
    "preventive medicine", "wellness program", "health management", "curative care",
    "public health", "maintenance of health", "health care service", "primary care",
    "medical services", "care insurance", "health insurance", "medical coverage",
    "medical insurance",
]
keys_dict["real estate"] = [
    "real estate", "building", "buildings", "housing", "immovable", "land",
    "motionless", "premises", "properties", "property", "realtor",
]

In [None]:
## Creating the dictionary for Stage Classification (Post Hoc / Ante Hoc)
# The terms are written in their readable form and then normalised exactly like
# the text columns they are matched against (lower-cased, with ; , ( ) [ ] .
# removed). Without this, every capitalised or parenthesised term could never
# match the cleaned text.
_raw_stage_terms = {
    "Ante-hoc": [
        "fuzzy", "treebased",
        "Classification and Regression Tree",
        # NOTE(review): this was a single entry "CART Conditional Inference Tree",
        # which looks like a missing comma between an abbreviation and the next
        # model name (the neighbouring entries follow the "name", "abbreviation"
        # pattern) - confirm.
        "CART", "Conditional Inference Tree",
        "CTree", "Decision Tree", "Fast and Frugal Trees", "FFTs",
        "Fuzzy Hoeffding Decision Tree", "FHDT", "J48", "One-Class Tree", "OCTree",
        "Multi Operator Temporal Decision Tree", "MTDT",
        "Recursive Partitioning and Regression Trees (RPART)",
        "Big Bang–Big Crunch Interval Type-2 Fuzzy Logic System", "BB-BC IT2FLS",
    ],
    "Post-hoc": [
        "support vector machines", 'neural network', 'model-agnostic',
        'model-specific', 'ensemble methods',
        'SVM with Linear and Radial Basis Function (RBF) Kernels',
        'ApparentFlow-net', 'Convolutional Neural Network (CNN)',
        'Adaptive Boosting (AdaBoost)',
    ],
}
_stage_cleaning_table = str.maketrans('', '', ';,()[].')
stage_dict = {}
for _stage, _terms in _raw_stage_terms.items():
    stage_dict[_stage] = [t.lower().translate(_stage_cleaning_table) for t in _terms]

In [None]:
## Classification of Stage
# For every stage, sum how often each of its terms occurs in each text column.
#
# Fixes relative to the earlier version:
#   * Columns are selected by name. The old `col is df['Title']` identity checks
#     depended on pandas handing back the very same Series object on every
#     `df[...]` access, which is not guaranteed (assigning to `df[term]` inside
#     the loop can invalidate cached columns), so the per-column branches were
#     unreliable.
#   * Terms are normalised like the text (lower-case, punctuation stripped) and
#     escaped, because Series.str.count interprets its argument as a regular
#     expression.
#   * The per-column bonus is only granted to rows where the term was found.
#     Previously it was added to every row for every term, which could not break
#     ties between rows and simply favoured the stage with the longer term list.
#     NOTE(review): presumably the bonus is meant as a tie-breaker that ranks a
#     hit in the title above one in the keywords or abstract - confirm.
stage_column_bonus = {
    'Abstract': 0.0001,
    'Title': 0.0004,
    'Author Keywords': 0.0003,
    'Index Keywords': 0.0002,
}
stage_cleaning_table = str.maketrans('', '', ''.join(non_alp))
for term in stage_dict.keys():
    df[term] = 0.0
    for wrd in stage_dict[term]:
        pattern = re.escape(wrd.lower().translate(stage_cleaning_table))
        for col_name, bonus in stage_column_bonus.items():
            hits = df[col_name].str.count(pattern)
            df[term] += hits + bonus * (hits > 0)
df.head(5)

In [None]:
## Classification of Taxonomy
# For every sector, sum how often each of its synonyms occurs in each text column.
#
# Fixes relative to the earlier version:
#   * Columns are selected by name. The old `col is df['Title']` identity checks
#     depended on pandas handing back the very same Series object on every
#     `df[...]` access, which is not guaranteed (assigning to `df[key]` inside
#     the loop can invalidate cached columns), so the per-column branches were
#     unreliable.
#   * Phrases are escaped, because Series.str.count interprets its argument as a
#     regular expression; an entry such as "i.t" otherwise matches any
#     "i<char>t" sequence. Phrases are deliberately NOT lower-cased here: an
#     entry like "IT" would turn into "it" and match almost every row.
#   * The per-column bonus is only granted to rows where the phrase was found.
#     Previously it was added to every row for every phrase, which could not
#     break ties between rows and simply favoured sectors with more synonyms.
#     NOTE(review): presumably the bonus is meant as a tie-breaker that ranks a
#     hit in the title above one in the keywords or abstract - confirm.
taxonomy_column_bonus = {
    'Abstract': 0.0001,
    'Title': 0.0004,
    'Author Keywords': 0.0003,
    'Index Keywords': 0.0002,
}
for key in keys_dict.keys():
    df[key] = 0.0
    for phr in keys_dict[key]:
        pattern = re.escape(phr)
        for col_name, bonus in taxonomy_column_bonus.items():
            hits = df[col_name].str.count(pattern)
            df[key] += hits + bonus * (hits > 0)
df.head(5)

In [None]:
# NOTE(review): nothing in this cell is executed. The whole cell is one bare
# string literal (opened by ''' - the fourth quote is the first character of the
# string), so its only effect is that the text is shown as the cell output.
# The text is an alternative version of the taxonomy scoring loop with smaller
# per-column constants; consider deleting the cell.
''''
for key in keys_dict.keys():
    df[key] = 0
    for phr in keys_dict[key]:
        for col in [df['Abstract'], df['Title'], df['Author Keywords'], df['Index Keywords']]:
            if col is df['Title']:
                df[key] += 0.00004
                df[key] += col.str.count(phr)
            elif col is df['Author Keywords']:
                df[key] += 0.00003
                df[key] += col.str.count(phr) 
            elif col is df['Index Keywords']:
                df[key] += 0.00002
                df[key] += col.str.count(phr) 
            else:
                df[key] += 0.00001
                df[key] += col.str.count(phr) 
df
'''

In [None]:
# Show the rows whose "materials" score is strictly greater than 8.
df.loc[df["materials"].gt(8)]

In [None]:
# Total score per sector, summed over all rows.
sum_dic = {sector: df[sector].sum() for sector in keys_dict}

sum_dic

In [None]:
# Display the column index of df as the cell output.
df.columns

### Creating the Final Dataframe

----
Made for cleaner looks and presentability

Drops the intermediate processing columns and keeps only the results of the different algorithms used

In [None]:
# Label every row with the name of its highest-scoring sector column and its
# highest-scoring stage column (idxmax returns the first column on ties).
taxonomy_score_cols = [
    'energy', 'materials', 'industrial', 'financial', 'utilities',
    'consumer discretionary', 'consumer staples', 'information technology',
    'communication services', 'healthcare', 'real estate',
]
stage_score_cols = ['Ante-hoc', 'Post-hoc']

df['Taxonomy'] = df[taxonomy_score_cols].idxmax(axis=1)
df['Stage_Taxonomy'] = df[stage_score_cols].idxmax(axis=1)
df.sample(5)

In [None]:
# Keep only the bibliographic columns and the two assigned labels.
final_columns = [
    'Authors', 'Title', 'DOI', 'Link', 'Abstract',
    'Author Keywords', 'Index Keywords',
    "Taxonomy", "Stage_Taxonomy",
]
final_df = df[final_columns]

In [None]:
# Display 10 randomly drawn rows of final_df (no seed is set, so the rows differ between runs).
final_df.sample(10)