## This notebook is used for the generation of keywords variations which can be used to sieve out sentences that mirrors our ESG sub-pillar features

For instance see the image below for some illustration on ESG sub-pillar constituents

![esgmsci](https://www.visualcapitalist.com/wp-content/uploads/2021/03/shareable-5.jpg)

For each sub-pillar, we will pick out the keywords and use NLP derive the vector for each word so as to find it's nearest neighbours. This gives us a series of words that are similar/closely related to the features.

In [1]:
# List is not exhaustive. Feel free to populate more words related to each sub-pillar

environment_features = ['climate', 'pollution', 'energy', 'waste', 'electric', 'emission', 'green', 'technology', 'toxic']

social_features = ['human', 'labor', 'privacy', 'data security', 'safety', 'welfare', 'management', 'equality', 'equal', 'equity', 'pay', 'salary', 'allowance']

governance_features = ['governance', 'corrupt', 'equity', 'equal', 'equality', 'accounting', 'ethics', 'transparency', 'tax', 'ownership', 'shareholder', 'control', 'pay']

esg_keywords = [
                'best-in-class', 'carbon footprint', 'carbon pricing', 'clean technology', 'engagement', 'environmental factors', 'esg integration', 'ethical investing', 'exclusions', 
                'negative screening', 'governance factors', 'green bond', 'greenwashing', 'human rights', 'impact investments', 'modern slavery', 'PRI', 'proxy voting', 
                'renewable energy', 'screening', 'social factors', 'SRI', 'stewardship', 'thematic investing', 'SDG', 'values-based investing', 'voting rights', 'biodiversity', 
                'carbon capture and storage', 'circular economy', 'climate action tracker', 'climate clocks', 'climate funds', 
                'climate transition benchmarks', 'greenhouse gas emissions', 'net zero carbon pledge and initiative', 'paris agreement', 'paris-aligned benchmarks', 
                'PFAS', 'scope 1', 'scope 2', 'scope 3', 'sdg funds', 'sin stocks', 'smart esg scores', 'social sustainability', 'stewardship code', 'stranded assets', 
                'sustainable investing', 'sustainability reporting', 'sustainable supply chains', 'sustainable technology', 'thermal coal exposure', 'triple bottom line', 'un global impact','green','low-carbon'
                ]
  
lst = [
        'acidification','biofuel','carbon','carbon dioxide','climate','co2','climate change','decarbonisation','decarbonization','energy transmission','energy','energy transition',
        'energy storage','emissions','emission control','fossil fuels','geothermal energy','geothermal','greenhouse gas','greenhouse','hydrocarbons','LNG','liquefied natural gas',
        'ozone','renewable resources','sng','synthetic natural gas','thermal energy','thermal','wind power','wind'
        ]

In [2]:
# Import the libraries
from scipy import spatial
import gensim
from numba import jit
import nltk, time, spacy, numpy as np

nlp = spacy.load('en_core_web_lg')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features = 50 ,stop_words='english')

In [4]:
vect

CountVectorizer(max_features=50, stop_words='english')

In [27]:
# Udemy Version
def cosine_similarity(vect1, vect2):
    return 1 - spatial.distance.cosine(vect1, vect2)

def similar_words(wordOfInterest, topwords):
    computed_similarities = []

    # Convert word of interest to as vector
    woi_vector = nlp(wordOfInterest).vector
    for word in nlp.vocab:
        if word.has_vector and word.is_lower and word.is_alpha:
            similarity = cosine_similarity(woi_vector, word.vector)
            computed_similarities.append((word, similarity))
    sortedWords = sorted(computed_similarities, key = lambda item: -item[1])
    return [w[0].text for w in sortedWords[:topwords]]


# Towards Data Science Version
def most_similar(word, topn):
    word = nlp.vocab[str(word)]
    queries = [
        w for w in word.vocab if w.is_lower == word.is_lower and w.prob >= -15 and np.count_nonzero(w.vector)
    ]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [(w.lower_,w.similarity(word)) for w in by_similarity[:topn+1] if w.lower_ != word.lower_]


@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta


# Stack Overflow
def similar_words_to_given_word(word, top):
    doc = nlp.vocab
    similarities = {}  
    tok = nlp(word)
    similarities[tok.text] = {}
    for tok_ in doc:
        similarities[tok.text].update({tok_.text:tok.similarity(tok_)})
    
    topWords = lambda x: {k: v for k, v in sorted(similarities[x].items(), key=lambda item: item[1], reverse=True)[:top]}
    return topWords(word)



In [36]:
print(similar_words(environment_features[0], 10))
# print(most_similar(environment_features[0], 10))
print(similar_words_to_given_word(environment_features[1], 10))

['climate', 'pollution', 'energy', 'that', 'what', 'there', 'how', 'we', 'why', 'could']
{'pollution': 1.0, 'toxic': 0.6272381319120371, 'waste': 0.5875696368785799, 'climate': 0.5568534621970989, 'energy': 0.45739895449558304, 'cause': 0.4362784676469133, 'Cause': 0.4362784676469133, 'Mass': 0.32694304829807114, 'could': 0.2815637073990229, 'Could': 0.2815637073990229}




In [None]:
# Try this: https://github.com/kavgan/nlp-in-practice/blob/master/word2vec/Word2Vec.ipynb

## Next, we will be generating all the ESG keywords for the individual pillar and store them into a text.file