# Zero-shot multi-label classification

Theory is explained here: https://joeddav.github.io/blog/2020/05/29/ZSL.html

In [None]:
import pandas as pd
import numpy as np
import pickle
from google.colab import drive
from sklearn import preprocessing
import scipy.stats as stats
import datetime as dt
import math
import matplotlib.dates as md
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
!pip install git+https://github.com/huggingface/transformers.git &> /dev/null
from transformers import pipeline

In [None]:
# Build the Hugging Face zero-shot classification pipeline with its default model.
# device=0 is forwarded to the pipeline; per the transformers documentation this
# selects the first CUDA device, so this cell presumably expects a GPU runtime.
classifier = pipeline("zero-shot-classification", device=0)

Loading Data: Choose Titles or Body

In [None]:
# Load the article text together with the per-company lookup tables.
# Either the titles or the full article bodies can be used; titles are much
# faster to classify but less accurate. Enable exactly one of the two lines.
# NOTE(review): the pickle files are assumed to be trusted, locally produced
# artefacts - pickle.load must not be used on files from untrusted sources.

def _read_pickle(path):
  # Deserialize and return the single object stored in the pickle file at `path`.
  with open(path, 'rb') as handle:
    return pickle.load(handle)

df = pd.read_csv('./Misc- Projects/titles.csv')['title']

#df = pd.read_csv('./body.csv')['body']

# Per-company index collections into `df` (used as `df[comp_id_list[j]]` below).
comp_id_list = _read_pickle('./comp_id_list.pkl')

# Display names of the companies, in the same order as comp_id_list.
comp_name_list = _read_pickle('./comp_name_list.pkl')

# Article timestamps, indexed the same way as `df`.
time_df = _read_pickle('./comp_time_list.pkl')

Scaling Factor Finder

In [None]:
# Minimum summed topic score an article must exceed to count as being about
# the environment (E), social (S) or governance (G) topic respectively.
conf_score_env = 0.35
conf_score_soc = 0.10
conf_score_gov = 0.05

# Candidate labels handed to the zero-shot classifier. The ORDER is significant,
# because the scoring code addresses the labels by position:
#   [0:2]   sentiment   (positive, negative)
#   [2:6]   environment topic labels
#   [6:10]  social topic labels
#   [10:14] governance topic labels
# The labels are natural-language hypotheses for the model, so spelling matters
# ('human rigths' was corrected to 'human rights').
# The misspelled variable name `lables` is kept because other cells reference it.
lables = [
    'positive', 'negative',
    'environment', 'emissions', 'pollution', 'Climate change',
    'equality', 'inclusion', 'human rights', 'education',
    'union', 'compensation', 'employee relations', 'management',
]

In [None]:
def lambda_score_eval(pred, score, i, labels=None, thresholds=None):
    """Compute the signed E, S and G scores for one classified article.

    The sentiment ("positivity") is (positive - negative) / (positive + negative).
    Each topic score is the sum of the four label scores of that topic. A topic
    whose summed score exceeds its threshold gets ``topic_score * positivity``;
    a topic that ends up at exactly 0 inherits the value of the previous row of
    ``score`` (carry-forward), if there is a previous row.

    Args:
        pred: Result dict of the zero-shot pipeline. ``pred["scores"]`` is
            required. When ``pred["labels"]`` is present the scores are
            re-aligned to the order of ``labels``, because the pipeline returns
            labels and scores sorted by descending score rather than in the
            order the candidate labels were passed in. Without ``"labels"`` the
            scores are assumed to already be in candidate order.
        score: 2-D array of previously computed rows; only ``score[i-1, 0:3]``
            is read.
        i: Index of the row being computed.
        labels: Candidate labels in positional order (sentiment at 0-1,
            environment at 2-5, social at 6-9, governance at 10-13).
            Defaults to the module-level ``lables``.
        thresholds: ``(env, soc, gov)`` minimum topic scores. Defaults to the
            module-level ``conf_score_env/soc/gov``.

    Returns:
        numpy array of shape (3,) holding the E, S and G score.
    """
    if labels is None:
        labels = lables
    if thresholds is None:
        thresholds = (conf_score_env, conf_score_soc, conf_score_gov)

    raw = pred["scores"]
    returned_labels = pred.get("labels")
    if returned_labels is not None:
        # Undo the pipeline's sort-by-score so that positions match `labels`.
        by_label = dict(zip(returned_labels, raw))
        raw = [by_label[name] for name in labels]

    sentiment_total = raw[0] + raw[1]
    # Guard the degenerate case where both sentiment scores are zero.
    positivity = (raw[0] - raw[1]) / sentiment_total if sentiment_total else 0.0

    topic_scores = (sum(raw[2:6]), sum(raw[6:10]), sum(raw[10:14]))

    t_score = np.zeros(3)
    for k, (topic, threshold) in enumerate(zip(topic_scores, thresholds)):
        if topic > threshold:
            t_score[k] = topic * positivity
        # Carry the previous article's value forward; row 0 has no predecessor.
        # (The original guard `i > 1` wrongly skipped row 1 as well.)
        if t_score[k] == 0 and i > 0:
            t_score[k] = score[i - 1, k]
    return t_score

In [None]:
def confidence_finder(conf_score_env = 0.35, conf_score_soc = 0.25, conf_score_gov = 0.15, samples = 50, lables = lables):
    """Estimate how many articles are classified as E, S or G for given thresholds.

    Draws ``samples`` random articles from the module-level ``df``, classifies
    each one with the module-level ``classifier`` and counts how often the summed
    topic score exceeds the corresponding threshold. Every article that exceeds
    at least one threshold is appended to 'drive/My Drive/norm.txt' as
    ``text><positivity><env><soc><gov`` so the classification can be reviewed
    by hand. The percentages per topic are printed at the end.

    Args:
        conf_score_env: Minimum summed environment score.
        conf_score_soc: Minimum summed social score.
        conf_score_gov: Minimum summed governance score.
        samples: Number of random articles to evaluate (previously ignored and
            hard-coded to 50).
        lables: Candidate labels in positional order (sentiment at 0-1,
            environment at 2-5, social at 6-9, governance at 10-13).
    """
    if samples <= 0:
        # Nothing to classify; avoids a division by zero in the summary below.
        print("No articles sampled.")
        return

    thresholds = (conf_score_env, conf_score_soc, conf_score_gov)
    counts = [0, 0, 0]
    rand_ind = np.random.randint(0, df.shape[0], samples)
    # The random numbers are positions, so select positionally.
    sequences = df.iloc[rand_ind].tolist()

    # Open the review file once instead of once per article.
    with open('drive/My Drive/norm.txt', 'a+') as f:
        for sequence in sequences:
            pred = classifier(sequence, lables)
            # The pipeline returns labels/scores sorted by descending score;
            # re-align them to the order of `lables` before indexing by position.
            by_label = dict(zip(pred["labels"], pred["scores"]))
            raw = [by_label[name] for name in lables]

            score = np.zeros(shape = (4))
            sentiment_total = raw[0] + raw[1]
            if sentiment_total:
                score[0] = (raw[0] - raw[1]) / sentiment_total

            topic_scores = (sum(raw[2:6]), sum(raw[6:10]), sum(raw[10:14]))
            save = False
            for k, (topic, threshold) in enumerate(zip(topic_scores, thresholds)):
                if topic > threshold:
                    score[k + 1] = topic
                    counts[k] += 1
                    save = True

            if save:
                f.write(pred["sequence"] + "><" + "><".join(str(value) for value in score) + '\n')

    tot_count = len(sequences)
    for count, tag in zip(counts, "ESG"):
        print("{:.1f}".format(count/tot_count*100)+"% of Articles classified as " + tag + ".")

In [None]:
# Try out a set of candidate thresholds: prints the share of sampled articles
# classified as E, S and G and appends the classified samples to the review file.
confidence_finder(conf_score_env = 0.35, conf_score_soc = 0.15, conf_score_gov = 0.05, samples = 200)

ESG Calculation Multi Company

The first function evaluates a random sample of articles to estimate the mean and standard deviation of the calculated scores, which are then used to standardize them (zero mean, unit variance).

In the second function the companies are evaluated one after the other, using the calculated sentiment and topic scores to assign an ESG score that consists of separate E, S and G scores. These individual scores are calculated by multiplying the normalized difference between the positive and negative sentiment by the topic score. For example, a negative article with an environment topic score above the threshold will get a low "E" score, while a positive one will get a high score.

Collection of Statistics for Data Normalization


In [None]:
def norm(samples = 500):
    """Fit and persist the scaler used to standardize the E/S/G scores.

    Classifies ``samples`` randomly drawn articles from the module-level ``df``,
    computes their E, S and G scores with ``lambda_score_eval`` and fits a
    ``StandardScaler`` on the resulting (samples, 3) matrix. The fitted scaler
    is pickled to 'scaler.pkl'.

    NOTE(review): this definition rebinds the name ``norm`` that the import
    cell brings in from ``scipy.stats``.
    """
    picks = np.random.randint(0, df.shape[0], samples)
    score = np.zeros(shape=(samples, 3))
    for row, sequence in enumerate(df[picks].tolist()):
        pred = classifier(sequence, lables)
        # The helper reads the previous row of `score`, so rows are filled in order.
        score[row, :] = lambda_score_eval(pred, score, row)

    scaler = preprocessing.StandardScaler().fit(score)
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    print("Scaling Statistics collected!")

In [None]:
# Fit the scaler on 500 randomly sampled articles and store it in scaler.pkl.
norm(samples = 500)

ESG over time

In [None]:
def esg_calculator(df = df, lables = lables, comp_num = 2, conf_score_env = 0.15, conf_score_soc = 0.15, conf_score_gov = 0.01, cumulative = 'off'):
    """Compute standardized E, S, G and combined ESG scores per company.

    Args:
        df: The article texts; ``df[comp_id_list[j]]`` selects company j's articles.
        lables: Candidate labels passed to the classifier.
        comp_num: Number of companies to evaluate (the first ``comp_num``
            entries of ``comp_id_list``). Classifying full article bodies can
            take around 15 minutes per company on a powerful PC.
        conf_score_env, conf_score_soc, conf_score_gov: Intended minimum summed
            topic scores for an article to count as E, S or G.
            NOTE(review): these three parameters are not referenced in the
            body; ``lambda_score_eval`` is called without them.
        cumulative: ``'on'`` returns one 4-element summary per company instead
            of the per-article score matrix; any other value returns the matrix.

    Returns:
        A list with one entry per company: either an array of shape
        (n_articles, 4) with columns E, S, G, ESG, or (cumulative mode) an
        array of shape (4,).
    """
    with open('scaler.pkl', 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    results = []
    for comp_idx in range(comp_num):
        articles = df[comp_id_list[comp_idx]].tolist()
        score = np.zeros(shape=(len(articles), 4))
        for row, sequence in enumerate(articles):
            pred = classifier(sequence, lables)
            score[row, 0:3] = lambda_score_eval(pred, score, row)

        # Entries that are exactly zero before scaling stay zero after scaling.
        zero_mask = np.where(score == 0)
        score[:, 0:3] = scaler.transform(score[:, 0:3])
        score[zero_mask] = 0

        # The combined ESG score is only defined where E, S and G are all non-zero.
        complete = (score[:, 0] != 0) & (score[:, 1] != 0) & (score[:, 2] != 0)
        score[complete, 3] = (score[complete, 0] + score[complete, 1] + score[complete, 2]) / 3

        if cumulative == 'on':
            # np.unique collapses repeated values before averaging - presumably so
            # that carried-forward duplicates are counted only once.
            summary = np.array([np.mean(np.unique(score[:, col])) for col in range(4)])
            results.append(summary)
        else:
            results.append(score)
        print("Number of Companies Evaluated:"+str(comp_idx+1))
    return results

In [None]:
# Per-article scores with the default arguments (cumulative off): a list holding
# one (n_articles, 4) array per evaluated company.
final_score = esg_calculator()

In [None]:
def esg_plot(score = final_score, comp_name_list = comp_name_list, smoothing = 30):
    """Plot the rolling-mean E, S, G and ESG scores over time, one figure per company.

    Args:
        score: List with one 2-D array per company (columns E, S, G, ESG), i.e.
            the non-cumulative output of ``esg_calculator``. The default is bound
            to ``final_score`` at definition time.
        comp_name_list: Company names used in the figure titles.
        smoothing: Window size of the rolling mean; the first ``smoothing - 1``
            points of each curve are NaN and therefore not drawn.

    The x axis is built by spreading the rows evenly between the earliest and
    the latest timestamp of the company.
    NOTE(review): the division by 1000 presumably converts millisecond epoch
    timestamps to seconds - confirm against the data in ``time_df``.
    """
    def _smooth(column):
        # Rolling mean of one score column, returned as an (n, 1) array.
        return np.array(pd.DataFrame(column).rolling(window=smoothing).mean())

    # Select the style before any figure is created so the first figure uses it too.
    plt.style.use('fivethirtyeight')

    # Iterate over the `score` argument (the original read the global
    # `final_score` here and silently ignored the parameter).
    for comp, comp_score in enumerate(score):
        w_e, w_s, w_g, w_esg = (_smooth(comp_score[:, col]) for col in range(4))

        times = np.array(time_df[comp_id_list[comp]])
        timestamps = np.linspace(np.amin(times)/1000, np.amax(times)/1000, w_e.shape[0])
        dates = [dt.datetime.fromtimestamp(int(ts)) for ts in timestamps]

        fig, ax = plt.subplots(figsize=(9,6))
        ax.plot(dates, w_esg, label='ESG Score')
        ax.plot(dates, w_e, label='Environmental Score')
        ax.plot(dates, w_s, label='Social Score')
        ax.plot(dates, w_g, label='Governance Score')

        ax.set_ylim([-2.5,2.5])
        ax.set_ylabel('ESG Score in Standard Deviations')
        ax.set_xlabel('Time')
        ax.set_title('ESG Score for '+comp_name_list[comp])
        ax.legend()

        plt.show()

In [None]:
# Plot the smoothed scores over time for every company in final_score.
esg_plot()

Average ESG

In [None]:
# Re-run in cumulative mode. This overwrites final_score with one 4-element
# summary (E, S, G, ESG) per company, replacing the per-article matrices.
final_score = esg_calculator(cumulative = 'on')

In [None]:
def esg_gauss_plot(esg = final_score, comp_name_list = comp_name_list):
    """Show each company's average E, S, G and ESG score on a standard normal curve.

    Args:
        esg: List with one 4-element sequence per company in the order
            E, S, G, ESG, i.e. the cumulative output of ``esg_calculator``.
            The default is bound to ``final_score`` at definition time.
        comp_name_list: Company names used in the figure titles.

    One figure is drawn per company: the standard normal density over
    [-4, 4] standard deviations with the four scores marked on the curve.
    """
    mu = 0
    variance = 1
    sigma = math.sqrt(variance)

    # Sample the density over the same symmetric range that is displayed.
    # (The original lower bound `mu - 44*sigma` was a typo: it put most of the
    # 500 points outside the visible window and made the curve coarse.)
    x = np.linspace(mu - 4*sigma, mu + 4*sigma, 500)
    y = stats.norm.pdf(x, mu, sigma)

    # (index into the company summary, legend label), in drawing order.
    markers = (
        (3, 'ESG Score'),
        (0, 'Environmental Score'),
        (1, 'Social Score'),
        (2, 'Governance Score'),
    )

    # Select the style before any figure is created so the first figure uses it too.
    plt.style.use('fivethirtyeight')

    for comp, summary in enumerate(esg):
        fig, ax = plt.subplots(figsize=(9,6))
        ax.plot(x, y)
        for idx, label in markers:
            value = summary[idx]
            ax.scatter(value, stats.norm.pdf(value, mu, sigma), s=200, label=label)

        ax.fill_between(x,y,0, alpha=0.3, color='b')
        ax.set_xlim([-4,4])
        ax.set_xlabel('ESG Score in Standard Deviations')
        ax.set_yticklabels([])
        ax.set_title('ESG Score for '+comp_name_list[comp])
        ax.legend()

        plt.show()

In [None]:
# Draw the per-company summary scores on the standard normal curve.
esg_gauss_plot()