In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import os
# Execute ../helpers.py inside this notebook's namespace (-i), so the names it
# defines are available to later cells.
# NOTE(review): later cells use `np`, `sent_tokenize`, `exists` and `gc`, none of
# which are imported in this notebook -- presumably they come from helpers.py; confirm.
%run -i ../helpers.py
%matplotlib inline
# Allow up to 999 columns when displaying DataFrames.
pd.options.display.max_columns = 999

# Fetch the NLTK tokenizer model and stop-word lists; packages that are already
# present are reported as up to date.
nltk.download('punkt')
nltk.download('stopwords')

# One-time setup, left disabled. NOTE(review): uncomment if NLTK reports the
# 'punkt_tab' resource as missing when tokenizing.
# nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/z.askary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use mps:0
Device set to use mps:0
[nltk_data] Downloading package punkt to /Users/z.askary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z.askary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data preprocessing
Stop-word removal, tokenization, etc.

In [2]:
# Load the transcript splits. Later cells read the columns ticker, q, dt,
# prep_remarks, QnA, Sector, Industry and Shortname from this frame.
tp_splits = pd.read_csv('../data/tp_splits.csv')

In [3]:
# Preview the AAPL rows, sorted by `dt` descending (first five only).
(
    tp_splits
    .loc[tp_splits['ticker'] == 'AAPL', ['ticker', 'q', 'dt', 'prep_remarks', 'QnA']]
    .sort_values(by='dt', ascending=False)
    .head(5)
)

Unnamed: 0,ticker,q,dt,prep_remarks,QnA
2821,AAPL,2023-Q1,2023-02-02 17:00:00,"prepared remarks:\noperator\ngood day, everyon...",:\noperator\ncertainly. we will go ahead and t...
3618,AAPL,2022-Q4,2022-10-27 17:00:00,"prepared remarks:\noperator\ngood day, and wel...",:\noperator\nwe'll go ahead and take our first...
121,AAPL,2022-Q3,2022-07-28 17:00:00,"prepared remarks:\noperator\ngood day, and wel...",:\noperator\nwe'll take our first question fro...
3021,AAPL,2022-Q2,2022-04-28 17:00:00,"prepared remarks:\noperator\ngood day, and wel...",:\noperator\nabsolutely. we'll take our first ...
2345,AAPL,2022-Q1,2022-01-27 17:00:00,"prepared remarks:\noperator\ngood day, and wel...",:\noperator\nabsolutely. we'll take our first ...


### Load data and ESG Sentiment

In [None]:
#tp_splits = pd.read_csv('./tp_splits_processed.csv')

In [1]:
#print(tp_splits.iloc[0].QnA)
#print(tp_splits.iloc[0].QnA_processed)

In [None]:
#tp_splits.columns
# Keep only the identifier, company-metadata and transcript-text columns.
tp_splits = tp_splits.loc[:, [
    'ticker', 'q', 'Sector', 'Industry', 'Shortname',
    'dt', 'prep_remarks', 'QnA',
]]
tp_splits.head(5)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# --- Sentiment (tone) model -------------------------------------------------
# FinBERT-tone is loaded as a 3-class sequence classifier. Downstream,
# create_sentiment_output() scores 'Positive' / 'Negative' predictions and
# treats every other label as neutral.
finbert_sentiment = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_sentiment = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
sentiment_pipeline = pipeline("text-classification", model=finbert_sentiment, tokenizer=tokenizer_sentiment)

# --- ESG topic model --------------------------------------------------------
# FinBERT-ESG is a BERT sequence classifier fine-tuned for ESG text; the code
# below expects the labels Environmental / Social / Governance plus 'None'.
# `num_labels` configures the model's classification head, so it is passed to
# the model (mirroring the tone model above) rather than to the tokenizer.
tokenizer_esg = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
finbert_esg = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg', num_labels=4)

# device=0 requests accelerator index 0 -- it does NOT select the CPU. In the
# recorded run this resolved to "mps:0". Pass device=-1 to force CPU.
# NOTE(review): behavior when no accelerator is present depends on the installed
# transformers version; do not rely on a silent CPU fallback.
esg_label_pip = pipeline("text-classification", model=finbert_esg, tokenizer=tokenizer_esg, device=0)

# Background:
# - BERT (Bidirectional Encoder Representations from Transformers) is a
#   transformer model for language understanding that reads the context on
#   both sides of each token.
# - pipeline("text-classification", ...) bundles tokenization, the forward pass
#   and decoding, returning a list of {'label': ..., 'score': ...} dicts.

Device set to use mps:0
Device set to use mps:0


In [None]:
# all_esg_labels = esg_label_pip(tp_splits['prep_remarks_processed'][0][-2048:])

In [None]:

# ESG categories for which generate_esg_cols always emits a column.
cols_expect = ['Environmental', 'Social', 'Governance']


def get_esg_label_transcript(tr):
    """Label every sentence of a transcript with the ESG classification pipeline.

    The text is split with ``sent_tokenize`` and each sentence is passed,
    unmodified (no truncation), to ``esg_label_pip``. Sentences whose
    predictions are all labelled 'None' are dropped; for the rest, the first
    non-'None' prediction is recorded.

    Args:
        tr (str): Transcript text (e.g. prepared remarks or the Q&A section).

    Returns:
        pd.DataFrame: One row per retained sentence, with columns
        ``esg_label``, ``label_score`` (the pipeline's confidence) and ``sent``.
    """
    rows = []
    for sentence in sent_tokenize(tr):
        predictions = esg_label_pip(sentence)
        # First prediction that carries an actual ESG category, if any.
        top = next((p for p in predictions if p['label'] != 'None'), None)
        if top is None:
            continue
        rows.append([top['label'], top['score'], sentence])

    return pd.DataFrame(rows, columns=['esg_label', 'label_score', 'sent'])

def create_sentiment_output(all_labels):
    """Turn a classifier's predictions into a signed sentiment score.

    Only the first prediction whose label is not 'None' is considered.

    Args:
        all_labels (list[dict]): Predictions, each with 'label' and 'score' keys.

    Returns:
        ``+score`` for a 'Positive' label, ``-score`` for a 'Negative' label,
        and ``0`` for any other label or when no usable prediction exists.
    """
    signs = {'Positive': 1, 'Negative': -1}
    for prediction in all_labels:
        if prediction['label'] == 'None':
            continue
        sign = signs.get(prediction['label'])
        if sign is None:
            # Any other label (e.g. neutral) carries no sentiment.
            return 0
        return sign * prediction['score']
    return 0

def generate_esg_cols(row, section='transcript'):
    """Add per-category ESG sentiment columns for one section of a transcript row.

    Each sentence of ``row[section]`` is labelled via ``get_esg_label_transcript``
    and scored via ``sentiment_pipeline`` / ``create_sentiment_output`` (rounded
    to 4 decimals). Sentences are kept only when the ESG label score exceeds 0.7
    and the sentiment is non-zero; the median sentiment per ESG label is then
    written to ``row['<label>_<section>']``.

    Args:
        row (pd.Series): A row holding the text under ``section``.
        section (str): Name of the text column to process
            (e.g. 'prep_remarks' or 'QnA').

    Returns:
        pd.Series: The same row with one column per entry of ``cols_expect``
        (NaN when the text is missing or no sentence qualifies), plus a column
        for any additional label the classifier produced.
    """
    medians = pd.Series(dtype=float)

    # NaN never compares equal to anything (``x != np.nan`` is always True), so
    # missing text has to be detected with pd.notna.
    if pd.notna(row[section]):
        label_scores_df = get_esg_label_transcript(row[section])

        # A transcript without any ESG sentence yields an empty frame; skip the
        # scoring and aggregation and fall through to the all-NaN assignment.
        if not label_scores_df.empty:
            label_scores_df['sentiment'] = label_scores_df.sent.apply(
                lambda sent: np.round(create_sentiment_output(sentiment_pipeline(sent)), 4)
            )

            # Keep confident ESG labels that carry a non-neutral sentiment.
            clean_scores = label_scores_df[
                (label_scores_df.label_score > 0.7) & (label_scores_df.sentiment != 0)
            ]
            medians = clean_scores.groupby('esg_label')['sentiment'].median()

    # Always emit the expected columns, in a fixed order, so every processed
    # row ends up with the same layout.
    for label in cols_expect:
        row[label + '_' + section] = medians.get(label, np.nan)

    # Preserve any label outside cols_expect that the classifier produced.
    for label in medians.index:
        if label not in cols_expect:
            row[label + '_' + section] = medians[label]

    return row

### Example of ESG labels for a single transcript - prepared remarks or QnA section

In [11]:
#print(tp_splits['QnA'][0])

In [11]:
idx = 188
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Select the example row by position once, so the printed company name and the
# classified Q&A text always refer to the same row (mixing .iloc[idx] with the
# label-based ['QnA'][idx] only agrees while the index is the default 0..n-1).
example_row = tp_splits.iloc[idx]
print(example_row.Shortname)
example_df_esg_labels = get_esg_label_transcript(example_row['QnA'])
example_df_esg_labels[example_df_esg_labels.esg_label == 'Governance']

Discover Financial Services


Unnamed: 0,esg_label,label_score,sent
31,Governance,0.533918,"john t. greene -- executive vice president, chief financial officer\nnot so much structurally."
32,Governance,0.927704,"so, there's certainly a very strong governance element."


In [13]:
# Sentences of the example transcript that were labelled 'Social'.
example_df_esg_labels.loc[example_df_esg_labels['esg_label'] == 'Social']

Unnamed: 0,esg_label,label_score,sent
0,Social,0.80647,"and if craig is listening, it's been a pleasure working with you too."
1,Social,0.497047,"now, opportunities in 2021 will dictate how much marketing dollars we ultimately end up spending for new customer acquisition."
2,Social,0.718214,so first is the overall unemployment numbers.
3,Social,0.987297,there's about 10.7 million people out of work.
4,Social,0.973585,there's another 7.3 million people that aren't included in the unemployment number due to the fact that they haven't actively worked -- looked for work in the past four weeks.
5,Social,0.774729,we're going to look specifically at the trajectory of unemployment and the type of unemployment.
6,Social,0.977731,"so, we're seeing unemployment levels transition from service workers to white collar workers, who would more likely be representative of our customer base and the impact of stimulus."
7,Social,0.488067,"when i saw the voluntary early retirement, i hope you're a reasonable chunk of that number."
8,Social,0.859147,we tend to keep our rewards program very stable.
9,Social,0.487303,it provides a lot of value.


In [15]:
# Restore pandas' default display limits. reset_option takes only the option
# name; the former second argument (0) was not a value to reset to.
pd.reset_option('display.max_columns')
pd.reset_option('display.max_colwidth')
# tp_splits.head(1).apply(lambda x: generate_esg_cols(x,'prep_remarks'), axis = 1)

### Implement run for a single company

This step is very resource-intensive; to run it locally we need to find ways to limit memory use (e.g. garbage collection) and to parallelize the work.

In [None]:
# Smoke test: score the Q&A section of the first row only.
tp_splits.iloc[:1].apply(generate_esg_cols, axis=1, section='QnA')


### Run for a few companies

In [None]:
from multiprocessing import Pool

# Sample up to 10 *distinct* tickers for testing (seeded for reproducibility).
# replace=False prevents the same ticker from being drawn twice; the size is
# capped so the draw also works when fewer than 10 tickers are available.
np.random.seed(10)
unique_tics = tp_splits.ticker.unique()
sample_tics = np.random.choice(unique_tics, size=min(10, len(unique_tics)), replace=False)
# print(f'Sample Tickers: '+str(sample_tics))
# tp_splits.head().apply(generate_esg_cols, axis=1)
# get_esg_label_transcript(tp_splits.iloc[0]['prep_remarks'])

def generate_esg_cols_sections(tic):
    """
    Processes ESG labels and sentiments for a given ticker's data.

    The 'prep_remarks' section is processed first, then 'QnA'. Processing is
    best-effort: if a section raises, the failure is reported together with the
    ticker and section, the remaining sections are skipped, and the frame as
    processed so far is returned.

    Args:
        tic (str): The ticker symbol of the company.

    Returns:
        pd.DataFrame: The ticker's rows from ``tp_splits`` with the ESG
        sentiment columns of every section that completed successfully.
    """
    df = tp_splits[tp_splits.ticker == tic]
    for section in ('prep_remarks', 'QnA'):
        try:
            df = df.apply(lambda row: generate_esg_cols(row, section), axis=1)
        except Exception as e:
            # Name the ticker and section so a failure can be traced in a long
            # batch run; a bare exception message is not enough.
            print(f'ESG scoring failed for {tic} [{section}]: {type(e).__name__}: {e}')
            break
    return df

def process_ticker(tic):
    """Compute and cache the ESG sentiment columns for one ticker.

    Does nothing when ``./tickers/<tic>.csv`` already exists. Otherwise the
    ticker is processed with ``generate_esg_cols_sections`` and the result is
    written as a ';'-separated CSV without the index. Takes a single ticker so
    it can also be mapped over tickers (e.g. with ``Pool.map``).

    Args:
        tic (str): The ticker symbol to process.

    Returns:
        None
    """
    out_path = f'./tickers/{tic}.csv'
    if exists(out_path):
        return

    print(f'running for: {tic}')
    df = generate_esg_cols_sections(tic)

    # Make sure the cache directory exists before the first write.
    os.makedirs('./tickers', exist_ok=True)
    # Same ';' separator for every cached file, so they can all be read alike.
    df.to_csv(out_path, index=None, sep=';')

    # Release the frame and force a collection to keep memory in check.
    del df
    gc.collect()


# Process every ticker in tp_splits, skipping those whose CSV already exists.
# (Iterate over sample_tics instead for a quick test run.)
for tic in tp_splits.ticker.unique():
    if exists(f'./tickers/{tic}.csv'):
        print(f'File already exists for {tic}, skipping...')
        continue
    process_ticker(tic)
        