# Import Packages

In [None]:
import pandas as pd
import calendar
import numpy as np
import time
import unicodedata
import re
from functools import partial
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer(language='english')
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from multiprocessing import Pool
from nltk.sentiment.util import mark_negation
import warnings
import datetime
import sys
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from tqdm import tqdm
import pickle
warnings.filterwarnings('ignore')
import gc

# Connect with Google Drive: mount it at /content/drive, the root of every
# "/content/drive/My Drive/Thesis/..." path used by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Report whether PyTorch can see a CUDA device and, if so, which one.
# torch is also used later by step2_sentiment_analysis (torch.cuda.empty_cache).
import torch
print("✅ GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


✅ GPU available: True
GPU name: NVIDIA A100-SXM4-40GB


In [None]:
file_path = "/content/drive/My Drive/Thesis/cleaned_keywords.txt"

# Read the cleaned keyword list: one term per line, surrounding whitespace stripped.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 — TODO confirm).
with open(file_path, 'r', encoding='utf-8') as f:
    cleaned_keywords = [line.strip() for line in f]
len(cleaned_keywords)

805

In [None]:
file_path = "/content/drive/My Drive/Thesis/id_link.csv"

# Load the companyid link table and convert 'startdate' / 'enddate' to datetime.
# errors='coerce' turns every value that cannot be parsed as a date into NaT.
# NOTE(review): the previous comment said "Keep 'B', 'E'"; presumably the raw
# file uses markers such as 'B' / 'E' for open-ended ranges, which end up as NaT
# here — confirm against the CSV.
id_link = pd.read_csv(file_path).assign(
    startdate=lambda frame: pd.to_datetime(frame['startdate'], errors='coerce'),
    enddate=lambda frame: pd.to_datetime(frame['enddate'], errors='coerce'),
)
id_link


Unnamed: 0,companyid,gvkey,startdate,enddate,companyname
0,18507,235716,NaT,NaT,2M Invest A/S
1,18511,210835,NaT,NaT,3i Group plc
2,18527,210418,NaT,NaT,ABB Ltd
3,18671,29751,NaT,NaT,Albemarle Corporation
4,18711,28349,NaT,NaT,The Allstate Corporation
...,...,...,...,...,...
135923,1930908128,365887,NaT,NaT,"Huatai-Pinebridge Fund Management Co., Ltd. - ..."
135924,1931066122,365886,NaT,NaT,GLOBAL X MANAGEMENT (AUS) LTD-RUSSELL 2000 ETF
135925,1931521679,365952,NaT,NaT,Yaqeen S and P ESG Mena ETF Fund - Mena ETF Fund
135926,1931855556,365970,NaT,NaT,Chimera Iboxx US treasury Bill ETF - Class B Fund


In [None]:
# Read the topics and bigrams dictionary
# Read the topics and bigrams dictionary (metatopic -> collection of keywords).
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/My Drive/Thesis/bigrams.pkl', 'rb') as f:
    metatopic_keywords = pickle.load(f)

# Invert it into a keyword -> metatopic lookup. If a keyword is listed under
# several metatopics, the one iterated last wins.
keyword_to_metatopic = {
    term: topic
    for topic, terms in metatopic_keywords.items()
    for term in terms
}

# Get signals

In [None]:
def step1_generate_embeddings(year, file_prefix, cleaned_keywords):
    """Embed every text component and match it to its most similar keyword.

    Reads ``{file_prefix}.parquet``, encodes the ``componenttext`` column and
    the keyword list with the "all-MiniLM-L6-v2" sentence-transformer model,
    and writes ``{file_prefix}_with_embedding.parquet`` containing three
    additional columns:

    * ``embedding``      - the text embedding, stored as a list
    * ``max_similarity`` - highest cosine similarity to any keyword
    * ``matched_term``   - the keyword that reached that similarity

    Args:
        year: Only used in the progress message.
        file_prefix: Path of the yearly parquet file without the extension.
        cleaned_keywords: List of keyword strings to match against.

    Returns:
        None. The result is written to disk.
    """
    print(f"🔹 Step 1: Embedding for year {year}")

    source_path = f"{file_prefix}.parquet"
    target_path = f"{file_prefix}_with_embedding.parquet"

    transcripts = pd.read_parquet(source_path)

    # One model instance encodes both the texts and the keywords so that the
    # two sets of vectors live in the same embedding space.
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    sentence_vectors = encoder.encode(transcripts['componenttext'].tolist())
    keyword_vectors = encoder.encode(cleaned_keywords)

    # Store the embeddings as plain lists so they fit in a single column.
    transcripts['embedding'] = sentence_vectors.tolist()

    # Rows are texts, columns are keywords.
    sim_matrix = cosine_similarity(sentence_vectors, keyword_vectors)

    # Best-matching keyword per text and the similarity it achieved.
    best_idx = sim_matrix.argmax(axis=1)
    row_idx = np.arange(len(transcripts))
    transcripts['max_similarity'] = sim_matrix[row_idx, best_idx]
    transcripts['matched_term'] = [cleaned_keywords[j] for j in best_idx]

    transcripts.to_parquet(target_path, index=False)

    # Drop the large intermediates before collecting garbage.
    del transcripts, sentence_vectors, keyword_vectors, sim_matrix, best_idx, row_idx
    gc.collect()


In [None]:
def step2_sentiment_analysis(year, file_prefix, keyword_to_metatopic):
    """Score every text component with FinBERT and save the enriched table.

    Reads ``{file_prefix}_with_embedding.parquet``, maps ``matched_term`` to a
    ``metatopic``, runs the "yiyanghkust/finbert-tone" classifier over
    ``componenttext`` and writes
    ``{file_prefix}_with_sentence_sentiment_score.parquet`` with these extra
    columns: ``metatopic``, ``Positive``, ``Neutral``, ``Negative``,
    ``sentiment`` (label with the highest score) and ``sentiment_score``
    (``Positive - Negative``).

    Texts belonging to a batch that raised an error keep missing scores and a
    missing ``sentiment`` instead of aborting the whole year.

    Args:
        year: Only used in the progress message.
        file_prefix: Path prefix of the yearly parquet files.
        keyword_to_metatopic: Dict mapping a keyword to its metatopic.

    Returns:
        None. The result is written to disk.
    """
    print(f"🔹 Step 2: Sentiment analysis for year {year}")

    # Load the parquet file
    file_path = f"{file_prefix}_with_embedding.parquet"
    df = pd.read_parquet(file_path)

    # 1: Create the metatopic column (terms missing from the mapping become NaN)
    df['metatopic'] = df['matched_term'].map(keyword_to_metatopic)

    # 2: Load the FinBERT model and tokenizer
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # 3: Create a sentiment analysis pipeline.
    # Use the first GPU when there is one, otherwise fall back to CPU (-1)
    # instead of failing on a hard-coded device index.
    # NOTE(review): newer transformers releases prefer top_k=None over
    # return_all_scores=True — confirm against the installed version.
    device = 0 if torch.cuda.is_available() else -1
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
        device=device,
        truncation=True,
        max_length=512
    )

    # 4: Perform sentiment analysis in batches
    texts = df['componenttext'].tolist()
    batched_results = []
    batch_size = 32

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        try:
            # Passing batch_size makes the pipeline run the whole chunk through
            # the model together; without it the texts are processed one by one.
            batch_output = sentiment_pipeline(batch, batch_size=batch_size)
            batched_results.extend(batch_output)
        except Exception as e:
            # Best effort: keep row alignment by recording one placeholder per text.
            print(f"⚠️ Error at batch {i}: {e}")
            batched_results.extend([None] * len(batch))

    # 5: Convert each result into a row (extract scores into columns)
    score_columns = ["Positive", "Neutral", "Negative"]

    def extract_scores(score_list):
        return {entry['label']: entry['score'] for entry in score_list}

    score_df = pd.DataFrame([
        extract_scores(x) if x else dict.fromkeys(score_columns)
        for x in batched_results
    ])
    # Guarantee the three score columns exist (e.g. when there were no texts)
    # and are numeric even if every batch failed and only None was recorded.
    for col in score_columns:
        if col not in score_df.columns:
            score_df[col] = None
    score_df = score_df.astype({col: float for col in score_columns})

    # 6: Merge sentiment scores back into the original DataFrame
    df = df.reset_index(drop=True)
    df = pd.concat([df, score_df], axis=1)

    # 7: Add sentiment label and calculate sentiment score.
    # idxmax is only evaluated on rows that actually have scores; rows from
    # failed batches keep a missing label.
    scores = df[score_columns]
    scored = scores.notna().any(axis=1)
    df["sentiment"] = None
    if scored.any():
        df.loc[scored, "sentiment"] = scores.loc[scored].idxmax(axis=1)
    df["sentiment_score"] = df["Positive"] - df["Negative"]

    # 8. Save the result
    output_path = f"{file_prefix}_with_sentence_sentiment_score.parquet"
    df.to_parquet(output_path, index=False)

    # Release memory
    del df, batched_results, score_df, scores, scored, texts, sentiment_pipeline, model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
def step3_topic_sentiment_aggregation(year, file_prefix, id_link):
    """Aggregate sentence-level sentiment into one row per transcript.

    Reads ``{file_prefix}_with_sentence_sentiment_score.parquet`` and builds:

    * one column per metatopic holding the mean ``sentiment_score`` over the
      whole transcript, and
    * one column per (metatopic, section) pair named ``"{metatopic}_{section}"``
      holding the mean within that section.

    The table is then left-joined with ``id_link`` on ``companyid`` and only
    rows whose ``date`` lies inside ``[startdate, enddate]`` are kept; a
    missing bound counts as open-ended. The result is written to
    ``/content/drive/My Drive/Thesis/regression_data_X_{year}_v2.parquet``.

    Args:
        year: Used in the progress message and in the output file name.
        file_prefix: Path prefix of the yearly parquet files.
        id_link: DataFrame with ``companyid``, ``startdate`` and ``enddate``
            columns (plus whatever else should be carried along).

    Returns:
        None. The result is written to disk.
    """
    print(f"🔹 Step 3: Topic sentiment aggregation for year {year}")

    sentences = pd.read_parquet(f"{file_prefix}_with_sentence_sentiment_score.parquet")

    # Calendar date of the event, derived from the UTC timestamp.
    sentences['mostimportantdateutc'] = pd.to_datetime(sentences['mostimportantdateutc'])
    sentences['date'] = sentences['mostimportantdateutc'].dt.date

    transcript_keys = ['transcriptid', 'keydevid', 'companyid', 'date', 'headline']
    scored = sentences[transcript_keys + ['metatopic', 'sentiment_score', 'section']]

    # --- Whole-transcript view: mean score per (transcript, metatopic) -------
    how_to_aggregate = {'sentiment_score': 'mean'}
    for key in transcript_keys[1:]:
        how_to_aggregate[key] = 'first'
    per_topic = (
        scored.groupby(['transcriptid', 'metatopic'])
        .agg(how_to_aggregate)
        .reset_index()
    )

    wide_overall = per_topic.pivot_table(
        index=transcript_keys,
        columns='metatopic',
        values='sentiment_score'
    ).reset_index()
    wide_overall.columns.name = None

    # --- Section view: mean score per (transcript, section, metatopic) -------
    per_section_topic = (
        scored.groupby(['transcriptid', 'section', 'metatopic'])['sentiment_score']
        .mean()
        .reset_index()
    )
    wide_sections = per_section_topic.pivot(
        index='transcriptid',
        columns=['section', 'metatopic'],
        values='sentiment_score'
    )

    # Collapse the (section, metatopic) column pairs into "metatopic_section".
    flat_names = []
    for section_name, topic_name in wide_sections.columns:
        flat_names.append(f"{topic_name}_{section_name}")
    wide_sections.columns = flat_names
    wide_sections = wide_sections.reset_index()

    # --- Combine both views and attach the id link table ---------------------
    combined = wide_overall.merge(wide_sections, on='transcriptid', how='left')
    linked = combined.merge(id_link, on='companyid', how='left')

    # Keep a row when the event date is inside the link's validity window;
    # a missing start or end date does not restrict that side.
    after_start = linked['startdate'].isna() | (linked['date'] >= linked['startdate'])
    before_end = linked['enddate'].isna() | (linked['date'] <= linked['enddate'])
    linked_valid = linked[after_start & before_end]

    output_path = f"/content/drive/My Drive/Thesis/regression_data_X_{year}_v2.parquet"
    linked_valid.to_parquet(output_path, index=False)

    # Release memory
    del (sentences, scored, per_topic, wide_overall, per_section_topic,
         wide_sections, combined, linked, linked_valid)
    gc.collect()


In [None]:
def step4_word_count(year, file_prefix):
    """Count words per transcript (total, 'Pre' section, 'QA' section).

    Reads ``{file_prefix}_with_sentence_sentiment_score.parquet``, counts the
    whitespace-separated words of every ``componenttext`` and writes one row
    per transcript to
    ``/content/drive/My Drive/Thesis/{year}_ec_words_count.parquet`` with the
    columns ``transcriptid``, ``mostimportantdateutc``, ``total_word_count``,
    ``pre_word_count`` and ``qa_word_count``. The counts are meant to be used
    as control variables in the regression.

    A section that never occurs in the input yields a count of 0 instead of a
    ``KeyError`` on the missing column.

    Args:
        year: Used in the progress message and in the output file name.
        file_prefix: Path prefix of the yearly parquet files.

    Returns:
        None. The result is written to disk.
    """
    print(f"🔹 Step 4: Word count for year {year}")

    # Load the parquet file
    file_path = f"{file_prefix}_with_sentence_sentiment_score.parquet"
    df = pd.read_parquet(file_path)

    # Words per text component (split on any whitespace).
    df['word_count'] = df['componenttext'].str.split().str.len()

    total_wordcount = (
        df.groupby('transcriptid')['word_count'].sum()
        .reset_index()
        .rename(columns={'word_count': 'total_word_count'})
    )

    # One column per section. reindex guarantees that both 'Pre' and 'QA'
    # exist (filled with 0) even when a section is absent from this file;
    # any other section label is not part of the output.
    section_wordcount = (
        df.groupby(['transcriptid', 'section'])['word_count'].sum()
        .unstack(fill_value=0)
        .reindex(columns=['Pre', 'QA'], fill_value=0)
        .rename(columns={'Pre': 'pre_word_count', 'QA': 'qa_word_count'})
        .reset_index()
    )

    date_info = df.groupby('transcriptid')['mostimportantdateutc'].first().reset_index()

    # Merge the data
    final_df = total_wordcount.merge(section_wordcount, on='transcriptid', how='left')
    final_df = final_df.merge(date_info, on='transcriptid', how='left')
    final_df = final_df[['transcriptid', 'mostimportantdateutc', 'total_word_count', 'pre_word_count', 'qa_word_count']]

    # Save the result
    output_path = f"/content/drive/My Drive/Thesis/{year}_ec_words_count.parquet"
    final_df.to_parquet(output_path, index=False)

    # Release memory
    del df, total_wordcount, section_wordcount, date_info, final_df
    gc.collect()


In [None]:
def step5_merge_sentiment_wordcount(year, file_prefix):
    """Attach the word counts to the aggregated sentiment table.

    Reads ``regression_data_X_{year}_v2.parquet`` and
    ``{year}_ec_words_count.parquet`` from the Thesis folder on Drive,
    left-joins the word counts onto the sentiment rows via ``transcriptid``
    and writes ``regression_data_X_{year}_v3.parquet`` to the same folder.

    Args:
        year: Used in the progress message and in all three file names.
        file_prefix: Not used by this step; kept so every step shares the
            same call shape.

    Returns:
        None. The result is written to disk.
    """
    print(f"🔹 Step 5: Final merge for year {year}")

    sentiment_path = f"/content/drive/My Drive/Thesis/regression_data_X_{year}_v2.parquet"
    wordcount_path = f"/content/drive/My Drive/Thesis/{year}_ec_words_count.parquet"
    output_path = f"/content/drive/My Drive/Thesis/regression_data_X_{year}_v3.parquet"

    sentiment_table = pd.read_parquet(sentiment_path)
    wordcount_table = pd.read_parquet(wordcount_path)

    # Left join: every sentiment row is kept, with or without word counts.
    combined = sentiment_table.merge(wordcount_table, on='transcriptid', how='left')
    combined.to_parquet(output_path, index=False)

    # Release memory
    del sentiment_table, wordcount_table, combined
    gc.collect()


In [None]:
def run_all_years(cleaned_keywords, keyword_to_metatopic, id_link, years=None):
    """Run the five processing steps, in order, for each requested year.

    For every year the input is expected at
    ``/content/drive/My Drive/Thesis/{year}_ec_data.parquet``; each step reads
    the files written by the previous one.

    Args:
        cleaned_keywords: Keyword list passed to step 1.
        keyword_to_metatopic: Keyword -> metatopic dict passed to step 2.
        id_link: Link table passed to step 3.
        years: Iterable of years to process. Defaults to 2015 through 2024,
            the range that used to be hard-coded; pass e.g. ``[2019]`` to
            (re)run a single year without editing the function.

    Returns:
        None.
    """
    if years is None:
        years = range(2015, 2025)

    for year in years:
        print(f"\n\n📅 ===== Starting processing for year: {year} =====")
        file_prefix = f"/content/drive/My Drive/Thesis/{year}_ec_data"

        step1_generate_embeddings(year, file_prefix, cleaned_keywords)
        step2_sentiment_analysis(year, file_prefix, keyword_to_metatopic)
        step3_topic_sentiment_aggregation(year, file_prefix, id_link)
        step4_word_count(year, file_prefix)
        step5_merge_sentiment_wordcount(year, file_prefix)

        print(f"✅ Completed processing for year: {year}")



In [None]:
# Get signals.
# The saved output below starts at year 2019 and is kept only as an example;
# this function was actually run for every year from 2015 to 2024.
run_all_years(cleaned_keywords, keyword_to_metatopic, id_link)



📅 ===== Starting processing for year: 2019 =====
🔹 Step 1: Embedding for year 2019


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔹 Step 2: Sentiment analysis for year 2019


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 4/21982 [00:01<1:57:04,  3.13it/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

  0%|          | 10/21982 [00:03<2:07:33,  2.87it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 21982/21982 [1:51:15<00:00,  3.29it/s]


🔹 Step 3: Topic sentiment aggregation for year 2019
🔹 Step 4: Word count for year 2019
🔹 Step 5: Final merge for year 2019
✅ Completed processing for year: 2019


📅 ===== Starting processing for year: 2020 =====
🔹 Step 1: Embedding for year 2020
🔹 Step 2: Sentiment analysis for year 2020


Device set to use cuda:0
100%|██████████| 21715/21715 [1:51:25<00:00,  3.25it/s]


🔹 Step 3: Topic sentiment aggregation for year 2020
🔹 Step 4: Word count for year 2020
🔹 Step 5: Final merge for year 2020
✅ Completed processing for year: 2020


📅 ===== Starting processing for year: 2021 =====
🔹 Step 1: Embedding for year 2021
