<a href="https://colab.research.google.com/github/IG-Reagan/Bank-of-England_Cambridge-ICE_NLP_Analysing-Quarterly-Announcements-of-GSIBs/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the paths
# used by later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd

# Source workbook with the question/answer pairs to analyse.
path = '/content/drive/MyDrive/QA_analysis_HybrRAG_Mistral_JPMC_all_original_.xlsx'

if os.path.exists(path):
    try:
        # Read the Excel file into a pandas DataFrame; `df` is reused by later cells.
        df = pd.read_excel(path)

        # Define the output CSV file path
        csv_file_path = '/content/drive/MyDrive/QA_analysis_HybrRAG_Mistral_JPMC_all_original_.csv'

        # Let pandas write the file itself: this avoids holding the whole CSV
        # in an intermediate string and pins the encoding to UTF-8 instead of
        # relying on the platform's default text encoding and newline handling.
        # index=False keeps the row indices out of the file.
        df.to_csv(csv_file_path, index=False, encoding="utf-8")

        print(f"Excel file successfully converted to CSV at: {csv_file_path}")

    except Exception as e:
        # Notebook-level boundary: report the problem instead of aborting the cell.
        # NOTE: if reading failed, `df` is not defined and later cells will fail.
        print(f"An error occurred: {e}")

else:
    print(f"File not found at {path}")

In [None]:
# Bare expression as the last statement of the cell: the notebook renders df.
df

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline, AutoTokenizer

In [None]:
import numpy as np
import nltk
# Fetch the 'punkt_tab' NLTK resource before importing the sentence tokenizer.
# NOTE(review): presumably this is the data sent_tokenize needs — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

In [None]:
# Load the tokenizer and the sentiment-analysis pipeline from a single model id
# so the two cannot drift apart if the checkpoint is changed.
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Hand the already-loaded tokenizer to the pipeline: token counts computed with
# `tokenizer` elsewhere then match what the classifier sees, and the tokenizer
# is not loaded a second time.
classifier = pipeline("sentiment-analysis", model=MODEL_NAME, tokenizer=tokenizer)

In [None]:
def _split_oversized_sentence(sentence, budget):
    """Greedily pack the whitespace-separated words of *sentence* into pieces.

    Returns a list of ``(piece_text, token_count)`` tuples where each piece
    holds at most *budget* tokens, except when a single word alone exceeds
    *budget* (that word is then emitted as its own piece).
    """
    pieces = []
    words, count = [], 0
    for word in sentence.split():
        word_tokens = len(tokenizer.encode(word, add_special_tokens=False))
        if words and count + word_tokens > budget:
            pieces.append((" ".join(words), count))
            words, count = [], 0
        words.append(word)
        count += word_tokens
    if words:
        pieces.append((" ".join(words), count))
    return pieces


# Function to chunk text into pieces that fit within max_tokens
def chunk_text(text, max_tokens=512, overlap_ratio=0.1):
    """Split *text* into sentence-aligned chunks that fit in *max_tokens*.

    Args:
        text: The text to split.
        max_tokens: Maximum encoded length of a chunk, counting the special
            tokens the tokenizer adds around a single sequence.
        overlap_ratio: Fraction of a finished chunk's sentences (rounded down)
            that is repeated at the start of the next chunk.

    Returns:
        ``[text]`` unchanged when it already fits; otherwise a list of chunk
        strings, each made of whole sentences joined by single spaces.
        Sentences that are too long on their own are split on whitespace.
    """
    # Reserve room for the special tokens added when a chunk is encoded for the
    # model, so a "full" chunk does not overflow max_tokens afterwards.
    budget = max(1, max_tokens - tokenizer.num_special_tokens_to_add(pair=False))

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= budget:
        return [text]  # Return as is if within limit

    # Tokenize every sentence exactly once and remember its length, so that
    # rebuilding the overlap does not require re-encoding.
    units = []
    for sentence in sent_tokenize(text):
        sentence_tokens = len(tokenizer.encode(sentence, add_special_tokens=False))
        if sentence_tokens > budget:
            units.extend(_split_oversized_sentence(sentence, budget))
        else:
            units.append((sentence, sentence_tokens))

    chunks = []
    current_chunk = []  # (text, token_count) pairs of the chunk being built
    token_count = 0
    for unit_text, unit_tokens in units:
        # Only flush a non-empty chunk; this avoids emitting "" as a chunk.
        if current_chunk and token_count + unit_tokens > budget:
            chunks.append(" ".join(piece for piece, _ in current_chunk))

            # Carry the tail of the finished chunk over as overlap. A count of
            # zero must reset the chunk explicitly: slicing with [-0:] would
            # keep the entire chunk and let it grow without bound.
            overlap_count = int(len(current_chunk) * overlap_ratio)
            current_chunk = current_chunk[-overlap_count:] if overlap_count > 0 else []
            token_count = sum(count for _, count in current_chunk)

            # Shrink the overlap until the incoming unit fits next to it.
            while current_chunk and token_count + unit_tokens > budget:
                token_count -= current_chunk.pop(0)[1]

        current_chunk.append((unit_text, unit_tokens))
        token_count += unit_tokens

    if current_chunk:
        chunks.append(" ".join(piece for piece, _ in current_chunk))

    return chunks

In [None]:
# Function to compute sentiment for each chunk of a text and aggregate
def analyze_sentiment(text):
    """Classify *text* chunk by chunk and aggregate the chunk results.

    Args:
        text: The text to analyse; it is split with ``chunk_text``.

    Returns:
        A ``(final_label, avg_score)`` tuple. ``final_label`` is the label
        predicted for the most chunks (ties go to the label seen first).
        ``avg_score`` is the mean of the per-chunk ``"score"`` values over
        all chunks, regardless of which label each chunk received.

    Raises:
        ValueError: If chunking produced no chunks to classify.
    """
    from collections import Counter

    # Same limit as chunk_text's default max_tokens; truncation is a safety
    # net so a chunk that is still too long is cut instead of raising.
    max_length = 512

    chunks = chunk_text(text)
    if not chunks:
        raise ValueError("analyze_sentiment: no text chunks to classify")

    results = [
        classifier(chunk, truncation=True, max_length=max_length)[0]
        for chunk in chunks
    ]
    sentiment_scores = [result["score"] for result in results]
    sentiment_labels = [result["label"] for result in results]

    # Aggregation
    avg_score = np.mean(sentiment_scores)  # Average sentiment score
    # Majority vote. Counter.most_common keeps first-seen order among equal
    # counts, so ties resolve the same way on every run (unlike iterating a set).
    final_label = Counter(sentiment_labels).most_common(1)[0][0]

    return final_label, avg_score

In [None]:
# Iterate through the DataFrame and perform sentiment analysis on the
# 'question' and 'answer' columns of every row.
for index, row in df.iterrows():
    # Each column gets its own try/except so that a failure on one text does
    # not overwrite a valid result already computed for the other.
    for column in ('question', 'answer'):
        try:
            label, score = analyze_sentiment(row[column])
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            # Sentinel values marking a row/column that could not be analysed.
            label, score = "Error", -1

        df.at[index, f'{column}_sentiment'] = label
        df.at[index, f'{column}_sentiment_score'] = score

print(df.head())

In [None]:
# Bare expression as the last statement of the cell: the notebook renders df,
# which at this point carries the sentiment columns written by the loop above.
df

In [None]:
# Persist the annotated DataFrame as an Excel workbook on the mounted Drive.
output_xlsx_path = '/content/drive/MyDrive/QA_analysis_HybrRAG_Mistral_JPMC_all_original_with_sentiment_analysis.xlsx'

# index=False leaves the row indices out of the workbook.
df.to_excel(excel_writer=output_xlsx_path, index=False)

print(f"DataFrame successfully converted to xlsx at: {output_xlsx_path}")


In [None]:
# Tally how often each label occurs in the question and answer sentiment columns.
question_sentiment_counts, answer_sentiment_counts = (
    df[column].value_counts()
    for column in ('question_sentiment', 'answer_sentiment')
)

# Report both tallies.
print("Question Sentiment Counts:\n", question_sentiment_counts)
print("\nAnswer Sentiment Counts:\n", answer_sentiment_counts)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tally how many questions received each sentiment label
sentiment_counts = df['question_sentiment'].value_counts()

# Draw the tallies as a bar chart on an explicitly created 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Question Sentiments')
ax.set_xlabel('Sentiment Label')
ax.set_ylabel('Number of Questions')
plt.show()


In [None]:
# Tally how many answers received each sentiment label
sentiment_counts_answer = df['answer_sentiment'].value_counts()

# Draw the tallies as a bar chart on an explicitly created 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sentiment_counts_answer.index, y=sentiment_counts_answer.values, ax=ax)
ax.set_title('Distribution of Answer Sentiments')
ax.set_xlabel('Sentiment Label')
ax.set_ylabel('Number of Answers')
plt.show()
