Calculating sentiment for article titles using our fine-tuned model

In [None]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_DISABLED=true
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModel,
    AdamW,
    AutoConfig,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification
)
!pip install datasets
from datasets import Dataset

import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

In [None]:
class args:
    """Notebook-wide settings, exposed as class attributes (``args.model``)."""

    # Presumably the name/path of the fine-tuned FinBERT checkpoint —
    # NOTE(review): not referenced by the cells visible here; confirm usage.
    model = "finbert-finetune"

In [None]:
# Location of the zipped articles CSV; the path sits under the Google Drive
# mount point, so Drive has to be mounted before this file can be opened.
data_path = "/content/drive/MyDrive/FYP/processed_proquest_articles_with_dates.csv.zip"

In [None]:
import zipfile

# Unpack the archive referenced by `data_path` into a local working folder.
with zipfile.ZipFile(data_path, 'r') as archive:
    archive.extractall(path='/content/extracted_data')

# Load the extracted CSV into a DataFrame and show its first five rows
# as a quick sanity check of the columns and values.
csv_file_path = '/content/extracted_data/processed_proquest_articles_with_dates.csv'
df = pd.read_csv(csv_file_path)
print(df.head(5))

In [None]:
from google.colab import drive
import zipfile
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

# End-to-end sentiment scoring cell:
#   1. mount Google Drive and extract the zipped articles CSV,
#   2. load the FinBERT tokenizer/model,
#   3. classify each article (title + content) and record a signed score,
#   4. save the per-article results back to Drive as a CSV.

# Mount Google Drive so the dataset archive under MyDrive is reachable
drive.mount('/content/drive')

# Extract the zip file
data_path = '/content/drive/MyDrive/FYP/processed_proquest_articles_with_dates.csv.zip'
with zipfile.ZipFile(data_path, 'r') as zip_ref:
    zip_ref.extractall('/content/extracted_data')  # Extract to a folder named 'extracted_data'

# Read the CSV file
csv_file_path = '/content/extracted_data/processed_proquest_articles_with_dates.csv'
df = pd.read_csv(csv_file_path)

# Ensure required columns exist (fail early, before loading the model)
required_columns = ["Company", "Date", "Title", "Content"]
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The dataset must contain the following columns: {required_columns}")

# Load the FinBERT model and tokenizer
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to device and make inference mode explicit (disables dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Class-index -> label mapping comes from the checkpoint's own config.
# A hard-coded list is unsafe: the index order differs between checkpoints
# (ProsusAI/finbert orders its classes positive, negative, neutral).
id2label = model.config.id2label

# Combine Title and Content.
# Missing values are replaced with "" first: NaN + str yields NaN, and the
# tokenizer rejects non-string input.
df['Title_Content'] = (
    df['Title'].fillna("").astype(str) + " " + df['Content'].fillna("").astype(str)
)

# Process each combined text and calculate sentiment
all_texts = df["Title_Content"].tolist()
all_companies = df["Company"].tolist()
all_dates = df["Date"].tolist()

# Store the results in lists (one entry per article, same order as df)
sentiments = []
sentiment_scores = []

# Loop through combined text for prediction
for text in tqdm(all_texts, desc="Processing articles"):
    # Tokenize text; anything beyond the 512-token limit is truncated
    tokenized_text = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    with torch.no_grad():
        # Get model outputs (pass only the keys the model's forward accepts)
        inputs = {
            k: v.to(device)
            for k, v in tokenized_text.items()
            if k in ["input_ids", "attention_mask", "token_type_ids"]
        }
        outputs = model(**inputs)
        logits = outputs.logits
        # Batch size is 1, so [0] selects this article's class probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()[0]

    # Get sentiment label and score of the most probable class
    sentiment_idx = int(np.argmax(probabilities))
    sentiment_label = str(id2label[sentiment_idx]).lower()

    # Score is the winning class probability, negated for negative sentiment.
    # NOTE(review): a "neutral" prediction keeps a positive score here —
    # confirm that downstream consumers expect this.
    sentiment_score = probabilities[sentiment_idx]
    if sentiment_label == "negative":
        sentiment_score = -sentiment_score  # Make negative for negative sentiment

    # Append results
    sentiments.append(sentiment_label)
    sentiment_scores.append(sentiment_score)

# Create a DataFrame to hold the results
rdf = pd.DataFrame({
    "Company": all_companies,
    "Date": all_dates,
    "text": all_texts,
    "sentiment": sentiments,
    "sentiment_score": sentiment_scores
})

# Display the resulting DataFrame
print(rdf.head())

# Save the DataFrame to a CSV file on Drive
output_file_path = '/content/drive/MyDrive/FYP/sentiment_results_title_content.csv'
rdf.to_csv(output_file_path, index=False)
print(f"Results saved to {output_file_path}")
