In [8]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch
from tqdm import tqdm

In [9]:
# Attach Google Drive to the Colab VM's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [10]:
# Use the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Running on device: {device}")

Running on device: cuda


In [16]:
import pandas as pd

# CSV exports to merge into a single frame.
file_names = [
    '/content/boereport_articles.csv',
    '/content/boereport_articles_2.csv',
    '/content/boereport_articles_3.csv'
]

dfs = []

for file in file_names:
    try:
        print(f"Reading: {file}")
        # on_bad_lines='skip' drops malformed rows instead of aborting the file.
        df = pd.read_csv(
            file,
            engine='python',
            on_bad_lines='skip',
            quotechar='"'
        )
        dfs.append(df)
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files.
        print(f"⚠️ Error reading {file}: {e}")

# pd.concat([]) would fail with an opaque "No objects to concatenate".
if not dfs:
    raise ValueError(f"None of the input files could be read: {file_names}")

merged_df = pd.concat(dfs, ignore_index=True)

# Handle date format mismatch between files.
# With pandas >= 2.0, to_datetime infers ONE format from the first value and,
# under errors='coerce', silently turns every value in a different format into
# NaT. format='mixed' (pandas >= 2.0) infers the format per value instead.
# NOTE(review): per-value inference can misread ambiguous day/month orders —
# pass dayfirst=... if any source uses DD/MM.
raw_dates = merged_df['date']
merged_df['date'] = pd.to_datetime(raw_dates, errors='coerce', format='mixed')

# Make parse failures visible instead of losing those rows silently later on.
unparsed_count = int((merged_df['date'].isna() & raw_dates.notna()).sum())
if unparsed_count:
    print(f"⚠️ {unparsed_count} rows have a date that could not be parsed (set to NaT)")

# Rows with NaT dates are kept; uncomment to drop them here instead.
# merged_df = merged_df.dropna(subset=['date'])

# Sort oldest to newest (NaT rows sort last by default).
merged_df = merged_df.sort_values(by='date', ascending=True).reset_index(drop=True)


Reading: /content/boereport_articles.csv
Reading: /content/boereport_articles_2.csv
Reading: /content/boereport_articles_3.csv


In [17]:
# Preview the five oldest articles after the merge and sort.
merged_df.head(5)

Unnamed: 0,date,title,description
0,2012-12-03,VIDEO: Waterflood Enhanced Recovery,[Read more]
1,2012-12-10,MEG Energy Announces 2013 Capital Budget and P...,MEG Energy Corp. released its 2013 capital bud...
2,2012-12-10,YES. YES. Harper approves Nexen and Progress d...,"By Stephanie Levitz and Craig Wong, The Canadi..."
3,2012-12-10,"Progress, Petronas say government approval of ...",By The Canadian Press CALGARY - Progress Energ...
4,2012-12-10,Surge Announces Operations Update,"Surge has drilled, completed and placed on pro..."


In [18]:
# Persist the merged, date-sorted articles (the row index is not written).
merged_output_path = 'merged_output.csv'
merged_df.to_csv(merged_output_path, index=False)

In [19]:
# FinBERT checkpoint used for both the tokenizer and the classifier.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the classifier, move it to the selected device and switch to inference mode.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
model.eval()

# Build the model input as "<title>. <description>", treating missing parts as empty.
titles = merged_df["title"].fillna('')
descriptions = merged_df["description"].fillna('')
merged_df["text"] = titles + ". " + descriptions


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [20]:
# Function to get sentiment score
def get_sentiment_score(text):
    """Score one text with the loaded classifier as P(positive) - P(negative).

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : str
        Text to classify. Input longer than 512 tokens is truncated.

    Returns
    -------
    float
        Difference between the first and second class probabilities, in
        [-1, 1]. ``nan`` is returned when scoring fails, so that failed rows
        are skipped by pandas aggregations (e.g. ``mean``) instead of being
        counted as a perfectly neutral 0.0.
    """
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
        with torch.no_grad():
            # Single-text batch: take row 0 of the logits and move it to NumPy.
            logits = model(**inputs).logits[0].cpu().numpy()
        probs = softmax(logits)
        # FinBERT label order: [pos, neg, neutral]
        # NOTE(review): indices are hard-coded; confirm against model.config.id2label.
        return float(probs[0] - probs[1])  # Positive - Negative
    except Exception as e:
        # Best effort: log and mark the row as missing rather than neutral.
        print(f"Error: {e}")
        return float("nan")

In [21]:
# Score every article, showing a progress bar while the model runs.
tqdm.pandas(desc="Computing sentiment on GPU")
article_scores = merged_df["text"].progress_apply(get_sentiment_score)
merged_df["sentiment_score"] = article_scores

# Average the article scores per publication date.
daily_sentiment = (
    merged_df
    .groupby("date")["sentiment_score"]
    .mean()
    .reset_index()
)

# Write the daily series to disk and echo it.
daily_sentiment.to_csv("daily_commodity_sentiment_gpu.csv", index=False)

print("✅ Done! Daily sentiment saved to daily_commodity_sentiment_gpu.csv")
print(daily_sentiment)

Computing sentiment on GPU: 100%|██████████| 33544/33544 [06:39<00:00, 83.93it/s]

✅ Done! Daily sentiment saved to daily_commodity_sentiment_gpu.csv
           date  sentiment_score
0    2012-12-03         0.045655
1    2012-12-10         0.584642
2    2012-12-11         0.403724
3    2012-12-12         0.553235
4    2012-12-13         0.086875
...         ...              ...
3753 2025-10-06         0.065642
3754 2025-10-07         0.244831
3755 2025-10-08         0.025787
3756 2025-10-09        -0.146000
3757 2025-10-10        -0.360694

[3758 rows x 2 columns]



