In [None]:
import pandas as pd
import os,tqdm
import gc
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Input CSV, resolved relative to the current working directory.
finance_news = os.path.join(".", "FINANCIAL NEWS", "finance_data.csv")

# Stop early, with enough context to locate the problem, if the file is missing.
if not os.path.exists(finance_news):
    print(f"File not found at: {os.path.abspath(finance_news)}")
    print("Current working directory:", os.getcwd())
    print("Directory contents:", os.listdir(os.getcwd()))
    # Non-zero status so a calling shell or scheduler can tell the run failed
    # (bare exit() reports success and depends on the `site` module).
    raise SystemExit(1)

# Read the CSV file
df = pd.read_csv(finance_news)

# Remember the starting row count so later steps can confirm nothing was lost.
initial_rows = len(df)
print(f"Initial row count: {initial_rows}")

# Every column used below must be present in the file.
required_columns = ['Date', 'Article_title', 'Stock_symbol', 'Article']
if not all(col in df.columns for col in required_columns):
    print("Error: Missing required columns. Available columns:", list(df.columns))
    raise SystemExit(1)

# Combine title and body into one string per row. Missing values on EITHER
# side become "" first: astype(str) alone would turn a missing title into
# the literal text "nan", which would then be scored as if it were real content.
text = (
    df['Article_title'].fillna('').astype(str)
    + ' '
    + df['Article'].fillna('').astype(str)
)

# Build the working frame directly from the columns that are kept, instead of
# adding a column to a sliced copy of df (which can raise SettingWithCopyWarning).
df1 = df[['Date', 'Stock_symbol']].copy()
df1['text'] = text

# Drop the references to the raw frame and the temporary Series to free memory.
del df, text
gc.collect()

# FinBERT tokenizer for the "ProsusAI/finbert" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Matching sequence-classification model: load it, place it on the selected
# device, and switch it to evaluation mode. Both .to() and .eval() return the
# module itself, so the chained result is the same model object.
model = (
    AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    .to(device)
    .eval()
)

# Batch inference function with mixed precision
def batch_sentiment_scores(texts, batch_size=32):
    """Classify each text with the module-level FinBERT model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of strings. Missing values (``None``/NaN) and
            empty or whitespace-only strings are not sent to the model.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        A ``(scores, sentiments)`` pair of lists, each as long as ``texts``.
        ``scores[i]`` is the highest softmax probability for text ``i`` and
        ``sentiments[i]`` the corresponding label. Missing or blank texts get
        ``np.nan`` and ``"neutral"``.
    """
    # Normalise missing / blank entries to "" so they can be skipped below.
    cleaned = [
        "" if pd.isna(text) or str(text).strip() == "" else str(text)
        for text in texts
    ]

    # Index -> label. NOTE(review): expected to match model.config.id2label
    # for the loaded checkpoint; confirm if the checkpoint ever changes.
    labels = ['positive', 'negative', 'neutral']

    # Pre-fill with the placeholder used for blank texts; real results
    # overwrite these entries by position.
    scores = [np.nan] * len(cleaned)
    sentiments = ["neutral"] * len(cleaned)

    # Only non-blank texts are worth a forward pass.
    pending = [i for i, text in enumerate(cleaned) if text]
    if not pending:
        return scores, sentiments

    # Mixed precision only on CUDA; on CPU run in full precision.
    use_amp = device.type == "cuda"
    starts = range(0, len(pending), batch_size)

    with torch.no_grad():
        # `tqdm` is bound to the module (``import tqdm``), so the progress-bar
        # class has to be reached as ``tqdm.tqdm``.
        for start in tqdm.tqdm(starts, desc="Processing batches", total=len(starts)):
            batch_idx = pending[start:start + batch_size]

            # Tokenize one batch at a time: padding is to the longest text in
            # the batch, and the whole corpus is never held as one tensor.
            inputs = tokenizer(
                [cleaned[i] for i in batch_idx],
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True,
            )
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            # autocast requires an explicit device_type.
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Softmax in float32 so reduced-precision logits do not affect
            # the reported probabilities.
            probs = torch.nn.functional.softmax(outputs.logits.float(), dim=-1)
            probs = probs.cpu().numpy()

            for i, confidence, top in zip(batch_idx, probs.max(axis=1), probs.argmax(axis=1)):
                scores[i] = float(confidence)
                sentiments[i] = labels[top]

    return scores, sentiments

# Score every row's text, then attach confidence and label as new columns.
scores, sentiments = batch_sentiment_scores(
    df1['text'].tolist(),
    batch_size=32,
)
df1['sentiment'] = sentiments
df1['sentiment_score'] = scores
# Keep the score column ahead of the label column in the saved file.
df1 = df1[[c for c in df1.columns if c != 'sentiment'] + ['sentiment']]

# Sanity check: the frame should still have as many rows as the input file.
final_rows = len(df1)
print(f"Final row count: {final_rows}")
if final_rows != initial_rows:
    print(f"Error: Rows were dropped during processing. Initial rows: {initial_rows}, Final rows: {len(df1)}")

# Preview the result and report how many rows have no score.
print("\nDataFrame with sentiment scores:")
print(df1.head())
missing_scores = df1['sentiment_score'].isna().sum()
print("\nNumber of missing sentiment scores:", missing_scores)

# Write the enriched frame next to the input file, uncompressed, without the index.
output_file = os.path.join(".", "FINANCIAL NEWS", "finance_data_with_sentiment.csv")
df1.to_csv(output_file, index=False, compression=None)
print(f"DataFrame saved to: {output_file}")

# Release the frame now that it has been written to disk.
del df1
gc.collect()

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


  df = pd.read_csv(finance_news)


Initial row count: 13057514
