In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import wandb
from multiprocessing import Pool, cpu_count, Manager

In [5]:
def predict(chunk_idx, headlines_list, stocks_list, result_list):
    """Score one chunk of headlines with FinBERT and store the rows.

    Args:
        chunk_idx: Index of the chunk this call is responsible for.
        headlines_list: Sequence of headline chunks; only
            ``headlines_list[chunk_idx]`` is read.
        stocks_list: Sequence of stock chunks parallel to ``headlines_list``.
        result_list: Indexable container (a ``Manager().list()`` in the
            script below); slot ``chunk_idx`` is overwritten with a list of
            ``(headline, stock, pos, neg, neutr)`` tuples.

    The three probabilities are softmax columns 0, 1 and 2 of the model
    output, in that order.
    NOTE(review): labelling them positive / negative / neutral assumes the
    checkpoint's id2label mapping uses that order - confirm against the
    model config.
    """
    print(f"Processing chunk {chunk_idx}")

    # Each call loads its own tokenizer and model instance.
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    # Put the network into evaluation mode; gradient tracking is switched
    # off separately by the torch.no_grad() context below.
    model.eval()

    headlines = headlines_list[chunk_idx]
    stocks = stocks_list[chunk_idx]

    # The whole chunk is encoded as a single padded batch.
    encoded = tokenizer(headlines, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # One row of class probabilities per headline, as plain Python floats.
    probability_rows = probabilities.tolist()

    result_list[chunk_idx] = [
        (headline, stock, row[0], row[1], row[2])
        for headline, stock, row in zip(headlines, stocks, probability_rows)
    ]


def load_data(csv_path='D:/AIML projects/Financial Sentiment Analysis/5000_preprocessed_analyst_ratings.csv'):
    """Read the headlines CSV and return its headline and stock columns.

    Args:
        csv_path: Location of the CSV file. Defaults to the path the script
            was originally written against, so ``load_data()`` keeps
            working unchanged.

    Returns:
        A ``(headlines_list, stocks_list)`` tuple of equal-length lists.
        Headlines come from the third column (position 2) and stocks from
        the last column of the file; columns are selected by position, not
        by name.
    """
    headlines_df = pd.read_csv(csv_path)
    # Pull just the two needed columns instead of converting the whole
    # frame to an array first.
    headlines_list = headlines_df.iloc[:, 2].tolist()
    stocks_list = headlines_df.iloc[:, -1].tolist()
    return headlines_list, stocks_list


def chunk_data(data_list, chunk_size):
    """Split ``data_list`` into consecutive slices of ``chunk_size`` items.

    The final slice holds whatever is left over and may be shorter than
    ``chunk_size``. An empty input produces an empty list.
    """
    pieces = []
    for start in range(0, len(data_list), chunk_size):
        stop = start + chunk_size
        pieces.append(data_list[start:stop])
    return pieces


if __name__ == '__main__':
    # Script entry point: load the headlines, score them in parallel worker
    # processes, then log every scored row to Weights & Biases as one table.

    # Load input data
    headlines_list, stocks_list = load_data()

    # Split the input into at most one chunk per CPU core. Ceiling division
    # guarantees the chunks cover every row (floor division would leave a
    # trailing remainder chunk that no worker ever picks up), and the lower
    # bound of 1 keeps the chunk size valid when there are fewer rows than
    # cores.
    max_workers = cpu_count()
    chunk_size = max(1, -(-len(headlines_list) // max_workers))
    headlines_list_chunks = chunk_data(headlines_list, chunk_size)
    stocks_list_chunks = chunk_data(stocks_list, chunk_size)

    # The real number of chunks can be smaller than the core count, so size
    # the pool and the result slots from the data rather than from
    # cpu_count().
    num_chunks = len(headlines_list_chunks)

    # Shared list with one slot per chunk; each worker writes its own slot.
    manager = Manager()
    result_list = manager.list([[] for _ in range(num_chunks)])

    # Pool() rejects a worker count of zero, so skip it for empty input.
    if num_chunks:
        with Pool(processes=num_chunks) as pool:
            # Submit one task per chunk and keep the handles.
            pending = [
                pool.apply_async(
                    predict,
                    args=(idx, headlines_list_chunks, stocks_list_chunks, result_list),
                )
                for idx in range(num_chunks)
            ]

            # Wait for all worker processes to complete
            pool.close()
            pool.join()

            # apply_async stores worker exceptions on the handle; get()
            # re-raises them here so a failed chunk is not silently logged
            # as an empty result.
            for task in pending:
                task.get()

    # Consolidate results from the shared result list
    headlines_table = wandb.Table(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])
    for chunk_results in result_list:
        for headline, stock, pos, neg, neutr in chunk_results:
            headlines_table.add_data(headline, stock, pos, neg, neutr)

    # Log the consolidated results to wandb
    wandb.init(project="Financial_Sentiment_Analysis")
    wandb.run.log({"Financial Sentiment Analysis Table": headlines_table})
    wandb.run.finish()
