<a href="https://colab.research.google.com/github/Kussil/Financial_Sentiment_LLM/blob/main/LLama_Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPU info

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jun 25 05:14:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              44W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Connecting to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installing and importing libraries

In [None]:
# Install necessary libraries
!pip install -q -U langchain langchain_community transformers bitsandbytes accelerate

In [None]:
import os
import json
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import HfFolder, HfApi
from google.colab import userdata
import re
import time

# Authenticating with Hugging Face using a Colab secret token

In [None]:
# Retrieve the Hugging Face token from Colab secrets and verify that it is accepted.
hf_token = userdata.get('HF_TOKEN')
if hf_token:
    # Store the token locally so later Hugging Face calls in this runtime can pick it up
    HfFolder.save_token(hf_token)
    api = HfApi()
    try:
        # whoami() raises when the token is rejected or the Hub is unreachable;
        # it does not signal failure by returning a falsy value.
        user_info = api.whoami(token=hf_token)
    except Exception as err:
        user_info = None
        print(f"Hugging Face request failed: {err}")
    if user_info:
        print("Connection to Hugging Face was successful.")
    else:
        print("Failed to connect to Hugging Face. Please check your token.")
else:
    # The token is read from Colab secrets (userdata), not from an environment variable
    print("Hugging Face token not found. Please add an HF_TOKEN secret in the Colab Secrets panel.")


Connection to Hugging Face was successful.


# Connecting to GitHub Repo

In [None]:
# Import GitHub token with Google secrets and clone the repository
# The token is read from the Colab secret named 'github' and also exported as the
# GITHUB_TOKEN environment variable for this runtime.
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
# IPython substitutes {GITHUB_TOKEN} into the shell command before running it.
# NOTE(review): because the token is part of the clone URL, git will presumably keep it
# in the checkout's remote configuration - confirm that is acceptable for this runtime.
!git clone  https://{GITHUB_TOKEN}@github.com/Kussil/Financial_Sentiment_LLM.git


Cloning into 'Financial_Sentiment_LLM'...
remote: Enumerating objects: 2138, done.[K
remote: Counting objects: 100% (543/543), done.[K
remote: Compressing objects: 100% (234/234), done.[K
remote: Total 2138 (delta 329), reused 485 (delta 297), pack-reused 1595[K
Receiving objects: 100% (2138/2138), 444.70 MiB | 13.95 MiB/s, done.
Resolving deltas: 100% (1412/1412), done.
Updating files: 100% (1111/1111), done.


# Llama prompt template and classification function

In [None]:
# Few-shot prompt for the sentiment classifier.
# - Contains three worked examples (article + expected dictionary) followed by the
#   article to classify and the exact output format requested from the model.
# - `{article}` is the only placeholder; it is filled with str.format(article=...).
# - Doubled braces `{{` / `}}` are str.format escapes and render as literal `{` / `}`.
# NOTE(review): the leading `<s>` and trailing `</s>` are written as plain text inside
# the string - confirm the tokenizer in use treats them the way this prompt expects.
TEMPLATE = """<s>Classify the following article into categories with sentiment (Positive, Neutral, Negative, N/A if not applicable) and provide the output in the specified dictionary format.
Example:
Article: ExxonMobil announced a significant increase in quarterly profits due to rising oil prices and increased production levels.
Output: {{'Finance': 'Positive', 'Production': 'Positive', 'Reserves / Exploration / Mergers': 'Neutral', 'Environment / Regulatory': 'Neutral', 'Alternative Energy': 'Neutral', 'Oil/Gas Prices': 'Positive'}}

Example:
Article: Chevron plans to invest heavily in renewable energy projects, aiming to reduce its carbon footprint over the next decade.
Output: {{'Finance': 'Neutral', 'Production': 'Neutral', 'Reserves / Exploration / Mergers': 'Neutral', 'Environment / Regulatory': 'Positive', 'Alternative Energy': 'Positive', 'Oil/Gas Prices': 'Neutral'}}

Example:
Article: BP faced regulatory challenges in its latest drilling project, delaying operations and increasing costs.
Output: {{'Finance': 'Negative', 'Production': 'Negative', 'Reserves / Exploration / Mergers': 'Negative', 'Environment / Regulatory': 'Negative', 'Alternative Energy': 'Neutral', 'Oil/Gas Prices': 'Neutral'}}


Article: {article}

Output only the EXACT dictionary format:
{{'Finance': '[Sentiment]', 'Production': '[Sentiment]', 'Reserves / Exploration / Mergers': '[Sentiment]', 'Environment / Regulatory': '[Sentiment]', 'Alternative Energy': '[Sentiment]', 'Oil/Gas Prices': '[Sentiment]'}}
</s>"""

In [None]:


def _extract_sentiment_dict(text):
    """Parse the first flat ``{...}`` dictionary in ``text``; return None if there is none or it is not valid JSON."""
    match = re.search(r"\{[^{}]*\}", text)
    if match is None:
        return None
    try:
        # The model is asked for single-quoted keys/values; JSON needs double quotes
        return json.loads(match.group(0).replace("'", '"'))
    except json.JSONDecodeError:
        return None


def process_article(article, model, tokenizer, generation_pipeline):
    """
    Classify an article into the predefined categories with a sentiment per category.

    The article is inserted into TEMPLATE, the model generates a continuation, and the
    first dictionary found in that continuation is returned.

    Only the article text is shortened to fit the prompt budget. Truncating the whole
    prompt would cut off the instructions that follow the article. Only the newly
    generated tokens are parsed, so a dictionary from the few-shot examples in the
    prompt can never be mistaken for the model's answer.

    Args:
        article (str): The text of the article to be processed (converted with ``str``).
        model: Language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the generated tokens.
        generation_pipeline: Accepted for interface compatibility; not used here,
            generation goes through ``model.generate`` directly.

    Returns:
        dict or str: The parsed dictionary of category -> sentiment, or the message
            "No valid JSON output found after 5 attempts." if no attempt produced one.
    """
    max_prompt_tokens = 1024   # total prompt budget, as before
    max_attempts = 5
    safety_margin = 16         # slack for token-count drift when the cut article is re-encoded

    article = str(article)

    # Tokens used by the fixed part of the prompt (instructions, examples, special tokens)
    template_tokens = len(tokenizer(TEMPLATE.format(article=""))["input_ids"])
    article_budget = max(max_prompt_tokens - template_tokens - safety_margin, 0)

    # Shorten the article itself so the instructions after it survive
    article_ids = tokenizer(article, add_special_tokens=False)["input_ids"]
    if len(article_ids) > article_budget:
        article = tokenizer.decode(article_ids[:article_budget], skip_special_tokens=True)

    # Prompt and encoding do not change between attempts, so build them once
    prompt = TEMPLATE.format(article=article)
    inputs = tokenizer(prompt, max_length=max_prompt_tokens, truncation=True, return_tensors='pt')

    # Move inputs to the correct device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    for _ in range(max_attempts):
        result = model.generate(**inputs, max_new_tokens=200, num_beams=2, early_stopping=True)

        # Drop the echoed prompt; keep only what the model generated
        completion = tokenizer.decode(result[0][prompt_length:], skip_special_tokens=True)

        sentiments = _extract_sentiment_dict(completion)
        if sentiments is not None:
            return sentiments

    return "No valid JSON output found after 5 attempts."


# Model and Pipeline settings

In [None]:
# Load the LLaMA model and tokenizer
# Hub id of the checkpoint, shared by the model and tokenizer loads
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 weights with double quantization; computation runs in float16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# `token=` replaces the deprecated `use_auth_token=` argument of from_pretrained
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    token=hf_token
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)

# Create text generation pipeline
# NOTE: process_article receives this pipeline but generates through model.generate,
# so the sampling settings below only take effect when the pipeline itself is called.
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=1024,
    truncation=True,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Data processing

In [None]:
# Read every cleaned source CSV from the cloned repository into its own DataFrame
sec_df = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/SEC_Filings.csv')
proquest_df = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/ProQuest_Articles.csv')
invest_df1 = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/Investment_Research_Part1.csv')
invest_df2 = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/Investment_Research_Part2.csv')
earnings_qa = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/Earnings_QA.csv')
earnings_presentations = pd.read_csv('/content/Financial_Sentiment_LLM/02_Cleaned_Data/Earnings_Presentations.csv')

# Stack the sources (in this fixed order) into one frame with a fresh 0..n-1 index
source_frames = [invest_df1, invest_df2, proquest_df, sec_df, earnings_presentations, earnings_qa]
text_df = pd.concat(source_frames, ignore_index=True)

# Sanity check: overall shape, then the first and last rows
for preview in (text_df.shape, text_df.head(), text_df.tail()):
    display(preview)

(10126, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,IR-1,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,IR-2,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,IR-3,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,IR-4,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,IR-5,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol: COP ...",


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
10121,Earnings Call Q&A,EQ-338,XOM,Feb-02-2021,"Exxon Mobil Corporation, Q4 2020 Earnings Call...",Question and Answer\nOperator\n[Operator Instr...,
10122,Earnings Call Q&A,EQ-339,COP,Feb-02-2021,"ConocoPhillips, Q4 2020 Earnings Call, Feb 02,...",Question and Answer\nOperator\n[Operator Instr...,
10123,Earnings Call Q&A,EQ-340,EOG,May-03-2019,"EOG Resources, Inc., Q1 2019 Earnings Call, Ma...",Question and Answer\nOperator\n[Operator Instr...,
10124,Earnings Call Q&A,EQ-341,SHEL,May-02-2019,"Royal Dutch Shell plc, Q1 2019 Earnings Call, ...",Question and Answer\nOperator\n[Operator Instr...,
10125,Earnings Call Q&A,EQ-342,COP,Apr-30-2019,"ConocoPhillips, Q1 2019 Earnings Call, Apr 30,...",Question and Answer\nOperator\n[Operator Instr...,


In [None]:


# Path to the intermediate output file
intermediate_output_file = "/content/drive/MyDrive/Capstone folder/Sentiment_Framework/LLAMA_results/output_results_intermediate.json"
final_output_file = "/content/drive/MyDrive/Capstone folder/Sentiment_Framework/LLAMA_results/output_results_final.json"

# Name of the column containing the article text
article_column = 'Article Text'

# Write a checkpoint after this many newly processed articles
SAVE_EVERY = 1


def _save_results(results, path):
    """Write ``results`` to ``path`` as JSON without leaving a half-written file behind.

    The data is dumped to a sibling ``.tmp`` file and then moved over ``path`` with
    ``os.replace``, so interrupting the cell mid-write cannot truncate the previous
    checkpoint (a truncated checkpoint would fail to load on the next resume).
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as file:
        json.dump(results, file, indent=4)
    os.replace(tmp_path, path)


# Output dictionary to store results
output_dict = {}

# Check if the intermediate output file exists and load it
if os.path.exists(intermediate_output_file):
    with open(intermediate_output_file, "r") as file:
        output_dict = json.load(file)

# Resume by skipping every article whose ID already has a stored result. This also
# works when the checkpoint is empty or its last key is not present in text_df,
# both of which made the previous "index of last key + 1" lookup raise IndexError.
already_done = text_df['Unique_ID'].astype(str).isin(list(output_dict))
pending_df = text_df[~already_done]

# Process each article that has not been processed yet
for processed_count, (index, row) in enumerate(tqdm(pending_df.iterrows(), total=len(pending_df)), start=1):
    unique_id = row['Unique_ID']
    article = row[article_column]

    output = process_article(article, model, tokenizer, generation_pipeline)
    output_dict[unique_id] = output
    print(f"Processed article {unique_id}")

    # Checkpoint every SAVE_EVERY processed articles
    if processed_count % SAVE_EVERY == 0:
        _save_results(output_dict, intermediate_output_file)
        print(f"Intermediate results saved to {intermediate_output_file}")

# Save the final output dictionary to a JSON file
_save_results(output_dict, final_output_file)

print(f"Final results saved to {final_output_file}")

  0%|          | 0/10113 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/10113 [00:21<61:13:32, 21.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processed article IR-14
Intermediate results saved to /content/drive/MyDrive/Capstone folder/Sentiment_Framework/LLAMA_results/output_results_intermediate.json


  0%|          | 1/10113 [00:24<68:32:32, 24.40s/it]


KeyboardInterrupt: 