In [None]:
# https://www.kaggle.com/datasets/notlucasp/financial-news-headlines/

In [None]:
import os
import re
import json
import csv
import datetime
from langchain.prompts import PromptTemplate
from genai.credentials import Credentials
import os
from dotenv import load_dotenv
# Using Generative AI Library
from genai.model import Model
from genai.schemas import GenerateParams
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Suppress all warnings
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the first 1000 rows of data/reuters_headlines.csv into a DataFrame and preview them
df = pd.read_csv("data/reuters_headlines.csv", nrows=1000)
df.head()

In [None]:
df.describe()

In [None]:
# Load variables via python-dotenv, then read the service key and endpoint from the environment.
load_dotenv()
api_url = os.environ.get("GENAI_API")
api_key = os.environ.get("GENAI_KEY")

# Credentials object used to access the LLM service
creds = Credentials(api_key, api_endpoint=api_url)

In [None]:
# a helper function to generate text
def get_completion(sample, prompt_string, model):
    """Fill a prompt template with ``sample`` and return the model's generated text.

    Args:
        sample: Value substituted for the ``{sample}`` placeholder of the template.
        prompt_string: Template text handed to ``PromptTemplate.from_template``.
        model: Object exposing ``generate(list_of_prompts)``; the first element of
            its result must provide a ``generated_text`` attribute.

    Returns:
        The ``generated_text`` of the first (and only) generation result.
    """
    filled_prompt = PromptTemplate.from_template(prompt_string).format(sample=sample)
    responses = model.generate([filled_prompt])
    return responses[0].generated_text

# get sentiment

In [None]:
# Model type used for the sentiment step.
# Alternatives left commented out by the author:
#   "meta-llama/llama-2-70b-chat", "ibm/granite-13b-sft"
MODELTYPE = "ibm/granite-13b-chat-v1"

# Text-generation parameters (sampling; use 'greedy' alternatively)
params = GenerateParams(
    decoding_method="sample",
    temperature=0.5,
    top_k=50,
    top_p=1,
    repetition_penalty=1.2,
    min_new_tokens=1,
    max_new_tokens=1000,
)

# Model proxy object through which the requests are sent
granite_13_chat_model = Model(MODELTYPE, params=params, credentials=creds)

In [None]:
# Prompt template for sentiment classification. The {sample} placeholder is filled
# by get_completion(); the model is asked for a one-word answer
# (positive, negative, or neutral).
prompt_string_sentiment_analysis="""
Analyze the sentiment of the following financial article.Determine whether the sentiment is positive, negative, or neutral. Answer with only one word!: 

financial article:
"{sample}" 

sentiment:
"""

In [None]:
import concurrent.futures

def process_row_sentiment(index, row):
    """Classify the sentiment of one DataFrame row.

    Args:
        index: Index label of the row; passed through unchanged so the caller
            can write the result back to the right place.
        row: Row object providing a 'Description' entry.

    Returns:
        A ``(index, sentiment)`` pair, where ``sentiment`` is the text returned
        by ``get_completion`` for the row's description.
    """
    return index, get_completion(
        row['Description'], prompt_string_sentiment_analysis, granite_13_chat_model
    )

# Fan the rows of df out over 10 worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    # One pending task per row
    pending = [
        pool.submit(process_row_sentiment, row_label, record)
        for row_label, record in df.iterrows()
    ]

    # Write each result back into df as soon as its task finishes
    for finished in concurrent.futures.as_completed(pending):
        row_label, label_text = finished.result()
        df.at[row_label, 'sentiment'] = label_text


In [None]:
df.head()

In [None]:
# Sequential variant: call get_completion once per Description, in row order,
# and store the answers in the 'sentiment' column (overwriting any existing values).
df['sentiment'] = [
    get_completion(description, prompt_string_sentiment_analysis, granite_13_chat_model)
    for description in df['Description']
]


In [None]:
df.head()

# extract named entities

In [None]:


# Text-generation parameters for the extraction steps: greedy decoding, with no
# other setting passed explicitly. This rebinds the name `params`.
params = GenerateParams(decoding_method="greedy")

# Model proxy objects through which the requests are sent.
# Timings noted by the author:
#   granite-13b-instruct-v1 --> 100 rows = 90 seconds --> 1 row = 0.9 seconds --> 30000 rows = 27000 seconds = 7.5 hours
#   llama 2 --> 100 rows = 180 seconds --> 1 row = 1.8 seconds --> 30000 rows = 54000 seconds = 15 hours
granite_13_instruct_model = Model("ibm/granite-13b-instruct-v1", params=params, credentials=creds)
llame_2_70b_model = Model("meta-llama/llama-2-70b-chat", params=params, credentials=creds)

In [None]:
# Few-shot prompt template for named-entity extraction. It shows two worked
# examples in the form "entity: type, entity: type" and ends with the {sample}
# placeholder that get_completion() fills in.
# NOTE(review): the instruction text speaks of a "webmaster" reading "emails",
# while the notebook feeds it df['Description'] values — confirm the wording is intended.
prompt_string_named_entities="""
Act as a webmaster who must extract structured information from emails. Read the below email and extract and categorize each entity.


Input:
"Golden Bank is a competitor of Silver Bank in the US" said John Doe.

Output: (Named Entities)
Golden Bank: company, Silver Bank: company, US: country, John Doe: person

Input:
Alphabet Inc's Google said on Friday it would prohibit websites and apps that use its advertising technology from running ads on "dangerous content" that goes against scientific consensus during the coronavirus pandemic.

Output: (Named Entities)
Alphabet Inc: company,  Google: company division, Friday: day of the week, coronavirus pandemic: event

Input:
{sample}

Output: (Named Entities)
"""

In [None]:
df.head()   

In [None]:
# Hand-written example of an "entity: type, ..." string in the format the
# named-entity prompt asks for. Not referenced by the other visible cells.
result="TikTok: company, UK government: organization, London: location, China: country"

In [None]:
import re

input_str = "TikTok: company, UK government: organization, London: location, China: country "

def parse_string_to_touple_list(input_str):
    """Parse an ``"entity: type, entity: type"`` string into ``(entity, type)`` tuples.

    Pairs may be separated by commas or line breaks. Each pair is split at its
    first colon, and both sides are stripped of surrounding whitespace. Pieces
    without a colon, or with an empty entity or type, are skipped.

    Unlike a ``\\w``-based regular expression, this keeps entity names that
    contain punctuation (``"U.S."``, ``"AT&T"``) intact, and a value never
    runs across a line break into the next pair.

    Args:
        input_str: Raw text to parse; ``None`` or an empty string is allowed.

    Returns:
        A list of ``(entity, type)`` string tuples in order of appearance;
        an empty list when nothing could be parsed.
    """
    if not input_str:
        return []

    tuples_list = []
    # Commas and newlines both act as pair separators
    for chunk in re.split(r'[,\n]', input_str):
        key, colon, value = chunk.partition(':')
        key, value = key.strip(), value.strip()
        # Keep only complete "key: value" pairs
        if colon and key and value:
            tuples_list.append((key, value))

    return tuples_list



# Display the list of tuples
print(parse_string_to_touple_list(input_str))

In [None]:
# Work on a copy so that df itself is left untouched
df_copy = df.copy()
# Initialize the 'named_entities' column with default values
df_copy['named_entities'] = [None] * len(df_copy)

# Sequentially extract the named entities of every row.
# Iterating over (label, value) pairs avoids assuming a 0..n-1 index.
for row_label, description in df_copy['Description'].items():
    named_entities = get_completion(description, prompt_string_named_entities, granite_13_instruct_model)
    # Parse the raw answer into a list of tuples and store it with .at:
    # chained indexing (df_copy[col][i] = ...) may write to a temporary copy
    # instead of the frame.
    df_copy.at[row_label, 'named_entities'] = parse_string_to_touple_list(named_entities)

df_copy.head(10)

In [None]:
import concurrent.futures

def process_row(index, row):
    """Extract the named entities of one DataFrame row.

    Args:
        index: Index label of the row; passed through unchanged so the caller
            can write the result back to the right place.
        row: Row object providing a 'Description' entry.

    Returns:
        A ``(index, entities)`` pair, where ``entities`` is the output of
        ``parse_string_to_touple_list`` applied to the model's answer.
    """
    raw_answer = get_completion(
        row['Description'], prompt_string_named_entities, granite_13_instruct_model
    )
    return index, parse_string_to_touple_list(raw_answer)

# Fan the rows of df_copy out over 10 worker threads
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    # One pending task per row
    pending = [
        pool.submit(process_row, row_label, record)
        for row_label, record in df_copy.iterrows()
    ]

    # Write each result back into df_copy as soon as its task finishes
    for finished in concurrent.futures.as_completed(pending):
        row_label, entities = finished.result()
        df_copy.at[row_label, 'named_entities'] = entities


In [None]:
# get all company names from the named_entities column
def _company_entities(entities):
    """Return the names of all ``(name, kind)`` pairs whose kind is exactly 'company'.

    A ``None`` cell (no entities stored for the row) yields an empty list.
    """
    if entities is None:
        return []
    return [name for name, kind in entities if kind == 'company']

company_names = df_copy['named_entities'].apply(_company_entities)
company_names.head(40)

In [None]:
# get the most common company names
from collections import Counter

# Flatten the per-row name lists and count how often each name occurs.
# NOTE(review): this rebinds company_names from the per-row lists to a Counter,
# so re-running this cell on its own would iterate the Counter's keys (strings)
# and count single characters instead.
company_names = Counter(
    name
    for row_names in company_names
    for name in row_names
)
company_names.most_common(10)

## extract key actions

In [None]:
# Few-shot prompt template for "key action" extraction. It shows five worked
# examples, each followed by a 2-3 word key action, and ends with the {sample}
# placeholder that get_completion() fills in.
# NOTE(review): "fond manager" in the first line is presumably meant to be
# "fund manager" — confirm before changing, since editing the prompt changes model output.
prompt_string_key_actions="""
You are the assistant of a fond manager. To help your boss you create super short summaries of financial news headlines called "key actions". Those key actions are no more than 3 words. Here are a few examples

Input:
TikTok considers London and other locations for headquarters,Jul 18 2020,"TikTok has been in discussions with the UK government over the past few months to locate its headquarters in London, a source familiar with the matter said, as part of a strategy to distance itself from its Chinese ownership
Output (2-3 words keyaction):
Locate headquarters

Input:
Disney cuts ad spending on Facebook amid growing boycott: WSJ,Jul 18 2020,"Walt Disney  has become the latest company to slash its advertising spending on Facebook Inc  as the social media giant faces an ad boycott over its handling of hate speech and controversial content, the Wall Street Journal reported on Saturday, citing people familiar with the situation."
Output (2-3 words keyaction):
Slash advertising

Input:
Twitter says attackers downloaded data from up to eight non-verified accounts,Jul 18 2020,Twitter Inc said on Saturday that hackers were able to download account information for up to eight accounts involved in the hack of its systems this week, but said none of them were verified accounts.
Output (2-3 words keyaction):
Hackers downloaded data

Input:
U.S. Republicans seek liability protections as coronavirus aid battle looms,Jul 17 2020,A battle in the U.S. Congress over a new coronavirus-aid bill began on Friday as Republicans were putting the finishing touches on provisions granting liability protections for a wide range of entities resuming operations amid the pandemic.
Output (2-3 words keyaction):
Seek liability protections

Input:
Senator asks Twitter about claim worker was paid to help with hack,Jul 17 2020,"Senator Josh Hawley, a Republican who closely follows tech issues, pressed Twitter Chief Executive Jack Dorsey on Friday on whether a company employee had been paid to assist with a hack of high-profile accounts this week aimed at scamming readers."
Output (2-3 words keyaction):
Senator asks

Now you do it:

Input:
{sample}

Output (2-3 words keyaction):
"""

In [None]:
# Snapshot of df_copy taken before the 'key_action' column is added.
# NOTE(review): df_copy_2 is not used again in this cell; the loop below writes to df_copy.
df_copy_2 = df_copy.copy()
# Initialize the 'key_action' column with default values
df_copy['key_action'] = [None] * len(df_copy)

# Only the first 10 rows are processed (fewer if the frame is shorter).
# Iterating over (label, value) pairs avoids assuming a 0..n-1 index.
for row_label, description in df_copy['Description'].head(10).items():
    key_action = get_completion(description, prompt_string_key_actions, llame_2_70b_model)
    print(key_action)
    # Store with .at: chained indexing (df_copy[col][i] = ...) may write to a
    # temporary copy instead of the frame.
    df_copy.at[row_label, 'key_action'] = key_action

df_copy.head(10)