In [None]:
# Mount Google Drive under /content/drive so a later cell can copy the model
# cache out of /content/drive/MyDrive. force_remount=True is passed on every
# run (presumably so re-running the cell refreshes an existing mount — confirm
# against the Colab docs).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Check that PyTorch can see a CUDA GPU; the cell displays the boolean result.
# The model-loading cell further down places the model on "cuda:0".
import torch
torch.cuda.is_available()

True

In [None]:
# Copy the model cache directory from the mounted Drive into /content/ so the
# base model and LoRA adapter can be loaded from /content/hub/... below.
!cp -r /content/drive/MyDrive/models/fingpt/hub /content/

In [None]:
!pip install transformers==4.32.0 peft==0.5.0
!pip install sentencepiece
!pip install accelerate
!pip install torch
!pip install peft
!pip install datasets
!pip install bitsandbytes

Collecting transformers==4.32.0
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.5.0
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from peft==0.5.0)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft==0.5.

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizerFast
from peft import PeftModel  # 0.5.0

# Load Models
# Both paths point into the cache directory copied to /content/hub earlier.
base_model = "/content/hub/models--NousResearch--Llama-2-13b-hf/snapshots/b0491461253755d8c60bf22f0d696b9e337c6375"
peft_model = "/content/hub/models--FinGPT--fingpt-sentiment_llama2-13b_lora/snapshots/92de73edd2b349aa6d063152bee31c8f1131a56f"
# trust_remote_code is intentionally not passed: it only applies to the Auto*
# classes and is ignored (with a warning) by the concrete Llama classes.
tokenizer = LlamaTokenizerFast.from_pretrained(base_model)
# Reuse the EOS token for padding so batched prompts can be padded.
tokenizer.pad_token = tokenizer.eos_token
model = LlamaForCausalLM.from_pretrained(base_model, device_map="cuda:0", load_in_4bit=True)
# Wrap the base model with the sentiment LoRA adapter and switch to eval mode.
model = PeftModel.from_pretrained(model, peft_model)
model = model.eval()

# Make prompts
prompt = [
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is aggressively pursuing its growth strategy by increasingly focusing on technologically more demanding HDI printed circuit boards PCBs .
Answer: ''',
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Answer: ''',
'''Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}
Input: A tinyurl link takes users to a scamming site promising that users can earn thousands of dollars by becoming a Google ( NASDAQ : GOOG ) Cash advertiser .
Answer: '''
]

# Generate results
# The encoded batch is moved to the model's device so input tensors and
# weights live on the same device during generation.
tokens = tokenizer(prompt, return_tensors='pt', padding=True, max_length=512).to(model.device)
res = model.generate(**tokens, max_length=512)
# Full decoded sequences (prompt + answer + special tokens), kept for inspection.
res_sentences = [tokenizer.decode(i) for i in res]
# Each returned sequence starts with the (padded) prompt, so everything past
# the prompt length is the model's answer. Decoding only that slice avoids
# splitting on the "Answer: " marker and drops the trailing </s> tokens.
prompt_len = tokens["input_ids"].shape[1]
out_text = [
    answer.strip()
    for answer in tokenizer.batch_decode(res[:, prompt_len:], skip_special_tokens=True)
]

# show results
for sentiment in out_text:
    print(sentiment)

# Output:
# positive
# neutral
# negative


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
import pandas as pd
import json
import os
from IPython.display import display
import re

# Input CSV of news items; read with pd.read_csv in the __main__ block below.
data_path = 'TSLA-NEWS-60DAYS-2024-03-23-2024-05-20.csv'
# CSV that Analysor.analysis_news appends labelled rows to and that is read
# back for display afterwards. Both paths are relative to the working directory.
temp_path = 'temp.csv'

class Analysor:
    """Label news headlines with a sentiment using the notebook's LLM.

    The class relies on three notebook-level names defined outside of it:
    ``tokenizer`` and ``model`` (used by ``call_llm``) and ``temp_path``
    (the CSV file ``analysis_news`` writes its results to).
    """

    def __init__(self):
        pass

    def analysis_news(self, dataset, chunk_size=1):
        """Classify every item of ``dataset`` and persist the results to ``temp_path``.

        The dataset is processed in slices of ``chunk_size`` items; after each
        slice the labelled rows are appended to ``temp_path`` so partial
        progress is kept on disk if a later slice fails.

        Args:
            dataset: List of dicts, each with at least ``"id"`` and
                ``"headline"`` keys. Each dict gains an ``"impact"`` key.
            chunk_size: Number of items sent to the model per request.

        Returns:
            The contents of ``temp_path`` after clean-up, as a DataFrame.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        # NOTE: temp_path is not cleared here, so rows accumulate across runs.
        # Delete the file manually before a fresh run if that is not wanted.

        print(f"Total news: [{len(dataset)}] ")
        # do chunk to avoid large dataset
        chunks = [dataset[i:i+chunk_size] for i in range(0, len(dataset), chunk_size)]
        print(f"Chunk into {len(chunks)} parts per request.")
        print('--- --- ---')

        # Make sure the output file (with its header) exists even when the
        # dataset is empty, so the read-back below cannot fail.
        self.append_to_csv(temp_path, [])

        for index, chunk in enumerate(chunks):
            start_index = index * chunk_size
            end_index = min((index + 1) * chunk_size, len(dataset)) - 1
            print(f"processing {start_index}-{end_index}")
            result = self.call_llm(chunk)
            self.combine_impact(chunk, result)
            self.append_to_csv(temp_path, chunk)

        # Drop rows that are entirely empty and rewrite the file.
        df = pd.read_csv(temp_path)
        df = df.dropna(how='all')
        df.to_csv(temp_path, index=False)
        return df

    @staticmethod
    def _build_prompt(headline):
        """Return the sentiment prompt for one headline.

        The wording matches the reference prompt used in the demo cell,
        including the space between "from" and the answer choices.
        """
        return (
            "Instruction: What is the sentiment of this news? "
            "Please choose an answer from {negative/neutral/positive}\n"
            f"Input: {headline}\n"
            "Answer: "
        )

    def call_llm(self, dataset):
        """Ask the model for the sentiment of each item's headline.

        Args:
            dataset: List of dicts with ``"id"`` and ``"headline"`` keys.

        Returns:
            A list of answer strings, one per item, in input order. An empty
            ``dataset`` yields an empty list without calling the model.
        """
        if not dataset:
            return []

        # Make prompts
        prompts = []
        for item in dataset:
            # f-string formatting works for both integer and string ids,
            # unlike string concatenation.
            print(f'{item["id"]} - {item["headline"]}')
            prompts.append(self._build_prompt(item["headline"]))

        # Generate
        tokens = tokenizer(prompts, return_tensors='pt', padding=True, max_length=512)
        # Keep the input tensors on the same device as the model weights.
        tokens = tokens.to(model.device)
        res = model.generate(**tokens, max_length=512)

        # The returned sequences begin with the (padded) prompt tokens, so
        # everything past the prompt length is the generated answer. Decoding
        # only that slice avoids splitting on "Answer: " (which breaks when a
        # headline contains that text) and removes special tokens such as </s>.
        prompt_len = tokens["input_ids"].shape[1]
        answers = tokenizer.batch_decode(res[:, prompt_len:], skip_special_tokens=True)
        return [answer.strip() for answer in answers]

    def combine_impact(self, dataset, analysis_result):
        """Store ``analysis_result[i]`` under ``"impact"`` in ``dataset[i]``.

        The dicts are modified in place and the same list is returned.
        Raises IndexError if ``analysis_result`` is shorter than ``dataset``.
        """
        for index, entry in enumerate(dataset):
            entry['impact'] = analysis_result[index]

        return dataset

    def append_to_csv(self, file_path, rows_to_append):
        """Append rows to a CSV file, writing the header if the file is new.

        Rows are written in the fixed column order
        ``id, datetime, impact, headline, summary`` regardless of the key
        order of the incoming dicts; keys outside that set are not written
        and missing keys become empty fields. This keeps every data row
        aligned with the header.

        Args:
            file_path: Path of the CSV file to create or append to.
            rows_to_append: Iterable of dicts; may be empty.
        """
        header = ['id', 'datetime', 'impact', 'headline', 'summary']
        write_header = not os.path.isfile(file_path)
        frame = pd.DataFrame(list(rows_to_append), columns=header)

        # newline='' stops the text layer from translating line endings a
        # second time (which produces blank rows on some platforms), and an
        # explicit encoding keeps non-ASCII headlines portable.
        with open(file_path, 'a', newline='', encoding='utf-8') as f:
            frame.to_csv(f, header=write_header, index=False)

# Driver: load the news CSV, label every row, then show what was written.
if __name__ == '__main__':
    news_analysis = Analysor()

    # Each CSV row becomes one dict keyed by column name.
    stock_data = pd.read_csv(data_path)
    dataset = stock_data.to_dict('records')
    # dataset = dataset[560:570]
    # The labelled rows are persisted to temp_path by analysis_news; they are
    # read back from that file below rather than taken from `result`.
    result = news_analysis.analysis_news(dataset)

    tmp_df = pd.read_csv(temp_path)
    display(tmp_df)


Total news: [103] 
Chunk into 21 parts per request.
--- --- ---
processing 0-4




processing 5-9




processing 10-14
processing 15-19
processing 20-24
processing 25-29
processing 30-34
processing 35-39
processing 40-44
processing 45-49
processing 50-54
processing 55-59
processing 60-64
processing 65-69
processing 70-74
processing 75-79
processing 80-84
processing 85-89
processing 90-94
processing 95-99
processing 100-102


Unnamed: 0,id,datetime,impact,headline,summary
0,127364003,2024-05-02 06:09:47,negative,Google’s Payments to Apple Reached $20 Billion...,(Bloomberg) -- Alphabet Inc. paid Apple Inc. $...
1,127361226,2024-05-02 04:57:48,neutral,"Apple, Novo Nordisk earnings, jobless claims: ...",A number of companies release their quarterly ...
2,127360339,2024-05-02 04:17:04,positive,"Apple Q2 earnings: Services, gen. AI are poten...",Tech giant Apple Inc. (AAPL) is gearing up to ...
3,127360340,2024-05-02 03:53:09,negative,Apple to report Q2 earnings amid iPhone slowdo...,Apple will report its Q2 earnings after the be...
4,127354904,2024-05-02 01:21:22,neutral,"After Nvidia and Apple, Alibaba Chases Vietnam...",Alibaba Group Holding Limited (NYSE:BABA) plan...
...,...,...,...,...,...
98,127321837,2024-04-30 15:31:00,neutral,Morning Brew: Tech Giants Lead Market Movement...,Looking for stock market analysis and research...
99,127336277,2024-04-30 15:27:00,neutral,"‘Buy the Fear,’ Says Bernstein About Apple Stock",Looking for stock market analysis and research...
100,127323119,2024-04-30 14:44:00,neutral,‘Sell in May and go away?’ This year the calen...,Looking for stock market analysis and research...
101,127330524,2024-04-30 14:41:00,positive,Analysts Are Bullish on Top Technology Stocks:...,Looking for stock market analysis and research...


In [None]:
import torch
import gc

# Drop the notebook's references to the model and tokenizer.
del model
del tokenizer
# Collect garbage first so objects that are no longer referenced are actually
# freed, and only then ask PyTorch to return its cached CUDA memory; calling
# empty_cache() before the collection cannot release memory still held by
# not-yet-collected objects.
gc.collect()
torch.cuda.empty_cache()