In [37]:
from huggingface_hub import login

# 设置你的 Hugging Face API Token
hf_token = "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"

# 登录 Hugging Face，设置 API Token
login(hf_token)

print("Successfully logged in to Hugging Face!")

Successfully logged in to Hugging Face!


# Fine tune model Finance chat with Fiqa dataset

* Model information:
    - Model name: [AdaptLLM/finance-chat](https://huggingface.co/AdaptLLM/finance-chat)
    - Description: the domain-specific chat model developed from LLaMA-2-Chat-7B, using the method in our ICLR 2024 paper Adapting Large Language Models via Reading Comprehension.
    - List dataset used to train:
        - [Open-Orca/OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca)
        - [GAIR/lima](https://huggingface.co/datasets/GAIR/lima)
        - [WizardLM/WizardLM_evol_instruct_V2_196k](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
* Dataset information:
    - Dataset name: [FinGPT/fingpt-fiqa_qa](https://huggingface.co/datasets/FinGPT/fingpt-fiqa_qa)


## 1. Install packages, setup global settings
### 1.1 Install packages

In [4]:
!pip install torch
!pip install bitsandbytes
!pip install transformers peft accelerate trl
!pip install datasets==2.16.1
!pip install evaluate rouge_score



### 1.2 Setup Global settings

In [32]:
# Indicate availability CUDA devices to help Trainer can recognize and use then in training process
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

device = "cuda" if torch.cuda.is_available() else "cpu"

## 2. Prepare dataset

In [6]:
from datasets import load_dataset

DATASET_NAME = "FinGPT/fingpt-fiqa_qa"

# load piece data of datasets
def get_dataset(from_pc=0, to_pc=10):
    dataset_dict = load_dataset(DATASET_NAME, split="train[10%:20%]")
    
    # rename columns of dataset to fix with format: system_prompt, question, response
    dataset_dict = dataset_dict.rename_column("instruction", "system_prompt")
    dataset_dict = dataset_dict.rename_column("input", "question")
    dataset_dict = dataset_dict.rename_column("output", "response")
    
    dataset = dataset_dict.train_test_split(test_size=0.1)
    
    return dataset

In [7]:
# def format_instruction(sample):
#     return f"""### System prompt:
# {sample['system_prompt']}

# ### Question:
# {sample["question"]}

# ### Response:
# {sample["response"]}
# """

def format_instruction(sample):
    return f"""<s>[INST] <<SYS>>{sample["system_prompt"]}<</SYS>>\n\nQuestion: {sample["question"]}\n\nResponse: {sample["response"]} [/INST]"""

In [8]:
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [9]:
# Load a piece of data because it is big dataset
training_dataset = get_dataset(0, 11)
print(training_dataset)

# test format instruction
example = training_dataset["train"][5]
print(format_instruction(example))

DatasetDict({
    train: Dataset({
        features: ['question', 'response', 'system_prompt'],
        num_rows: 1539
    })
    test: Dataset({
        features: ['question', 'response', 'system_prompt'],
        num_rows: 172
    })
})
<s>[INST] <<SYS>>Offer your thoughts or opinion on the input financial query or topic using your financial background.<</SYS>>

Question: Are stories of turning a few thousands into millions by trading stocks real?

Response: The short answer is yes, it is possible to do what these classes claim, however, it is highly unlikely. For every person they can show you that got rich using whatever so called method they are teaching, there are hundreds of people that didn't that they aren't telling you about. What I would recommend is invest in a well diversified portfolio. If you have a higher tolerance for risk then you can make some of that portfolio out of higher risk/reward investments. Maybe you pick the next Apple or Google or Netflix or whatever but t

### 3. Load model

In [10]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)

from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer

MODEL_NAME = "AdaptLLM/finance-chat"

In [11]:
def load_model():
    """
    Load model in qantization mode 4 big
    - https://huggingface.co/docs/accelerate/en/usage_guides/quantization
    - https://huggingface.co/blog/4bit-transformers-bitsandbytes
    """
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_config,
        device_map = "auto",
        token=True
    )
    
    model = prepare_model_for_kbit_training(model)

    model.config.use_cache = False
    model.config.pretraining_tp = 1
    
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        token=True,
        add_eos_token=True,
        add_bos_token=True,
        # WARN: Ignore the warning of SFTTrainer for use padding_side="right", using padding side right will cause the model can't generate eos token
        padding_side="left",
    )
    
    # https://clay-atlas.com/us/blog/2024/01/01/mistral-sft-trainer-cannot-generate-eos-token/
    tokenizer.pad_token = tokenizer.unk_token
    
    
    return model, tokenizer


In [12]:
import torch
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0

    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# Clean cache before loading model
torch.cuda.empty_cache()

# Load model and tokenizer
model, tokenizer = load_model()

# Check number of trainable parameters
print(print_number_of_trainable_model_parameters(model))
print(tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



trainable model parameters: 0
all model parameters: 3500421120
percentage of trainable model parameters: 0.00%
LlamaTokenizerFast(name_or_path='AdaptLLM/finance-chat', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


### 4. Example to chat with the finance-chat model

In [14]:
user_input = """
Input Texts:
{Recent indicators suggest that economic activity has continued to expand at a solid pace. Job gains have slowed, and the unemployment rate has moved up but remains low. Inflation has made further progress toward the Committee's 2 percent objective but remains somewhat elevated.
The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run. The Committee has gained greater confidence that inflation is moving sustainably toward 2 percent, and judges that the risks to achieving its employment and inflation goals are roughly in balance. The economic outlook is uncertain, and the Committee is attentive to the risks to both sides of its dual mandate.
In light of the progress on inflation and the balance of risks, the Committee decided to lower the target range for the federal funds rate by 1/2 percentage point to 4-3/4 to 5 percent. In considering additional adjustments to the target range for the federal funds rate, the Committee will carefully assess incoming data, the evolving outlook, and the balance of risks. The Committee will continue reducing its holdings of Treasury securities and agency debt and agency mortgage‑backed securities. The Committee is strongly committed to supporting maximum employment and returning inflation to its 2 percent objective.
In assessing the appropriate stance of monetary policy, the Committee will continue to monitor the implications of incoming information for the economic outlook. The Committee would be prepared to adjust the stance of monetary policy as appropriate if risks emerge that could impede the attainment of the Committee's goals. The Committee's assessments will take into account a wide range of information, including readings on labor market conditions, inflation pressures and inflation expectations, and financial and international developments.
Based on the provided text, I would classify the overall sentiment and tone as neutral to cautiously optimistic, with an emphasis on ongoing progress and a balanced risk outlook.}

Given a list of cleaned text data, conduct a sentiment analysis to evaluate the emotional tone of each text (e.g., positive, neutral, negative). Provide a confidence score for each sentiment classification, as well as a high-level explanation that justifies the analysis. Additionally, assess the potential implications these sentiments may have on the perception of {topic/subject}. Your response should be structured only as follows:

	1.	Sentiment Polarity (Positive, Neutral, Negative)
	2.	Confidence Score (0 to 9 scale)
"""

# Apply the prompt template and system prompt of LLaMA-2-Chat demo for chat models (NOTE: NO prompt template is required for base models!)
our_system_prompt = "\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n" # Please do NOT change this
prompt = f"<s>[INST] <<SYS>>{our_system_prompt}<</SYS>>\n\n{user_input} [/INST]"

# # NOTE:
# # If you want to apply your own system prompt, please integrate it into the instruction part following our system prompt like this:
# your_system_prompt = "Please, check if the answer can be inferred from the pieces of context provided."
# prompt = f"<s>[INST] <<SYS>>{our_system_prompt}<</SYS>>\n\n{your_system_prompt}\n{user_input} [/INST]"

inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(model.device)
outputs = model.generate(input_ids=inputs, max_length=4096)[0]

answer_start = int(inputs.shape[-1])
pred = tokenizer.decode(outputs[answer_start:], skip_special_tokens=True)

print(f'### User Input:\n{user_input}\n\n### Assistant Output:\n{pred}')



### User Input:

Input Texts:
{Recent indicators suggest that economic activity has continued to expand at a solid pace. Job gains have slowed, and the unemployment rate has moved up but remains low. Inflation has made further progress toward the Committee's 2 percent objective but remains somewhat elevated.
The Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run. The Committee has gained greater confidence that inflation is moving sustainably toward 2 percent, and judges that the risks to achieving its employment and inflation goals are roughly in balance. The economic outlook is uncertain, and the Committee is attentive to the risks to both sides of its dual mandate.
In light of the progress on inflation and the balance of risks, the Committee decided to lower the target range for the federal funds rate by 1/2 percentage point to 4-3/4 to 5 percent. In considering additional adjustments to the target range for the federal funds rat

In [15]:
test = training_dataset["train"][1]
user_input = test["question"]

# Apply the prompt template and system prompt of LLaMA-2-Chat demo for chat models (NOTE: NO prompt template is required for base models!)
our_system_prompt = test["system_prompt"]
prompt = f"<s>[INST] <<SYS>>{our_system_prompt}<</SYS>>\n\n{user_input} [/INST]"

inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(model.device)
outputs = model.generate(input_ids=inputs, max_length=4096)[0]

answer_start = int(inputs.shape[-1])
pred = tokenizer.decode(outputs[answer_start:], skip_special_tokens=True)

print(f'### User Input:\n{user_input}\n\n### Assistant Output:\n{pred}')

### User Input:
Are COBRA premiums deductible when self-employed?

### Assistant Output:
As an AI language model, I do not have personal financial advice or opinions. However, I can provide you with general information on the topic.

COBRA premiums are typically not deductible for self-employed individuals. This is because COBRA is a federal law that requires employers to offer continuation of health insurance coverage to employees who lose their job or experience a reduction in hours. Self-employed individuals are not considered employees, so they are not eligible for COBRA coverage.

However, there are some exceptions to this rule. If you are self-employed and have a spouse or dependent child who is covered under your employer's health insurance plan, you may be eligible for COBRA coverage if you lose your job or experience a reduction in hours. Additionally, if you are self-employed and have a business partner who is an employee of your business, you may be eligible for COBRA covera

### 5. LoRA configuration and Training Arguments

In [16]:
# Configure Lora
lora_config = LoraConfig(
    # Lora attention dimension
    r=64,
    # Scaling process
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # Prevent overfitting
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [17]:
import math
import time

LEARNING_RATE = 1e-5 # 2e-4
WEIGHT_DECAY=0.001
EPOCHS = 3
BATCH_SIZE = 4
LOGGING_STEPS = 10
MAX_SEQ_LEN = 2048
MAX_STEPS = -1

TRAINING_OUTPUT_DIR=f"outputs/peft-financial-chatbot-trained-{str(int(time.time()))}"

In [19]:
training_args = TrainingArguments(
    output_dir=TRAINING_OUTPUT_DIR,
    
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    max_steps=MAX_STEPS,
    logging_steps=LOGGING_STEPS,
    
    # max_grad_norm=0.3, # measure of the magnitude or steepness of the gradient of a loss function
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    optim="paged_adamw_8bit",
    
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=BATCH_SIZE,
    evaluation_strategy="steps",
    eval_steps=LOGGING_STEPS,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    
    peft_config=lora_config,
    args=training_args,
    
    train_dataset=training_dataset["train"],
    eval_dataset=training_dataset["test"],
    dataset_text_field="question",
    
    max_seq_length=None,
    formatting_func=format_instruction,
#     packing=True
)

print(training_args.device)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1539 [00:00<?, ? examples/s]

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

cuda:0


  super().__init__(


In [33]:
PEFT_MODEL_LOCAL_CHECKPOINT = "./outputs/peft-training-checkpoint"
PEFT_MODEL_ADAPTER_ID = "anhtranhong/finance-chat_fingpt-fiqa_qa_v2"
trainer.train()
trainer.model.save_pretrained(PEFT_MODEL_LOCAL_CHECKPOINT)
tokenizer.save_pretrained(PEFT_MODEL_LOCAL_CHECKPOINT)

Step,Training Loss,Validation Loss
10,4.1373,4.199275
20,4.1512,4.199275
30,4.1057,4.199275
40,4.0472,4.199275
50,4.1773,4.199275
60,4.0767,4.199275
70,4.0839,4.199275
80,4.1817,4.199275
90,4.1758,4.199275
100,4.5397,4.199275


('./outputs/peft-training-checkpoint/tokenizer_config.json',
 './outputs/peft-training-checkpoint/special_tokens_map.json',
 './outputs/peft-training-checkpoint/tokenizer.model',
 './outputs/peft-training-checkpoint/added_tokens.json',
 './outputs/peft-training-checkpoint/tokenizer.json')

In [35]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 162218048
all model parameters: 3662639168
percentage of trainable model parameters: 4.43%


### 6. Push to huggingface

### 7. Model generation

In [49]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(model, PEFT_MODEL_ADAPTER_ID, is_trainable=True)

tokenizer = AutoTokenizer.from_pretrained(
    PEFT_MODEL_ADAPTER_ID,
    padding_side="left",
)

In [66]:
import pandas as pd

# Load the CSV file
file_path = '/usr1/home/s124mdg41_08/FinLLM-FOMC/data/processed/Merged_FMOC.csv'
df = pd.read_csv(file_path)

# Function to split each note by paragraphs and remove paragraphs with less than 50 characters
def split_into_paragraphs(row):
    date = row['Date']
    paragraphs = str(row['Minutes_cleaned']).split('\n')
    # Only keep paragraphs that are non-empty and have at least 50 characters (ignoring spaces)
    return [(date, para.strip()) for para in paragraphs if para.strip() and len(para.strip()) >= 40]

# Apply the function to each row in the DataFrame
split_data = [item for idx, row in df.iterrows() for item in split_into_paragraphs(row)]

# Create a new DataFrame with the split paragraphs
new_df = pd.DataFrame(split_data, columns=['Date', 'Minutes_cleaned'])

# 删除 'Minutes_cleaned' 列中为空字符串或者'NaN' 的行
new_df = new_df[new_df['Minutes_cleaned'].notna() & (new_df['Minutes_cleaned'] != '')]

# 输出前50行并输出行数
print(new_df.head(50))
print(len(new_df))


          Date                                    Minutes_cleaned
0   2012-01-25       Minutes of the Federal Open Market Committee
1   2012-01-25  A meeting of the Federal Open Market Committee...
2   2012-01-25  Role of Financial Conditions in Economic Recov...
3   2012-01-25  Staff summarized research projects being condu...
4   2012-01-25  In their discussion following the staff presen...
5   2012-01-25  In the agenda for this meeting, it was reporte...
6   2012-01-25  The elected members and alternate members were...
7   2012-01-25  By unanimous vote, the following officers of t...
8   2012-01-25  By unanimous vote, the Authorization for Domes...
9   2012-01-25  AUTHORIZATION FOR DOMESTIC OPEN MARKET OPERATIONS
10  2012-01-25  A. To buy or sell U.S. Government securities, ...
11  2012-01-25  B. To buy or sell in the open market U.S. Gove...
12  2012-01-25  A. for System Open Market Account, to sell U.S...
13  2012-01-25  B. for New York Bank account, when appropriate...
14  2012-0

In [79]:
import re
import pandas as pd

# 初始化一个空列表来保存每一行的结果
results = []

# 遍历 new_df 的每一行
for idx, row in new_df.iterrows():
    content = row['Minutes_cleaned']  # 获取 'Minutes_cleaned' 列的内容
    date_value = row['Date']  # 获取 'Date' 列的值

    our_system_prompt = """You're an expert on sentiment analysis in economic texts"""

    user_input = f"""Please analyze the economic sentiment of the following content and provide a score from 0 to 10, where 0 represents extremely negative sentiment, 5 is neutral, and 10 is extremely positive. Analyze only the economic-related aspects in each paragraph. If you can't judge the emotional content, judge it as 5.
        Content: {content}
        <requirement>Only show number of one score, no more text.</requirement>
    """
    
    # 构建最终的 LLaMA2 prompt
    prompt = f"<s>[INST] <<SYS>>{our_system_prompt}<</SYS>>\n\n{user_input} [/INST]"

    # 生成情感分数
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(model.device)

    outputs = peft_model.generate(
        input_ids=inputs,
        do_sample=True,
        max_new_tokens=1024,
        temperature=0.6,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        num_return_sequences=1,
    )[0]

    # 解码并提取分数
    answer_start = int(inputs.shape[-1])
    pred = tokenizer.decode(outputs[answer_start:], skip_special_tokens=True)


   # 使用 re.search() 获取第一个匹配的数字
    score = re.search(r'\d+', pred)
    print(score)

    # 处理提取的分数
    if score:
        score_value = int(score[0])  # 提取第一个匹配的分数
    else:
        score_value = None  # 如果没有找到分数，设置为 None

    # 将结果与 Date 合并，存入结果列表
    results.append({
        'Date': date_value,
        'Sentiment_Score': score_value
    })
    

# 将结果列表转换为 DataFrame
result_df = pd.DataFrame(results)




<re.Match object; span=(7, 8), match='6'>
<re.Match object; span=(7, 8), match='5'>
<re.Match object; span=(7, 8), match='7'>
<re.Match object; span=(7, 8), match='7'>
<re.Match object; span=(77, 78), match='7'>
<re.Match object; span=(7, 8), match='6'>
<re.Match object; span=(17, 18), match='5'>
<re.Match object; span=(167, 168), match='5'>
<re.Match object; span=(7, 8), match='6'>
<re.Match object; span=(61, 62), match='7'>
None
None
<re.Match object; span=(71, 72), match='7'>
None
<re.Match object; span=(66, 67), match='5'>
<re.Match object; span=(17, 18), match='4'>
<re.Match object; span=(186, 187), match='5'>
<re.Match object; span=(78, 79), match='5'>
None
<re.Match object; span=(26, 27), match='6'>
<re.Match object; span=(183, 184), match='5'>
<re.Match object; span=(26, 27), match='5'>
<re.Match object; span=(129, 130), match='7'>
None
<re.Match object; span=(67, 68), match='4'>
<re.Match object; span=(7, 8), match='3'>
<re.Match object; span=(26, 27), match='5'>


KeyboardInterrupt: 

In [None]:
csv_file_path = '/usr1/home/s124mdg41_08/FinLLM-FOMC/data/sentiment_score/sentiment_analysis_results.csv'
result_df.to_csv(csv_file_path, index=False)

print(f"Results saved to {csv_file_path}")