In [None]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117

In [None]:
!pip install -q -U transformers=="4.38.2"
!pip install -q accelerate
!pip install -q -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U datasets

In [None]:
!pip install -q -U git+https://github.com/huggingface/trl
!pip install -q -U git+https://github.com/huggingface/peft

In [None]:
import os
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Turn off the tokenizers library's own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the session,
# including pandas' SettingWithCopyWarning raised by the column assignments below.
warnings.filterwarnings("ignore")

In [None]:
!pip install -q -U datasets==2.17.0

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from datasets import Dataset
from peft import LoraConfig, PeftConfig
import bitsandbytes as bnb
from trl import SFTTrainer

2024-05-04 09:44:53.876311: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 09:44:53.876388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 09:44:53.878001: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Local Kaggle copy of the Gemma 7B instruction-tuned checkpoint.
model_name = "/kaggle/input/gemma/transformers/7b-it/1"

# Matrix multiplications on the de-quantized 4-bit weights run in half precision.
compute_dtype = torch.float16

# 4-bit NF4 quantization, without the second (double) quantization pass.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Config tweaks applied right after loading.
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)
EOS_TOKEN = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
model.config

GemmaConfig {
  "_name_or_path": "/kaggle/input/gemma/transformers/7b-it/1",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 24576,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 16,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-

In [None]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
   

# Dataset Loading & Basic Preprocessing

In [None]:
# Two CSV files from the Kaggle "finance-bench-dataset" input directory.
file_path_1 = "/kaggle/input/finance-bench-dataset/10_labels.csv"
file_path_2 = "/kaggle/input/finance-bench-dataset/syntheses_10.csv"
Model_Generated_Data = pd.read_csv(file_path_1)
# Finance_data is the frame the rest of the notebook slices into test and train parts.
Finance_data = pd.read_csv(file_path_2)
print(Finance_data.head())

In [None]:
# Hold out the first 50 rows for evaluation and keep the last 100 rows for fine-tuning.
# .copy() makes both frames independent of Finance_data: later cells add and drop
# columns on them, which on a bare head()/tail() slice triggers SettingWithCopyWarning.
# NOTE(review): the two parts only stay disjoint if Finance_data has at least 150 rows.
Finance_Reduced_Test = Finance_data.head(50).copy()
Finance_Reduced_Train = Finance_data.tail(100).copy()
# Print both sizes to verify the split
print(len(Finance_Reduced_Train))
print(len(Finance_Reduced_Test))

# Zero Shot Prompt Engineering

In [None]:
SYSTEM_PROMPT = """You are a financial chatbot trained to answer questions based on the information provided.
Your responses should be directly sourced from the content of these evidence_text(context).
When asked a question, ensure that your answer is explicitly supported by the text and do not
include any external information, interpretations, or assumptions not clearly stated in the evidence_text(context).

Your primary focus should be on accuracy, specificity, and adherence to the information in the evidence_text(context),
particularly regarding financial statements, company performance, and market positions."""

count = 0
syntheses_with_gemma = []

for idx, row in Finance_Reduced_Test.iterrows():
    question = row['question']
    evidence_text = row['evidence_text']

    # Gemma's chat format closes the user turn and then opens the model turn;
    # without the trailing "<start_of_turn>model\n" the model is not cued to answer.
    prompt = f"""<start_of_turn>user\n{SYSTEM_PROMPT}\n
        {question}\n{evidence_text}\n<end_of_turn>\n<start_of_turn>model\n"""

    # Tokenize the prompt and move the tensors to the model's device
    # (the model is loaded with device_map="auto", the tokenizer returns CPU tensors).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response; passing the whole encoding also supplies the attention mask.
    output = model.generate(**inputs, max_new_tokens=100)

    # Keep only the newly generated tokens. Searching the decoded text for
    # "<end_of_turn>" is fragile: when the marker is missing, str.find returns -1
    # and the slice silently starts 12 characters into the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    answer_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    syntheses_with_gemma.append(answer_text)

    # Progress counter (1-based)
    count = count + 1
    print(count)

# Attach the generations as a new column; assign() returns a new frame, so this
# does not write into a slice of Finance_data.
Finance_Reduced_Test = Finance_Reduced_Test.assign(Generated_BY_GEMMA=syntheses_with_gemma)

In [None]:
Finance_Reduced_Test.head()

In [None]:
from IPython.display import FileLink

# Take an independent copy of the test frame (every column keeps its name),
# write it to disk and show a download link.
zero_shot_csv = 'Zero_Shot_Gemma.csv'
Zero_Shot_Gemma = Finance_Reduced_Test.copy()
Zero_Shot_Gemma.to_csv(zero_shot_csv, index=False)
FileLink(zero_shot_csv)

In [None]:
Zero_Shot_Gemma.head()

# Zero Shot Test

In [None]:
# Zero-shot generations are read back from a Kaggle input dataset.
# NOTE(review): this is not the file written to the working directory above —
# presumably it was uploaded manually as the "zero-shot-gemma" dataset; confirm.
file_path_3 = "/kaggle/input/zero-shot-gemma/Zero_Shot_Gemma.csv"
zero_shot_test = pd.read_csv(file_path_3)
zero_shot_test.head()

In [None]:
# # Selecting the 'answer' and 'Generated_BY_GEMMA' columns
# selected_columns = ['answer', 'Generated_BY_GEMMA']

# # Printing the first 50 rows with a dash line between the columns
# for index, row in zero_shot_test[selected_columns].head(50).iterrows():
#     print(f"{row['answer']} --------- {row['Generated_BY_GEMMA']}")
#     print('--' * 50)  # Dash line separator

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def compute_rouge_l(candidate, reference):
    """Return the ROUGE-L recall of ``candidate`` against ``reference``.

    The score is ``LCS(candidate, reference) / len(reference)``, where LCS is
    the length of the longest common subsequence.

    Args:
        candidate: Sequence produced by the system (list of tokens or a string).
        reference: Ground-truth sequence of the same kind.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when either sequence is empty.

    Note:
        Elements are compared with ``==``, so passing plain strings compares
        them character by character. Pass token lists for word-level ROUGE-L.
    """
    m, n = len(candidate), len(reference)
    # An empty reference used to raise ZeroDivisionError.
    if m == 0 or n == 0:
        return 0.0

    # Only the previous DP row is needed, so keep two rows instead of the full table.
    previous_row = [0] * (n + 1)
    for i in range(1, m + 1):
        current_row = [0] * (n + 1)
        for j in range(1, n + 1):
            if candidate[i - 1] == reference[j - 1]:
                current_row[j] = previous_row[j - 1] + 1
            else:
                current_row[j] = max(previous_row[j], current_row[j - 1])
        previous_row = current_row

    return previous_row[n] / n

In [None]:
# word_tokenize() needs the Punkt tokenizer data in addition to the stopword list;
# newer NLTK releases look for it under the name "punkt_tab".
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
def preprocess_text(text):
    """Lower-case ``text``, split it into NLTK word tokens and drop English stopwords.

    Returns the surviving tokens as a list, in their original order.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    return [token for token in lowered_tokens if token not in english_stopwords]

In [None]:
def compute_similarity_score(answer,syntheses):
    """Cosine similarity between the bag-of-words vectors of two texts.

    Both texts go through ``preprocess_text`` (lower-casing, tokenizing,
    stopword removal); the token counts are then compared over the union
    of their vocabularies.

    Args:
        answer: Reference text.
        syntheses: Generated text.

    Returns:
        The cosine similarity as a float; ``0.0`` when either text has no
        tokens left after preprocessing.
    """
    freqdist_answer = nltk.FreqDist(preprocess_text(answer))
    freqdist_syntheses = nltk.FreqDist(preprocess_text(syntheses))

    # With no tokens on either side the count vectors would have zero columns,
    # which sklearn's cosine_similarity rejects; a zero vector scores 0 anyway.
    if not freqdist_answer or not freqdist_syntheses:
        return 0.0

    # Shared vocabulary that fixes the column order of both count vectors
    unique_tokens = set(freqdist_answer.keys()).union(freqdist_syntheses.keys())

    vector_answer = np.array([freqdist_answer[token] for token in unique_tokens]).reshape(1, -1)
    vector_syntheses = np.array([freqdist_syntheses[token] for token in unique_tokens]).reshape(1, -1)

    return float(cosine_similarity(vector_answer, vector_syntheses)[0][0])

In [None]:
import numpy as np

# NOTE(review): this binds the TfidfVectorizer class (not an instance) and nothing
# in this cell uses it; kept only in case another cell refers to the name.
tfidf_vectorizer = TfidfVectorizer

total_average_rouge_l_scores_1 = []
total_average_cosine_similarity_scores_1  = []

num_labels_1 = []
count = 0

# Score growing prefixes of the result table: first 10 rows, first 20, ... first 50.
for i in range(5):
    count += 10
    num_labels_1.append(count)
    df = zero_shot_test.head(count)

    rouge_l_scores = []
    cosine_similarity_scores = []

    for idx, row in df.iterrows():
        # An empty generation comes back from the CSV as NaN (a float), which
        # would crash text.lower(); normalise both cells to strings first.
        answer = "" if pd.isna(row['answer']) else str(row['answer'])
        syntheses = "" if pd.isna(row['Generated_BY_GEMMA']) else str(row['Generated_BY_GEMMA'])

        compute_sim_score = compute_similarity_score(answer,syntheses)
        cosine_similarity_scores.append(compute_sim_score)

        # compute_rouge_l(candidate, reference): the generated text is the
        # candidate and the human answer is the reference (they were swapped).
        rouge_l_score = compute_rouge_l(syntheses, answer)
        rouge_l_scores.append(rouge_l_score)


    total_average_cosine_similarity_score = sum(cosine_similarity_scores) / len(cosine_similarity_scores)
    total_average_cosine_similarity_scores_1.append(total_average_cosine_similarity_score)


    total_average_rouge_l_score = sum(rouge_l_scores)/len(rouge_l_scores)
    total_average_rouge_l_scores_1.append(total_average_rouge_l_score)

    print(total_average_rouge_l_scores_1)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

plt.figure(figsize=(20, 5))

# These are zero-shot results, so the labels must not mention fine-tuning or epochs.
# Only two panels are drawn, hence a 1x2 grid.

# Left panel: average ROUGE-L per prefix size
plt.subplot(1, 2, 1)
plt.plot(num_labels_1, total_average_rouge_l_scores_1, marker='o', linestyle='-', color='skyblue', linewidth=2)
plt.xlabel('Number of Labels')
plt.ylabel('Total Average ROUGE-L Score')
plt.title('Zero-Shot ROUGE-L Score')

# Right panel: average cosine similarity per prefix size
plt.subplot(1, 2, 2)
plt.plot(num_labels_1, total_average_cosine_similarity_scores_1, marker='o', linestyle='-', color='skyblue', linewidth=2)
plt.xlabel('Number of Labels')
plt.ylabel('Total Average Cosine Similarity Score')
plt.title('Zero-Shot Cosine Similarity Score');

### Zero Shot Finished

# Few Shot Prompt Engineering

In [None]:
# Rows 45-49 of the test frame. .copy() detaches the slice so that adding the
# generation column later does not write into a view of Finance_Reduced_Test.
Few_Shot_7 = Finance_Reduced_Test[45:50].copy()
len(Few_Shot_7)

In [None]:
SYSTEM_PROMPT = """You are a financial chatbot trained to answer questions based on the information provided.
Your responses should be directly sourced from the content of these evidence_text(context).
When asked a question, ensure that your answer is explicitly supported by the text and do not
include any external information, interpretations, or assumptions not clearly stated in the evidence_text(context).

Your primary focus should be on accuracy, specificity, and adherence to the information in the evidence_text(context),
particularly regarding financial statements, company performance, and market positions.

Here are some sample Questions, evidence text and answers.

question 1: Was there any drop in Cash & Cash equivalents between FY 2023 and Q2 of FY2024?
evidence_text 1: July 29, 2023 January 28, 2023 July 30, 2022 Cash and cash equivalents $ 1,093 $ 1,874 $ 840
answer 1: Yes, there was a decline of ~42% between FY2023 and Q2 of FY 2024.

question 2: We need to calculate a financial metric by using information only provided within the balance sheet. Please answer the following question: what is Boeing's year end FY2018 net property, plant, and equipment (in USD millions)?
evidence_text 2: Table of Contents The Boeing Company and Subsidiaries Consolidated Statements of Financial Position   (Dollars in millions, except per share data)       December 31, 2018   2017 Assets       Cash and cash equivalents $7,637   $8,813 Short-term and other investments 927   1,179 Accounts receivable, net 3,879   2,894 Unbilled receivables, net 10,025   8,194 Current portion of customer financing, net 460   309 Inventories 62,567   61,388 Other current assets 2,335   2,417 Total current assets 87,830   85,194 Customer financing, net 2,418   2,756 Property, plant and equipment, net 12,645   12,672 Goodwill 7,840   5,559 Acquired intangible assets, net 3,429   2,573 Deferred income taxes 284   321 Investments 1,087   1,260 Other assets, net of accumulated amortization of $503 and $482 1,826   2,027 Total assets $117,359   $112,362 Liabilities and equity       Accounts payable $12,916   $12,202 Accrued liabilities 14,808   13,069 Advances and progress billings 50,676   48,042 Short-term debt and current portion of long-term debt 3,190   1,335 Total current liabilities 81,590   74,648 Deferred income taxes 1,736   2,188 Accrued retiree health care 4,584   5,545 Accrued pension plan liability, net 15,323   16,471 Other long-term liabilities 3,059   2,015 Long-term debt 10,657   9,782 Shareholders’ equity:       Common stock, par value $5.00 – 1,200,000,000 shares authorized; 1,012,261,159 shares issued 5,061   5,061 Additional paid-in capital 6,768   6,804 Treasury stock, at cost (52,348)   (43,454) Retained earnings 55,941   49,618 Accumulated other comprehensive loss (15,083)   (16,373) Total shareholders’ equity 339   1,656 Noncontrolling interests 71   57 Total equity 410   1,713 Total liabilities and equity $117,359   $112,362 See Notes to the Consolidated Financial Statements on pages 54 – 113 . 50
answer 2: $12645.00

question 3: At the Pepsico AGM held on May 3, 2023, what was the outcome of the shareholder vote on the shareholder proposal for a congruency report by Pepsico on net-zero emissions policies?
evidence_text 3: (8) The shareholder proposal regarding a congruency report on net-zero emissions policies was defeated: For 19,718,780 Against 977,228,788
answer 3: The shareholder proposal for a congruency report by Pepsico on net-zero emissions policies was defeated.

Now give answer to questions provided below from the evidence text."""

count = 0
syntheses_with_gemma = []

for idx, row in Few_Shot_7.iterrows():
    question = row['question']
    evidence_text = row['evidence_text']

    # Gemma's chat format closes the user turn and then opens the model turn;
    # without the trailing "<start_of_turn>model\n" the model is not cued to answer.
    prompt = f"""<start_of_turn>user\n{SYSTEM_PROMPT}\n
        {question}\n{evidence_text}\n<end_of_turn>\n<start_of_turn>model\n"""

    # Tokenize the prompt and move the tensors to the model's device
    # (the model is loaded with device_map="auto", the tokenizer returns CPU tensors).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response; passing the whole encoding also supplies the attention mask.
    output = model.generate(**inputs, max_new_tokens=100)

    # Keep only the newly generated tokens. Searching the decoded text for
    # "<end_of_turn>" is fragile: when the marker is missing, str.find returns -1
    # and the slice silently starts 12 characters into the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    answer_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    syntheses_with_gemma.append(answer_text)

    # Progress counter (1-based)
    count = count + 1
    print(count)

# Attach the generations as a new column; assign() returns a new frame instead of
# writing into a slice of the test frame.
Few_Shot_7 = Few_Shot_7.assign(Generated_BY_GEMMA=syntheses_with_gemma)

In [None]:
Few_Shot_7.head()

In [None]:
from IPython.display import FileLink

# Take an independent copy of the few-shot frame (every column keeps its name),
# write it to disk and show a download link.
few_shot_csv = 'Few_Shot_Gemma_7.csv'
Few_Shot_Gemma_7 = Few_Shot_7.copy()
Few_Shot_Gemma_7.to_csv(few_shot_csv, index=False)
FileLink(few_shot_csv)

In [None]:
# Seven few-shot result files, read from the Kaggle "few-shot-gemma" input dataset.
# Note: file_path_1..3 reuse (and overwrite) names assigned in earlier cells.
file_path_1 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_1.csv"
file_path_2 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_2.csv"
file_path_3 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_3.csv"
file_path_4 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_4.csv"
file_path_5 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_5.csv"
file_path_6 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_6.csv"
file_path_7 = "/kaggle/input/few-shot-gemma/Few_Shot_Gemma_7.csv"
# Few_Shot_Gemma_7 here replaces the in-memory frame of the same name built above.
Few_Shot_Gemma_1 = pd.read_csv(file_path_1)
Few_Shot_Gemma_2 = pd.read_csv(file_path_2)
Few_Shot_Gemma_3 = pd.read_csv(file_path_3)
Few_Shot_Gemma_4 = pd.read_csv(file_path_4)
Few_Shot_Gemma_5 = pd.read_csv(file_path_5)
Few_Shot_Gemma_6 = pd.read_csv(file_path_6)
Few_Shot_Gemma_7 = pd.read_csv(file_path_7)

In [None]:
# Stack the seven parts in order into one frame with a fresh 0..N-1 index.
Few_Shot_Gemma_Merged = pd.concat([Few_Shot_Gemma_1, Few_Shot_Gemma_2, Few_Shot_Gemma_3, Few_Shot_Gemma_4, Few_Shot_Gemma_5, Few_Shot_Gemma_6, Few_Shot_Gemma_7], ignore_index=True)


In [None]:
# # Selecting the 'answer' and 'Generated_BY_GEMMA' columns
# selected_columns = ['answer', 'Generated_BY_GEMMA']

# # Printing the first 50 rows with a dash line between the columns
# for index, row in Few_Shot_Gemma_Merged[selected_columns].head(50).iterrows():
#     print(f"{row['answer']} --------- {row['Generated_BY_GEMMA']}")
#     print('--' * 50)  # Dash line separator

In [None]:
from IPython.display import FileLink
# Write the merged few-shot results to the working directory and show a download link.
Few_Shot_Gemma_Merged.to_csv('Few_Shot_Gemma_Merged.csv', index=False)
FileLink('Few_Shot_Gemma_Merged.csv')

In [None]:
len(Few_Shot_Gemma_Merged)

# Test Few Shot

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def compute_rouge_l(candidate, reference):
    """Return the ROUGE-L recall of ``candidate`` against ``reference``.

    The score is ``LCS(candidate, reference) / len(reference)``, where LCS is
    the length of the longest common subsequence.

    Args:
        candidate: Sequence produced by the system (list of tokens or a string).
        reference: Ground-truth sequence of the same kind.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when either sequence is empty.

    Note:
        Elements are compared with ``==``, so passing plain strings compares
        them character by character. Pass token lists for word-level ROUGE-L.
    """
    m, n = len(candidate), len(reference)
    # An empty reference used to raise ZeroDivisionError.
    if m == 0 or n == 0:
        return 0.0

    # Only the previous DP row is needed, so keep two rows instead of the full table.
    previous_row = [0] * (n + 1)
    for i in range(1, m + 1):
        current_row = [0] * (n + 1)
        for j in range(1, n + 1):
            if candidate[i - 1] == reference[j - 1]:
                current_row[j] = previous_row[j - 1] + 1
            else:
                current_row[j] = max(previous_row[j], current_row[j - 1])
        previous_row = current_row

    return previous_row[n] / n

In [None]:
# word_tokenize() needs the Punkt tokenizer data in addition to the stopword list;
# newer NLTK releases look for it under the name "punkt_tab".
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
def preprocess_text(text):
    """Lower-case ``text``, split it into NLTK word tokens and drop English stopwords.

    Returns the surviving tokens as a list, in their original order.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    return [token for token in lowered_tokens if token not in english_stopwords]

In [None]:
def compute_similarity_score(answer,syntheses):
    """Cosine similarity between the bag-of-words vectors of two texts.

    Both texts go through ``preprocess_text`` (lower-casing, tokenizing,
    stopword removal); the token counts are then compared over the union
    of their vocabularies.

    Args:
        answer: Reference text.
        syntheses: Generated text.

    Returns:
        The cosine similarity as a float; ``0.0`` when either text has no
        tokens left after preprocessing.
    """
    freqdist_answer = nltk.FreqDist(preprocess_text(answer))
    freqdist_syntheses = nltk.FreqDist(preprocess_text(syntheses))

    # With no tokens on either side the count vectors would have zero columns,
    # which sklearn's cosine_similarity rejects; a zero vector scores 0 anyway.
    if not freqdist_answer or not freqdist_syntheses:
        return 0.0

    # Shared vocabulary that fixes the column order of both count vectors
    unique_tokens = set(freqdist_answer.keys()).union(freqdist_syntheses.keys())

    vector_answer = np.array([freqdist_answer[token] for token in unique_tokens]).reshape(1, -1)
    vector_syntheses = np.array([freqdist_syntheses[token] for token in unique_tokens]).reshape(1, -1)

    return float(cosine_similarity(vector_answer, vector_syntheses)[0][0])

In [None]:
import numpy as np

# NOTE(review): this binds the TfidfVectorizer class (not an instance) and nothing
# in this cell uses it; kept only in case another cell refers to the name.
tfidf_vectorizer = TfidfVectorizer

total_average_rouge_l_scores_1 = []
total_average_cosine_similarity_scores_1  = []

num_labels_1 = []
count = 0

# Score growing prefixes of the merged table: first 10 rows, first 20, ... first 50.
for i in range(5):
    count += 10
    num_labels_1.append(count)
    df = Few_Shot_Gemma_Merged.head(count)

    rouge_l_scores = []
    cosine_similarity_scores = []

    for idx, row in df.iterrows():
        # An empty generation comes back from the CSV as NaN (a float), which
        # would crash text.lower(); normalise both cells to strings first.
        answer = "" if pd.isna(row['answer']) else str(row['answer'])
        syntheses = "" if pd.isna(row['Generated_BY_GEMMA']) else str(row['Generated_BY_GEMMA'])

        compute_sim_score = compute_similarity_score(answer,syntheses)
        cosine_similarity_scores.append(compute_sim_score)

        # compute_rouge_l(candidate, reference): the generated text is the
        # candidate and the human answer is the reference (they were swapped).
        rouge_l_score = compute_rouge_l(syntheses, answer)
        rouge_l_scores.append(rouge_l_score)


    total_average_cosine_similarity_score = sum(cosine_similarity_scores) / len(cosine_similarity_scores)
    total_average_cosine_similarity_scores_1.append(total_average_cosine_similarity_score)


    total_average_rouge_l_score = sum(rouge_l_scores)/len(rouge_l_scores)
    total_average_rouge_l_scores_1.append(total_average_rouge_l_score)

    print(total_average_cosine_similarity_scores_1)
    print(total_average_rouge_l_scores_1)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

plt.figure(figsize=(20, 5))

# These are few-shot results, so the labels must not mention fine-tuning or epochs.
# Only two panels are drawn, hence a 1x2 grid.

# Left panel: average ROUGE-L per prefix size
plt.subplot(1, 2, 1)
plt.plot(num_labels_1, total_average_rouge_l_scores_1, marker='o', linestyle='-', color='skyblue', linewidth=2)
plt.xlabel('Number of Labels')
plt.ylabel('Total Average ROUGE-L Score')
plt.title('Few-Shot ROUGE-L Score')

# Right panel: average cosine similarity per prefix size
plt.subplot(1, 2, 2)
plt.plot(num_labels_1, total_average_cosine_similarity_scores_1, marker='o', linestyle='-', color='skyblue', linewidth=2)
plt.xlabel('Number of Labels')
plt.ylabel('Total Average Cosine Similarity Score')
plt.title('Few-Shot Cosine Similarity Score');

### Few Shot Finished

# Fine Tuning Preprocessing

In [None]:
# Rebind instead of dropping in place: the in-place form mutated a tail() slice of
# Finance_data and raised KeyError when the cell was run a second time.
Finance_Reduced_Train = Finance_Reduced_Train.drop(columns='syntheses', errors='ignore')
Finance_Reduced_Train.columns

In [None]:
# First 80 training rows are used for fine-tuning, the last 20 for validation.
Finance_Reduced_Train_Reduced_80 = Finance_Reduced_Train.head(80)
Validation_20 = Finance_Reduced_Train.tail(20)
# Only a cell's last bare expression is displayed, so the first len() was
# silently discarded; print both sizes explicitly.
print(len(Finance_Reduced_Train_Reduced_80), len(Validation_20))

In [None]:
import pandas as pd
from datasets import Dataset


# The instruction text is identical for every example, so define it once
# instead of rebuilding it on every loop iteration.
SYSTEM_PROMPT = """You are a financial chatbot trained to answer questions based on the information provided.
Your responses should be directly sourced from the content of these evidence_text(context).
When asked a question, ensure that your answer is explicitly supported by the text and do not
include any external information, interpretations, or assumptions not clearly stated in the evidence_text(context).
If a question pertains to financial data or analysis that is not explicitly covered in the evidence_text(context) provided,
respond by stating that the information is not available in the evidence_text(context).
Your primary focus should be on accuracy, specificity, and adherence to the information in the evidence_text(context),
particularly regarding financial statements, company performance, and market positions."""

# One training text per row: a user turn (instructions, question, evidence)
# followed by a model turn holding the reference answer.
prompts = []
for idx, row in Finance_Reduced_Train_Reduced_80.iterrows():
    question = row['question']
    answer = row['answer']
    evidence_text = row['evidence_text']

    prompt = f"<start_of_turn>user\n{SYSTEM_PROMPT}\n{question}\n{evidence_text}\n<end_of_turn>\n<start_of_turn>model\n{answer}\n<end_of_turn>"
    prompts.append(prompt)

# Single-column dataset; the column name 'text' is what the trainer is pointed at.
dataset = Dataset.from_pandas(pd.DataFrame({'text': prompts}))
dataset


In [None]:
# The instruction text is identical for every example, so define it once
# instead of rebuilding it on every loop iteration.
SYSTEM_PROMPT = """You are a financial chatbot trained to answer questions based on the information provided.
Your responses should be directly sourced from the content of these evidence_text(context).
When asked a question, ensure that your answer is explicitly supported by the text and do not
include any external information, interpretations, or assumptions not clearly stated in the evidence_text(context).
If a question pertains to financial data or analysis that is not explicitly covered in the evidence_text(context) provided,
respond by stating that the information is not available in the evidence_text(context).
Your primary focus should be on accuracy, specificity, and adherence to the information in the evidence_text(context),
particularly regarding financial statements, company performance, and market positions."""

# One validation text per row, in the same user-turn / model-turn layout as the
# training texts.
prompts = []
for idx, row in Validation_20.iterrows():
    question = row['question']
    answer = row['answer']
    evidence_text = row['evidence_text']

    prompt = f"<start_of_turn>user\n{SYSTEM_PROMPT}\n{question}\n{evidence_text}\n<end_of_turn>\n<start_of_turn>model\n{answer}\n<end_of_turn>"
    prompts.append(prompt)

# Single-column dataset; the column name 'text' is what the trainer is pointed at.
eval_dataset = Dataset.from_pandas(pd.DataFrame({'text': prompts}))
eval_dataset


# Fine Tuning

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters
    and over those with ``requires_grad`` set, and prints both totals together
    with the trainable share in percent.
    """
    # (element count, is trainable) for every parameter tensor
    sizes = [(weight.numel(), weight.requires_grad) for _, weight in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, is_trainable in sizes if is_trainable)
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let PEFT prepare the quantized model for training.
# Note: this rebinds `model`, so the zero-/few-shot cells above should not be re-run
# after this point without reloading the base model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print(model)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 16, alpha 64, 10% dropout, no bias training.
# The target modules are the four attention projections and the three MLP
# projections that appear as Linear4bit layers in the model printout above.
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    # target_modules=["query_key_value"],
    target_modules=['o_proj', 'q_proj', 'up_proj', 'v_proj', 'k_proj', 'down_proj', 'gate_proj'], #specific to Gemma models.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the training arguments below set push_to_hub=True.
notebook_login()

### For 5 Epochs

In [None]:
from transformers import TrainingArguments

# Training configuration for the first run; the first positional argument is the
# output directory.
# NOTE(review): the prediction section refers to '/kaggle/working/Gemma_Finance_80_5_Epoch',
# which does not match the "Gemma_5_Epoch" directory used here — confirm the intended name.
training_arguments = TrainingArguments(
    "Gemma_5_Epoch",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-5,
    fp16=True,
    weight_decay=0.01,
    max_grad_norm=0.3,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=0.2,  # presumably a fraction of the total training steps (value < 1) — confirm
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    lr_scheduler_type="cosine",
    seed=42,
    push_to_hub=True,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Every training text starts with a system prompt of roughly 120 words, which is
# already longer than 100 tokens. With max_seq_length=100 each example was cut off
# inside the instructions, so the question, evidence and answer never reached the
# model. 1024 keeps the full turn for most rows; raise or lower it to fit GPU memory.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

# trainer.train() returns the training summary (steps, loss, metrics).
Fine_tuned_5_Epoch = trainer.train()


### For 10 Epochs

In [None]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    "Gemma_Finance_80_10_Epoch",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-5,
    fp16=True,
    weight_decay=0.01,
    max_grad_norm=0.3,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    lr_scheduler_type="cosine",
    seed=42,
    push_to_hub=True,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Every training text starts with a system prompt of roughly 120 words, which is
# already longer than 100 tokens. With max_seq_length=100 each example was cut off
# inside the instructions, so the question, evidence and answer never reached the
# model. 1024 keeps the full turn for most rows; raise or lower it to fit GPU memory.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

# trainer.train() returns the training summary (steps, loss, metrics).
Fine_tuned_80_10_Epoch = trainer.train()


### For 15 Epochs

In [None]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    "Gemma_Finance_80_15_Epoch_2nd",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-5,
    fp16=True,
    weight_decay=0.01,
    max_grad_norm=0.3,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    lr_scheduler_type="cosine",
    seed=42,
    push_to_hub=True,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Every training text starts with a system prompt of roughly 120 words, which is
# already longer than 100 tokens. With max_seq_length=100 each example was cut off
# inside the instructions, so the question, evidence and answer never reached the
# model. 1024 keeps the full turn for most rows; raise or lower it to fit GPU memory.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

# trainer.train() returns the training summary (steps, loss, metrics).
Fine_tuned_80_15_Epoch = trainer.train()


# Prediction

In [None]:
# model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers

# Output directories of the three fine-tuning runs.
# NOTE(review): model_id_1 does not match the "Gemma_5_Epoch" output directory used by
# the first training run above — confirm before re-enabling the 5-epoch block.
model_id_1 = '/kaggle/working/Gemma_Finance_80_5_Epoch'
model_id_2 = '/kaggle/working/Gemma_Finance_80_10_Epoch'
model_id_3 = '/kaggle/working/Gemma_Finance_80_15_Epoch_2nd'

#device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# This rebinds bnb_config with settings that differ from the ones used when the base
# model was loaded for training (double quantization on, bfloat16 compute instead of
# float16).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# fine_tuned_gemma_5_Epoch = AutoModelForCausalLM.from_pretrained(model_id_1, quantization_config=bnb_config, device_map="auto")
# tokenizer_1 = AutoTokenizer.from_pretrained(model_id_1)
# tokenizer_1.pad_token = tokenizer_1.eos_token
# tokenizer_1.padding_side = "right"

# fine_tuned_gemma_10_Epoch = AutoModelForCausalLM.from_pretrained(model_id_2, quantization_config=bnb_config, device_map="auto")
# tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2)
# tokenizer_2.pad_token = tokenizer_2.eos_token
# tokenizer_2.padding_side = "right"

# Only the third run's checkpoint is loaded; the other two blocks are commented out.
fine_tuned_gemma_15_Epoch = AutoModelForCausalLM.from_pretrained(model_id_3, quantization_config=bnb_config, device_map="auto")
tokenizer_3 = AutoTokenizer.from_pretrained(model_id_3)
# Reuse the EOS token for padding and pad on the right.
tokenizer_3.pad_token = tokenizer_3.eos_token
tokenizer_3.padding_side = "right"

In [None]:
import pandas as pd

# Re-read the source CSV; this replaces the Finance_data frame loaded at the top of
# the notebook.
file_path_2 = "/kaggle/input/finance-bench-dataset/syntheses_10.csv"
Finance_data = pd.read_csv(file_path_2)
print(Finance_data.head())

In [None]:
# Rebind instead of dropping in place so the cell can be re-run: the in-place form
# raised KeyError the second time because the column was already gone.
Finance_data = Finance_data.drop(columns='syntheses', errors='ignore')
Finance_data.columns

In [None]:
# First 50 rows are the evaluation set. .copy() detaches them from Finance_data so
# that adding the generation column later does not write into a slice.
Finance_Reduced_Test = Finance_data.head(50).copy()
len(Finance_Reduced_Test)

In [None]:
SYSTEM_PROMPT = """Give answer to questions provided below from the evidence text."""

count = 0
syntheses_with_gemma = []

for idx, row in Finance_Reduced_Test.iterrows():
    question = row['question']
    evidence_text = row['evidence_text']

    # The training texts continue with "<start_of_turn>model\n" after the user turn,
    # so the inference prompt has to open the model turn the same way.
    prompt = f"""<start_of_turn>user\n{SYSTEM_PROMPT}\n
        {question}\n{evidence_text}\n<end_of_turn>\n<start_of_turn>model\n"""

    # Tokenize the prompt and move the tensors to the model's device
    # (the model is loaded with device_map="auto", the tokenizer returns CPU tensors).
    inputs = tokenizer_3(prompt, return_tensors="pt").to(fine_tuned_gemma_15_Epoch.device)

    # Generate response; passing the whole encoding also supplies the attention mask.
    output = fine_tuned_gemma_15_Epoch.generate(**inputs, max_new_tokens=100)

    # Keep only the newly generated tokens. Searching the decoded text for
    # "<end_of_turn>" is fragile: when the marker is missing, str.find returns -1
    # and the slice silently starts 12 characters into the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    answer_text = tokenizer_3.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    syntheses_with_gemma.append(answer_text)

    # Progress counter (1-based)
    count = count + 1
    print(count)

# Attach the generations as a new column; assign() returns a new frame, so this
# does not write into a slice of Finance_data.
Finance_Reduced_Test = Finance_Reduced_Test.assign(Generated_BY_GEMMA=syntheses_with_gemma)

In [None]:
from IPython.display import FileLink
# Snapshot the results under an epoch-specific name and export them. Column
# names are kept exactly as they are ('question', 'answer', 'evidence_text',
# 'Generated_BY_GEMMA'); an identity rename() is not needed to get a copy.
fine_tuned_15_Epoch = Finance_Reduced_Test.copy()
fine_tuned_15_Epoch.to_csv('fine_tuned_15_Epoch_Gemma.csv', index=False)
# Render a download link for the exported file as the cell output.
FileLink('fine_tuned_15_Epoch_Gemma.csv')

In [None]:
# Preview the frame that was just exported in this (15-epoch) section.
# NOTE(review): this cell previously showed fine_tuned_5_Epoch, which looks like
# a leftover from the copied 5-epoch section — confirm.
fine_tuned_15_Epoch.head()

In [None]:
# Inspect one generated answer (row label 17). The column is spelled
# 'Generated_BY_GEMMA' where it is created and where the evaluation reads it;
# .loc does the lookup in a single step instead of chained indexing.
# NOTE(review): switched from fine_tuned_5_Epoch to the 15-epoch frame of this
# section — confirm that is the frame you meant to inspect.
fine_tuned_15_Epoch.loc[17, 'Generated_BY_GEMMA']

# Evaluation

In [None]:
import pandas as pd
# CSV exports named for the 5-, 10- and 15-epoch runs.
file_path_1 = "/kaggle/input/fine-tuned-gemma/fine_tuned_5_Epoch_Gemma (1).csv"
file_path_2 = "/kaggle/input/fine-tuned-gemma/fine_tuned_10_Epoch_Gemma.csv"
file_path_3 = "/kaggle/input/fine-tuned-gemma/fine_tuned_15_Epoch_Gemma.csv"

# Read the three files in order and bind each frame to its epoch-specific name.
Fine_Tuned_Gemma_5_Epoch, Fine_Tuned_Gemma_10_Epoch, Fine_Tuned_Gemma_15_Epoch = (
    pd.read_csv(csv_path) for csv_path in (file_path_1, file_path_2, file_path_3)
)


In [None]:
# Columns to compare side by side: the reference answer and the model output.
selected_columns = ['answer', 'Generated_BY_GEMMA']

# Walk the first 50 rows as plain (answer, generated) pairs and print each pair
# followed by a 100-character dash separator.
first_rows = Fine_Tuned_Gemma_5_Epoch.loc[:, selected_columns].iloc[:50]
for reference_answer, generated_answer in first_rows.itertuples(index=False, name=None):
    print(f"{reference_answer} --------- {generated_answer}")
    print('--' * 50)  # Dash line separator

$1577.00 --------- The text does not specify the FY2018 capital expenditure amount for 3M, therefore I cannot answer this question.
----------------------------------------------------------------------------------------------------
$8.70 --------- The text does not provide information about the year end FY2018 net PPNE for 3M, therefore I cannot answer this question.
----------------------------------------------------------------------------------------------------
No, the company is managing its CAPEX and Fixed Assets pretty efficiently, which is evident from below key metrics:
CAPEX/Revenue Ratio: 5.1%
Fixed assets/Total Assets: 20%
Return on Assets= 12.4% --------- The text does not explicitly state whether 3M is a capital-intensive business based on FY2022 data, therefore I cannot answer this question.
----------------------------------------------------------------------------------------------------
Operating Margin for 3M in FY2022 has decreased by 1.7% primarily due to: 
-Dec

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
def compute_rouge_l(candidate, reference):
    """Return the LCS length of two sequences divided by ``len(reference)``.

    Both arguments may be any indexable sequences whose items support ``==``:
    strings give a character-level comparison, token lists a token-level one.

    Args:
        candidate: Sequence being scored.
        reference: Sequence scored against; its length is the denominator.

    Returns:
        float: ``LCS(candidate, reference) / len(reference)`` in ``[0, 1]``.
        An empty ``reference`` yields ``0.0`` instead of a ZeroDivisionError.
    """
    m, n = len(candidate), len(reference)
    if n == 0:
        # Nothing to match against; avoid dividing by zero.
        return 0.0

    # Classic LCS dynamic programme. Only the previous row is ever read, so two
    # rows of length n + 1 are enough instead of the full (m + 1) x (n + 1) table.
    previous_row = [0] * (n + 1)
    for i in range(1, m + 1):
        current_row = [0] * (n + 1)
        for j in range(1, n + 1):
            if candidate[i - 1] == reference[j - 1]:
                current_row[j] = previous_row[j - 1] + 1
            else:
                current_row[j] = max(previous_row[j], current_row[j - 1])
        previous_row = current_row

    return previous_row[n] / n

In [None]:
# preprocess_text needs two NLTK data sets: the stopword list and the Punkt
# tokenizer data that word_tokenize loads. Newer NLTK releases look for
# 'punkt_tab' rather than 'punkt', so request both; nltk.download() skips
# packages that are already up to date.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
def preprocess_text(text):
    """Lower-case and tokenize ``text``, then drop English stopwords.

    Args:
        text: The text to process. Missing values (``None`` / NaN, e.g. an
            empty CSV cell loaded by pandas) are treated as an empty string;
            any other non-string value is converted with ``str()``.

    Returns:
        list[str]: The lower-cased tokens produced by ``word_tokenize`` that are
        not in NLTK's English stopword list, in their original order.
    """
    if not isinstance(text, str):
        # pd.read_csv turns empty cells into float NaN, which has no .lower().
        text = "" if pd.isna(text) else str(text)

    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Remove stopwords (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

In [None]:
def compute_similarity_score(answer, syntheses):
    """Cosine similarity between the bag-of-words count vectors of two texts.

    Both texts go through ``preprocess_text``; the remaining tokens are counted
    and the two count vectors are laid out over the union of their vocabularies.

    Args:
        answer: First text (the reference answer at the call sites in this notebook).
        syntheses: Second text (the generated answer at the call sites in this notebook).

    Returns:
        float: Cosine similarity of the two count vectors. ``0.0`` when neither
        text has any token left after preprocessing.
    """
    # preprocess_text already returns a token list, so it can be counted
    # directly; joining the tokens with spaces and splitting again is not needed.
    freqdist_answer = nltk.FreqDist(preprocess_text(answer))
    freqdist_syntheses = nltk.FreqDist(preprocess_text(syntheses))

    # Union vocabulary, sorted so the vector layout does not depend on set
    # iteration order.
    unique_tokens = sorted(set(freqdist_answer) | set(freqdist_syntheses))
    if not unique_tokens:
        # cosine_similarity rejects arrays with zero features; two empty texts
        # share no content, so report no similarity.
        return 0.0

    # FreqDist returns 0 for tokens it has not seen, so both vectors are aligned.
    vector_answer = np.array([freqdist_answer[token] for token in unique_tokens]).reshape(1, -1)
    vector_syntheses = np.array([freqdist_syntheses[token] for token in unique_tokens]).reshape(1, -1)

    return cosine_similarity(vector_answer, vector_syntheses)[0][0]

### 5 Epoch Test

In [None]:
import numpy as np

# Outputs consumed by the plotting cell below.
total_average_rouge_l_scores_1 = []
total_average_cosine_similarity_scores_1 = []
num_labels_1 = []

# Score each of the first 50 rows exactly once. The averages over the first
# 10, 20, ..., 50 rows reuse these per-row scores instead of re-scoring the
# overlapping head(10), head(20), ... slices.
scored_rows = Fine_Tuned_Gemma_5_Epoch.head(50)
rouge_l_scores = []
cosine_similarity_scores = []

for answer, syntheses in zip(scored_rows['answer'], scored_rows['Generated_BY_GEMMA']):
    cosine_similarity_scores.append(compute_similarity_score(answer, syntheses))

    # NOTE(review): compute_rouge_l is declared as (candidate, reference) and
    # divides by len(reference). Here the human answer is passed as `candidate`
    # and the generated text as `reference`, and both are raw strings, so the
    # LCS is character-level. Kept unchanged so the numbers stay comparable with
    # earlier runs — confirm this is intended.
    rouge_l_scores.append(compute_rouge_l(answer, syntheses))

# Running averages over growing prefixes of the scored rows.
for count in range(10, 51, 10):
    num_labels_1.append(count)

    cosine_prefix = cosine_similarity_scores[:count]
    total_average_cosine_similarity_scores_1.append(sum(cosine_prefix) / len(cosine_prefix))

    rouge_l_prefix = rouge_l_scores[:count]
    total_average_rouge_l_scores_1.append(sum(rouge_l_prefix) / len(rouge_l_prefix))

# Report the final lists once rather than the partial lists on every iteration.
print(total_average_cosine_similarity_scores_1)
print(total_average_rouge_l_scores_1)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

# One wide figure laid out as a 1x3 grid; only the first two slots are used.
fig = plt.figure(figsize=(20, 5))
line_style = dict(marker='o', linestyle='-', color='skyblue', linewidth=2)

# Slot 1: average ROUGE-L score against the number of evaluated rows.
rouge_ax = fig.add_subplot(1, 3, 1)
rouge_ax.plot(num_labels_1, total_average_rouge_l_scores_1, **line_style)
rouge_ax.set_xlabel('Number of Labels')
rouge_ax.set_ylabel('Total Average ROUGE-L Score for 5 Epoch')
rouge_ax.set_title(' After Fine-Tuning ROUGE-L Score Comparison for 5 Epoch')

# Slot 2: average cosine similarity against the number of evaluated rows.
cosine_ax = fig.add_subplot(1, 3, 2)
cosine_ax.plot(num_labels_1, total_average_cosine_similarity_scores_1, **line_style)
cosine_ax.set_xlabel('Number of Labels')
cosine_ax.set_ylabel('Total Average Cosine Similarity Score for 5 Epoch')
cosine_ax.set_title(' After Fine-Tuning Cosine Similarity Score for 5 Epoch')

### 10 Epoch Test

In [None]:
import numpy as np

# Outputs consumed by the plotting cell below.
total_average_rouge_l_scores_2 = []
total_average_cosine_similarity_scores_2 = []
num_labels_2 = []

# Score each of the first 50 rows exactly once. The averages over the first
# 10, 20, ..., 50 rows reuse these per-row scores instead of re-scoring the
# overlapping head(10), head(20), ... slices.
scored_rows = Fine_Tuned_Gemma_10_Epoch.head(50)
rouge_l_scores = []
cosine_similarity_scores = []

for answer, syntheses in zip(scored_rows['answer'], scored_rows['Generated_BY_GEMMA']):
    cosine_similarity_scores.append(compute_similarity_score(answer, syntheses))

    # NOTE(review): compute_rouge_l is declared as (candidate, reference) and
    # divides by len(reference). Here the human answer is passed as `candidate`
    # and the generated text as `reference`, and both are raw strings, so the
    # LCS is character-level. Kept unchanged so the numbers stay comparable with
    # earlier runs — confirm this is intended.
    rouge_l_scores.append(compute_rouge_l(answer, syntheses))

# Running averages over growing prefixes of the scored rows.
for count in range(10, 51, 10):
    num_labels_2.append(count)

    cosine_prefix = cosine_similarity_scores[:count]
    total_average_cosine_similarity_scores_2.append(sum(cosine_prefix) / len(cosine_prefix))

    rouge_l_prefix = rouge_l_scores[:count]
    total_average_rouge_l_scores_2.append(sum(rouge_l_prefix) / len(rouge_l_prefix))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

# One wide figure laid out as a 1x3 grid; only the first two slots are used.
fig = plt.figure(figsize=(20, 5))
line_style = dict(marker='o', linestyle='-', color='skyblue', linewidth=2)

# Slot 1: average ROUGE-L score against the number of evaluated rows.
# Labels name the 10-epoch run that num_labels_2 / *_scores_2 belong to (this
# section is headed "10 Epoch Test"); they previously said "5 Epoch".
rouge_ax = fig.add_subplot(1, 3, 1)
rouge_ax.plot(num_labels_2, total_average_rouge_l_scores_2, **line_style)
rouge_ax.set_xlabel('Number of Labels')
rouge_ax.set_ylabel('Total Average ROUGE-L Score for 10 Epoch')
rouge_ax.set_title(' After Fine-Tuning ROUGE-L Score Comparison for 10 Epoch')

# Slot 2: average cosine similarity against the number of evaluated rows.
cosine_ax = fig.add_subplot(1, 3, 2)
cosine_ax.plot(num_labels_2, total_average_cosine_similarity_scores_2, **line_style)
cosine_ax.set_xlabel('Number of Labels')
cosine_ax.set_ylabel('Total Average Cosine Similarity Score for 10 Epoch')
cosine_ax.set_title(' After Fine-Tuning Cosine Similarity Score for 10 Epoch')

### 15 Epoch Test

In [None]:
import numpy as np

# Outputs consumed by the plotting cell below.
total_average_rouge_l_scores_3 = []
total_average_cosine_similarity_scores_3 = []
num_labels_3 = []

# Score each of the first 50 rows exactly once. The averages over the first
# 10, 20, ..., 50 rows reuse these per-row scores instead of re-scoring the
# overlapping head(10), head(20), ... slices.
scored_rows = Fine_Tuned_Gemma_15_Epoch.head(50)
rouge_l_scores = []
cosine_similarity_scores = []

for answer, syntheses in zip(scored_rows['answer'], scored_rows['Generated_BY_GEMMA']):
    cosine_similarity_scores.append(compute_similarity_score(answer, syntheses))

    # NOTE(review): compute_rouge_l is declared as (candidate, reference) and
    # divides by len(reference). Here the human answer is passed as `candidate`
    # and the generated text as `reference`, and both are raw strings, so the
    # LCS is character-level. Kept unchanged so the numbers stay comparable with
    # earlier runs — confirm this is intended.
    rouge_l_scores.append(compute_rouge_l(answer, syntheses))

# Running averages over growing prefixes of the scored rows.
for count in range(10, 51, 10):
    num_labels_3.append(count)

    cosine_prefix = cosine_similarity_scores[:count]
    total_average_cosine_similarity_scores_3.append(sum(cosine_prefix) / len(cosine_prefix))

    rouge_l_prefix = rouge_l_scores[:count]
    total_average_rouge_l_scores_3.append(sum(rouge_l_prefix) / len(rouge_l_prefix))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

# One wide figure laid out as a 1x3 grid; only the first two slots are used.
fig = plt.figure(figsize=(20, 5))
line_style = dict(marker='o', linestyle='-', color='skyblue', linewidth=2)

# Slot 1: average ROUGE-L score against the number of evaluated rows.
# Labels name the 15-epoch run that num_labels_3 / *_scores_3 belong to (this
# section is headed "15 Epoch Test"); they previously said "5 Epoch".
rouge_ax = fig.add_subplot(1, 3, 1)
rouge_ax.plot(num_labels_3, total_average_rouge_l_scores_3, **line_style)
rouge_ax.set_xlabel('Number of Labels')
rouge_ax.set_ylabel('Total Average ROUGE-L Score for 15 Epoch')
rouge_ax.set_title(' After Fine-Tuning ROUGE-L Score Comparison for 15 Epoch')

# Slot 2: average cosine similarity against the number of evaluated rows.
cosine_ax = fig.add_subplot(1, 3, 2)
cosine_ax.plot(num_labels_3, total_average_cosine_similarity_scores_3, **line_style)
cosine_ax.set_xlabel('Number of Labels')
cosine_ax.set_ylabel('Total Average Cosine Similarity Score for 15 Epoch')
cosine_ax.set_title(' After Fine-Tuning Cosine Similarity Score for 15 Epoch')