## LLM Experiments

# Import libraries

In [46]:
import json
import os
import time
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import evaluate
import pandas as pd
import numpy as np
import pandas as pd
import tiktoken
import seaborn as sns
from tenacity import retry, wait_exponential
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
import evaluate
warnings.filterwarnings('ignore')
tqdm.pandas()

# Pretrained flan t5 small with dialogsum dataset (Task1)

In [2]:
huggingface_dataset_name = "knkarthick/dialogsum"
model_name='google/flan-t5-small'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset(huggingface_dataset_name)

Found cached dataset csv (/Users/inbanerj/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

# Logging mock implementation

In [3]:
def log(text):
    print(text)

# Print model parameters

In [4]:
def print_number_of_trainable_model_parameters(model):
        trainable_model_params = 0
        all_model_params = 0
        for _, param in model.named_parameters():
            all_model_params += param.numel()
            if param.requires_grad:
                trainable_model_params += param.numel()
        return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# Summarization task (Task 2)

In [5]:
def summary_generator(index):
        dialogue = dataset['test'][index]['dialogue']
        summary = dataset['test'][index]['summary']
        prompt = f"""
        Summarize the following conversation.
        {dialogue}
        Summary:
        """
        inputs = tokenizer(prompt, return_tensors='pt')
        output = tokenizer.decode(
        original_model.generate(
            inputs["input_ids"], 
            max_new_tokens=200,
            )[0], 
            skip_special_tokens=True
            )
        dash_line = '-'.join('' for x in range(100))
        log(dash_line)
        log(f'INPUT PROMPT:\n{prompt}')
        log(dash_line)
        log(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
        log(dash_line)
        log(f'MODEL GENERATION - ZERO SHOT:\n{output}')

# Question Answer task (Task 3)

In [9]:
def question_answer(query):
        prompt = f"""
        Answer the query in brief and a catchy manner in about 100 characters
        {query}
        """
        inputs = tokenizer(prompt, return_tensors='pt')
        output = tokenizer.decode(
        original_model.generate(
            inputs["input_ids"], 
            max_new_tokens=200,
            )[0], 
            skip_special_tokens=True
            )
        log(f'Generated Answer:\n{output}')

# Verify if summary generation is working (Task 2)

In [6]:
index = 105
summary_generator(index)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

        Summarize the following conversation.
        #Person1#: What's the matter, Bill? You look kind of pale.
#Person2#: Oh, I'm just tired.
#Person1#: Why?
#Person2#: Well, I've been working until around ten every night this week.
#Person1#: You should go home at quitting time today and take it easy.
#Person2#: Yes. I think I will.
#Person1#: That's good. Say, how's your brother?
#Person2#: He's fine, but he is awfully busy. He went to the States on a business trip two weeks ago.
#Person1#: Oh, really? Is he back yet?
#Person2#: No, he won't come back for several more weeks.
#Person1#: Wow! He must have a lot to do there.
#Person2#: Yes, he does.
#Person1#: I want to be sure of the time because I'm going to meet a friend at five o'clock sharp.
#Person2#: Well, my watch says 4:30, and that time should be right. I set it with the radio yesterday.
#Person1#: Good.
       

# Verify if question answer is working (Task 3)

In [15]:
query = " Who is the best footballer of all times"
question_answer(query)

# The question answer model is giving responses but not good or accurate ones

Generated Answer:
scott scott


In [18]:
query = "What is a good way to procrastinate"
question_answer(query)

# Again not very relevant responses but this is expected for a google flan t5 small model

Generated Answer:
You can use a syringe to procrastinate.


# Programatically print the parameters (Task 6)

In [8]:
log(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


# Programatically print the names of all the model layers and their dimensions (Task 5)

In [16]:
log(original_model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

# Set the tensor in final layer to all zeros

In [22]:
original_model.decoder.final_layer_norm.weight=torch.nn.Parameter(torch.zeros(512, dtype=torch.bfloat16))
print(original_model.decoder.final_layer_norm.weight)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.

# Verify if the QnA model is still working (Task 8)

In [25]:
query = "What is a good way to procrastinate"
question_answer(query)

# Model is alive but unable wo generate any response with weights zero

Generated Answer:



# Replacing the model decoder final layer with smaller dimensions (256) Task 9

In [26]:
original_model.decoder.final_layer_norm.weight=torch.nn.Parameter(torch.zeros(256, dtype=torch.bfloat16))
print(original_model.decoder.final_layer_norm.weight)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.

# Get the SQUAD dataset

In [31]:
# Download the data for rag implementation
!brew install wget
!mkdir -p local_cache
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O train.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O dev.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/wget/manifests/1.21.4[0m
######################################################################### 100.0%
[32m==>[0m [1mFetching dependencies for wget: [32mlibidn2[39m[0m
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/libidn2/manifests/2.3.4_1-1[0m
Already downloaded: /Users/inbanerj/Library/Caches/Homebrew/downloads/03ad193177f4e7d05ee2ed19a455028cb5fbf7ea1a812d88f18f5e9e8b4a4d43--libidn2-2.3.4_1-1.bottle_manifest.json
[32m==>[0m [1mFetching [32mlibidn2[39m[0m
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/libidn2/blobs/sha256:b044c66cc0[0m
Already downloaded: /Users/inbanerj/Library/Caches/Homebrew/do

# Read JSON files of the SQUAD dataset on to a dataframe

In [27]:
def json_to_dataframe_with_titles(json_data):
    qas = []
    context = []
    is_impossible = []
    answers = []
    titles = []

    for article in json_data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qas.append(qa['question'].strip())
                context.append(paragraph['context'])
                is_impossible.append(qa['is_impossible'])
                
                ans_list = []
                for ans in qa['answers']:
                    ans_list.append(ans['text'])
                answers.append(ans_list)
                titles.append(title)

    df = pd.DataFrame({'title': titles, 'question': qas, 'context': context, 'is_impossible': is_impossible, 'answers': answers})
    return df

# Build a diverse sample of the df

In [28]:
def get_diverse_sample(df, sample_size=100, random_state=42):

    sample_df = df.groupby(['title', 'is_impossible']).apply(lambda x: x.sample(min(len(x), max(1, sample_size // 50)), random_state=random_state)).reset_index(drop=True)
    
    if len(sample_df) < sample_size:
        remaining_sample_size = sample_size - len(sample_df)
        remaining_df = df.drop(sample_df.index).sample(remaining_sample_size, random_state=random_state)
        sample_df = pd.concat([sample_df, remaining_df]).sample(frac=1, random_state=random_state).reset_index(drop=True)

    return sample_df.sample(min(sample_size, len(sample_df)), random_state=random_state).reset_index(drop=True)

# Preparing the train and validation DF

In [29]:
train_df = json_to_dataframe_with_titles(json.load(open('train.json')))
val_df = json_to_dataframe_with_titles(json.load(open('dev.json')))

df = get_diverse_sample(val_df, sample_size=100, random_state=42)

# Function to get prompt messages with additional context addition (SQUAD dataset)

In [30]:
def get_prompt(row):
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": f"""Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'.
    Question: {row.question}\n\n
    Context: {row.context}\n\n
    Answer:\n""",
        },
    ]

# Reload the Google flan t5 small model (Task 10)

In [32]:
model_name='google/flan-t5-small'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset("squad")

Found cached dataset squad (/Users/inbanerj/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [65]:
df.to_json("100_val.json", orient="records", lines=True)
df = pd.read_json("100_val.json", orient="records", lines=True)
df

Unnamed: 0,title,question,context,is_impossible,answers
0,Scottish_Parliament,What consequence of establishing the Scottish ...,A procedural consequence of the establishment ...,False,[able to vote on domestic legislation that app...
1,Imperialism,Imperialism is less often associated with whic...,The principles of imperialism are often genera...,True,[]
2,Economic_inequality,What issues can't prevent women from working o...,"When a person’s capabilities are lowered, they...",True,[]
3,Southern_California,"What county are Los Angeles, Orange, San Diego...","Its counties of Los Angeles, Orange, San Diego...",True,[]
4,French_and_Indian_War,When was the deportation of Canadians?,Britain gained control of French Canada and Ac...,True,[]
...,...,...,...,...,...
95,Geology,"In the layered Earth model, what is the inner ...",Seismologists can use the arrival times of sei...,True,[]
96,Prime_number,What type of value would the Basel function ha...,The zeta function is closely related to prime ...,True,[]
97,"Fresno,_California",What does the San Joaquin Valley Railroad cros...,Passenger rail service is provided by Amtrak S...,True,[]
98,Victoria_(Australia),What party rules in Melbourne's inner regions?,"The centre-left Australian Labor Party (ALP), ...",False,"[The Greens, Australian Greens, Greens]"


# Prepare the data for training (fine-tuning). Add answer to the context. Task 11

In [63]:
def dataframe_to_jsonl(df):
    def create_jsonl_entry(row):
        answer = row["answers"][0] if row["answers"] else "I don't know"
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"""Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'.
            Question: {row.question}\n\n
            Context: {row.context}\n\n
            Answer:\n""",
            },
            {"role": "assistant", "content": answer},
        ]
        return json.dumps({"messages": messages})

    jsonl_output = df.apply(create_jsonl_entry, axis=1)
    print(jsonl_output[1])
    return "\n".join(jsonl_output)

train_sample = get_diverse_sample(train_df, sample_size=100, random_state=42)

with open("100_train.jsonl", "w") as f:
    f.write(dataframe_to_jsonl(train_sample))






{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'.\n            Question: How much was the Labour majority reduced by?\n\n\n            Context: Labour improved its performance in 1987, gaining 20 seats and so reducing the Conservative majority from 143 to 102. They were now firmly re-established as the second political party in Britain as the Alliance had once again failed to make a breakthrough with seats. A merger of the SDP and Liberals formed the Liberal Democrats. Following the 1987 election, the National Executive Committee resumed disciplinary action against members of Militant, who remained in the party, leading to further expulsions of their activists and the two MPs who supported the group.\n\n\n            Answer:\n"}, {"role": "assistant", "content": "I don't know"}]}


'A procedural consequence of the establishment of the Scottish Parliament is that Scottish MPs sitting in the UK House of Commons are able to vote on domestic legislation that applies only to England, Wales and Northern Ireland – whilst English, Scottish, Welsh and Northern Irish Westminster MPs are unable to vote on the domestic legislation of the Scottish Parliament. This phenomenon is known as the West Lothian question and has led to criticism. Following the Conservative victory in the 2015 UK election, standing orders of the House of Commons were changed to give MPs representing English constituencies a new "veto" over laws only affecting England.'

# Context addition Task 11

In [35]:
def question_answer2(query, context):
        prompt = f"""
        Answer the query in brief and a catchy manner in about 100 characters
        {query}
        Use the additional information given in the context below to come up an answer
        {context}
        """
        inputs = tokenizer(prompt, return_tensors='pt')
        output = tokenizer.decode(
        original_model.generate(
            inputs["input_ids"], 
            max_new_tokens=200,
            )[0], 
            skip_special_tokens=True
            )
        log(f'Generation with Context added:\n{output}')
        return str(output)

# Sample outputs to show the model is coming up with responses with context

In [39]:
for i in range(3):
    df["answers"][i] = question_answer2(df["question"][i], df["context"][i])

Generation with Context added:
unable to vote on the domestic legislation of the Scottish Parliament
Generation with Context added:
Australia
Generation with Context added:
gender roles and customs


# Testing responses are working or not by injecting a custom prompt

In [41]:
df["question"][1] = "Who is Amitabh Bachchan?"
df["context"][1] = "Amitabh Bachchan, born in 1942, is an Indian film producer, television host, occasional playback singer and former politician, and actor who works in Hindi cinema. In a film career spanning over five decades, he has starred in more than 200 films. Bachchan is widely regarded as one of the most successful and influential actors in the history of Indian cinema"
df["answers"][1] = question_answer2(df["question"][1], df["context"][1])


Generation with Context added:
he is widely regarded as one of the most successful and influential actors in the history of Indian cinema


# Evaluation Task 12

In [57]:
# Evaluation framework with ROUGE and BERT_Score
# The comparision can be extended to GPT-4 outputs as reference 
# Additionally we can bertscore for Cosine Similarities.

def Evaluatemet(predictions, references):
    rouge = evaluate.load('rouge')
    results_rouge = rouge.compute(predictions=predictions,references=references)

    bleu = evaluate.load("bleu")
    results_bleuscore = bleu.compute(predictions=predictions,references=references)

    dash_line = '-'.join('' for x in range(100))
    log(dash_line)
    log(results_rouge)
    log(dash_line)
    log(results_bleuscore)



In [61]:
# While the idea to take the length of prediction in the reference text is not actually correct, this is just for demonstration purposes how the evaluation module will work
# More logical rules that keep the prediction and reference lengths the same for BLEU and ROUGE matrices will need to be implemented.
# ECSM implementation (BERTScore) can also be done

for i in range(3):
    df["answers"][i] = question_answer2(df["question"][i], df["context"][i])
    predictions = df["answers"][i]
    references = df["context"][i][0:len(predictions)]
    Evaluatemet(predictions, references)

Generation with Context added:
unable to vote on the domestic legislation of the Scottish Parliament
---------------------------------------------------------------------------------------------------
{'rouge1': 0.028985507246376812, 'rouge2': 0.0, 'rougeL': 0.028985507246376812, 'rougeLsum': 0.028985507246376812}
---------------------------------------------------------------------------------------------------
{'bleu': 0.0, 'precisions': [0.03389830508474576, 0.0, 0.0, 0.0], 'brevity_penalty': 0.9831936762627184, 'length_ratio': 0.9833333333333333, 'translation_length': 59, 'reference_length': 60}
Generation with Context added:
he is widely regarded as one of the most successful and influential actors in the history of Indian cinema
---------------------------------------------------------------------------------------------------
{'rouge1': 0.018867924528301886, 'rouge2': 0.0, 'rougeL': 0.018867924528301886, 'rougeLsum': 0.018867924528301886}
----------------------------------------