<a href="https://colab.research.google.com/github/KaifAhmad1/Agri-Llama/blob/main/Mistral_7B_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Necessary Dependencies:**

In [1]:
!pip install -qU bitsandbytes
!pip install -qU trl
!pip install -qU transformers
!pip install -qU peft
!pip install -qU optimum
!pip install -qU datasets
!pip install -qU accelerate
!pip install -qU nltk
!pip install -qU rouge_score

**Necessary Imports:**

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
    logging
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
from huggingface_hub import notebook_login
from google.colab import drive
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

**Set Up Environment:**

In [3]:
notebook_login()
drive.mount('/content/drive')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Data:**

In [4]:
# Load data
file_path = '/content/drive/MyDrive/Network-QA-Dataset.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0,Questions,Answers,Context Info,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110
0,What is the scope of the technical specificati...,The scope of the technical specification is de...,"The technical specification, titled ""3GPP TS 2...",,,,,,,,...,,,,,,,,,,
1,Where can specifications and reports for the i...,Specifications and reports for the implementat...,,,,,,,,,...,,,,,,,,,,
2,What are the different restoration indicators ...,The document discusses various restoration ind...,,,,,,,,,...,,,,,,,,,,
3,What procedures are outlined for the restorati...,Procedures for the restoration of data in the ...,,,,,,,,,...,,,,,,,,,,
4,In which section can information about the res...,Information about the restoration of data in ...,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,How does the UE handle a security check failur...,If the security check fails and the UE is in a...,,,,,,,,,...,,,,,,,,,,
968,What happens if the UE is in the manual mode o...,If the UE is in manual mode or the current cho...,,,,,,,,,...,,,,,,,,,,
969,What information does the UDM provide to the S...,The UDM informs the SOR-AF about successful de...,,,,,,,,,...,,,,,,,,,,
970,What is described in Annex D (informative): Ch...,Annex D provides the change history of the spe...,,,,,,,,,...,,,,,,,,,,


In [5]:
network_data = data[['Questions', 'Answers', 'Context Info']]
network_data

Unnamed: 0,Questions,Answers,Context Info
0,What is the scope of the technical specificati...,The scope of the technical specification is de...,"The technical specification, titled ""3GPP TS 2..."
1,Where can specifications and reports for the i...,Specifications and reports for the implementat...,
2,What are the different restoration indicators ...,The document discusses various restoration ind...,
3,What procedures are outlined for the restorati...,Procedures for the restoration of data in the ...,
4,In which section can information about the res...,Information about the restoration of data in ...,
...,...,...,...
967,How does the UE handle a security check failur...,If the security check fails and the UE is in a...,
968,What happens if the UE is in the manual mode o...,If the UE is in manual mode or the current cho...,
969,What information does the UDM provide to the S...,The UDM informs the SOR-AF about successful de...,
970,What is described in Annex D (informative): Ch...,Annex D provides the change history of the spe...,


In [6]:
def process_data_sample(example):
    # Extract relevant information from the example
    question = example['Questions']
    answer = example['Answers']
    context_info = example['Context Info']

    # Convert potential NaN values to empty string
    question = str(question)
    answer = str(answer)
    context_info = str(context_info) if pd.notna(context_info) else ""

    # Prepare the processed example for a Question Answering System
    processed_example = (
        "You are a Question Answering System designed to assist users with queries. "
        "Your capabilities include providing technical details, offering implementation guidance, "
        "and staying updated on telecommunications standards.\n\n"
        f"User Query:\n{question}\n\n"
        f"Answer:\n{answer}\n\n"
        f"Context Information:\n{context_info}"
    )
    return processed_example

In [7]:
# Create 'text' column in 'network_data' by applying 'process_data_sample' to each row's 'Questions', 'Answers', and 'Context Info'
network_data['text'] = network_data[['Questions', 'Answers', 'Context Info']].apply(lambda x: process_data_sample(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  network_data['text'] = network_data[['Questions', 'Answers', 'Context Info']].apply(lambda x: process_data_sample(x), axis=1)


In [8]:
for i in range(5):
  print(network_data['text'].iloc[i])

You are a Question Answering System designed to assist users with queries. Your capabilities include providing technical details, offering implementation guidance, and staying updated on telecommunications standards.

User Query:
What is the scope of the technical specification mentioned in the document?

Answer:
The scope of the technical specification is defined in Section 1, and it covers restoration procedures within the 3rd Generation Partnership Project (3GPP) for Core Network and Terminals, Release 18.

Context Information:
The technical specification, titled "3GPP TS 23.007 V18.2.0 (2023-06)," is a document developed within the 3rd Generation Partnership Project (3GPP). It falls under the Technical Specification Group Core Network and Terminals and specifically focuses on "Restoration procedures" as part of Release 18.The document outlines various aspects related to restoration procedures within the 3GPP framework. It covers a wide range of topics, including restoration indicat

In [9]:
# Split data
train_data, test_data = train_test_split(network_data, test_size=0.2, random_state=42)

In [10]:
# Create datasets for training and evaluation using the datasets library
network_train_data = Dataset.from_pandas(train_data)
network_test_data = Dataset.from_pandas(test_data)

In [11]:
network_train_data

Dataset({
    features: ['Questions', 'Answers', 'Context Info', 'text', '__index_level_0__'],
    num_rows: 777
})

In [12]:
network_test_data

Dataset({
    features: ['Questions', 'Answers', 'Context Info', 'text', '__index_level_0__'],
    num_rows: 195
})

**Pretrained Model:**

In [13]:
model_name = 'mistralai/Mistral-7B-v0.1'

In [14]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [15]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Tokenization and Padding
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

In [17]:
# Function to calculate BLEU score
def calculate_bleu_score(reference, generated):
    reference_tokens = [reference.split()]
    generated_tokens = generated.split()
    return sentence_bleu(reference_tokens, generated_tokens)

In [18]:
# Function to calculate ROUGE scores
def calculate_rouge_scores(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

In [19]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Select the top 20 questions from the dataset
top_twenty_questions = network_data['Questions'].head(20).tolist()
actual_answers = network_data.loc[network_data['Questions'].isin(top_twenty_questions), 'Answers'].tolist()

# Initialize a list to store generated answers
pretrained_generated_answers = []

# Generate answers using the pre-trained model
for query in top_twenty_questions:
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    outputs = model(**inputs)

    generated_token_ids = outputs['logits'][0].argmax(dim=-1).tolist()
    generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

    pretrained_generated_answers.append(generated_text)

In [20]:
# Calculate BLEU scores
bleu_scores = [sentence_bleu([actual_answer.split()], generated_answer.split()) for actual_answer, generated_answer in zip(actual_answers, pretrained_generated_answers)]

# Calculate ROUGE scores
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [rouge_scorer_instance.score(actual_answer, generated_answer) for actual_answer, generated_answer in zip(actual_answers, pretrained_generated_answers)]

# Print the results
for i in range(20):
    print(f"\nQuestion:\n{top_twenty_questions[i]}\nActual Answer:\n{actual_answers[i]}\nGenerated Answer:\n{pretrained_generated_answers[i]}\n"
          f"BLEU Score: {bleu_scores[i]}\nROUGE Scores: {rouge_scores[i]}\n")


Question:
What is the scope of the technical specification mentioned in the document?
Actual Answer:
The scope of the technical specification is defined in Section 1, and it covers restoration procedures within the 3rd Generation Partnership Project (3GPP) for Core Network and Terminals, Release 18.
Generated Answer:
# is the best of the project assistanceification? in the tender?

BLEU Score: 1.2243949545562701e-155
ROUGE Scores: {'rouge1': Score(precision=0.7, recall=0.23333333333333334, fmeasure=0.35), 'rouge2': Score(precision=0.1111111111111111, recall=0.034482758620689655, fmeasure=0.05263157894736842), 'rougeL': Score(precision=0.5, recall=0.16666666666666666, fmeasure=0.25)}


Question:
Where can specifications and reports for the implementation of the 3GPP TM system be obtained?
Actual Answer:
Specifications and reports for the implementation of the 3GPP TM system should be obtained via the 3GPP Organizational Partners' Publications Offices, as mentioned in the document.
Gene

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


**LoRA and SFT**

In [21]:
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [22]:
# LoRA Config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

In [23]:
# Training Arguments
training_arguments = TrainingArguments(
    output_dir='Finetuned-Model',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim='paged_adamw_32bit',
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    save_strategy='epoch',
    logging_steps=10,
    save_steps=10,
    num_train_epochs=3,
    max_steps=200,
    fp16=True,
    warmup_ratio=0.05,
    push_to_hub=False,
)

In [24]:
# SFT Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=network_train_data,
    peft_config=peft_config,
    dataset_text_field='text',
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=264
)

Map:   0%|          | 0/777 [00:00<?, ? examples/s]



In [25]:
# Train the model
trainer.train()



Step,Training Loss
10,2.4285
20,1.5696
30,1.3322
40,1.2304
50,1.1906
60,1.1219
70,1.1068
80,1.091
90,1.0647
100,1.0776


Checkpoint destination directory Finetuned-Model/checkpoint-48 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Finetuned-Model/checkpoint-97 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Step,Training Loss
10,2.4285
20,1.5696
30,1.3322
40,1.2304
50,1.1906
60,1.1219
70,1.1068
80,1.091
90,1.0647
100,1.0776




TrainOutput(global_step=200, training_loss=1.1304764461517334, metrics={'train_runtime': 2674.8205, 'train_samples_per_second': 1.196, 'train_steps_per_second': 0.075, 'total_flos': 2.520032319892685e+16, 'train_loss': 1.1304764461517334, 'epoch': 4.1})

In [30]:
output_model_dir = "Mistral-finetuned-network-QnA"
trainer.model.save_pretrained(output_model_dir)

In [27]:
# Push your model to the Hub
model.push_to_hub("Mistral-finetuned-network-QnA")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kaifahmad/Mistral-finetuned-network-QnA/commit/14e49c38871e3c2996b1d9dad39fd4e0afc46fde', commit_message='Upload MistralForCausalLM', commit_description='', oid='14e49c38871e3c2996b1d9dad39fd4e0afc46fde', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
import plotly.graph_objects as go

# Extract training loss values
train_losses = [entry['loss'] for entry in trainer.state.log_history if 'loss' in entry]

# Create a Plotly figure
fig = go.Figure()

# Add a scatter plot for training loss
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_losses) + 1)),
    y=train_losses,
    mode='lines+markers',
    marker=dict(color='blue'),
    line=dict(color='blue', dash='solid'),
    name='Training Loss'
))

# Update layout
fig.update_layout(
    title='Training Loss Over Steps',
    xaxis=dict(title='Steps', showgrid=True),
    yaxis=dict(title='Training Loss', showgrid=True),
    showlegend=True,
    width=800,
    height=500
)

# Show the plot
fig.show()

In [32]:
# Assuming you have a fine-tuned model loaded in 'fine_tuned_model'
fine_tuned_model = 'Mistral-finetuned-network-QnA'

# Select the top 20 questions from the dataset
top_twenty_questions = network_data['Questions'].head(20).tolist()
actual_answers = network_data.loc[network_data['Questions'].isin(top_twenty_questions), 'Answers'].tolist()

# Initialize a list to store generated answers
fine_tuned_generated_answers = []

# Generate answers using the fine-tuned model
for query in top_twenty_questions:
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    outputs = fine_tuned_model(**inputs)

    generated_token_ids = outputs['logits'][0].argmax(dim=-1).tolist()
    generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

    fine_tuned_generated_answers.append(generated_text)

TypeError: 'str' object is not callable

In [None]:
# Calculate BLEU scores
bleu_scores = [sentence_bleu([actual_answer.split()], generated_answer.split()) for actual_answer, generated_answer in zip(actual_answers, fine_tuned_generated_answers)]

# Calculate ROUGE scores
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [rouge_scorer_instance.score(actual_answer, generated_answer) for actual_answer, generated_answer in zip(actual_answers, fine_tuned_generated_answers)]

# Print the results
for i in range(20):
    print(f"\nQuestion:\n{top_twenty_questions[i]}\nActual Answer:\n{actual_answers[i]}\nGenerated Answer:\n{fine_tuned_generated_answers[i]}\n"
          f"BLEU Score: {bleu_scores[i]}\nROUGE Scores: {rouge_scores[i]}\n")
