<a href="https://colab.research.google.com/github/KaifAhmad1/Agri-Llama/blob/main/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing Necessary Dependencies:**

In [1]:
!pip install -qU bitsandbytes
!pip install -qU trl
!pip install -qU transformers
!pip install -qU peft
!pip install -qU optimum
!pip install -qU datasets
!pip install -qU accelerate
!pip install -qU nltk
!pip install -qU rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

**Necessary Imports:**

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    pipeline,
    logging
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datasets import Dataset
from huggingface_hub import notebook_login
from google.colab import drive
import plotly.express as px
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

**Set Up Environment:**

In [4]:
notebook_login()
drive.mount('/content/drive')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Data:**

In [5]:
# Load data
file_path = '/content/drive/MyDrive/Network-QA-Dataset.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0,Questions,Answers,Context Info,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110
0,What is the scope of the technical specificati...,The scope of the technical specification is de...,"The technical specification, titled ""3GPP TS 2...",,,,,,,,...,,,,,,,,,,
1,Where can specifications and reports for the i...,Specifications and reports for the implementat...,,,,,,,,,...,,,,,,,,,,
2,What are the different restoration indicators ...,The document discusses various restoration ind...,,,,,,,,,...,,,,,,,,,,
3,What procedures are outlined for the restorati...,Procedures for the restoration of data in the ...,,,,,,,,,...,,,,,,,,,,
4,In which section can information about the res...,Information about the restoration of data in ...,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,"In the context of CAPIF deployment models, wha...","""NEF implements the CAPIF architecture"" means...",,,,,,,,,...,,,,,,,,,,
1267,"Explain the concept of ""Distributed deployment...","The ""Distributed deployment of the NEF complia...",,,,,,,,,...,,,,,,,,,,
1268,"According to Annex D, what is the document's a...",Annex D provides a table (Table D-1) that illu...,,,,,,,,,...,,,,,,,,,,
1269,What kind of information does Annex E (Configu...,Annex E specifies configuration data for CAPIF...,,,,,,,,,...,,,,,,,,,,


In [6]:
network_data = data[['Questions', 'Answers', 'Context Info']]
network_data

Unnamed: 0,Questions,Answers,Context Info
0,What is the scope of the technical specificati...,The scope of the technical specification is de...,"The technical specification, titled ""3GPP TS 2..."
1,Where can specifications and reports for the i...,Specifications and reports for the implementat...,
2,What are the different restoration indicators ...,The document discusses various restoration ind...,
3,What procedures are outlined for the restorati...,Procedures for the restoration of data in the ...,
4,In which section can information about the res...,Information about the restoration of data in ...,
...,...,...,...
1266,"In the context of CAPIF deployment models, wha...","""NEF implements the CAPIF architecture"" means...",
1267,"Explain the concept of ""Distributed deployment...","The ""Distributed deployment of the NEF complia...",
1268,"According to Annex D, what is the document's a...",Annex D provides a table (Table D-1) that illu...,
1269,What kind of information does Annex E (Configu...,Annex E specifies configuration data for CAPIF...,


In [7]:
def process_data_sample(example):
    # Extract relevant information from the example
    question = example['Questions']
    answer = example['Answers']
    context_info = example['Context Info']

    # Convert potential NaN values to empty string
    question = str(question)
    answer = str(answer)
    context_info = str(context_info) if pd.notna(context_info) else ""

    # Prepare the processed example for a Question Answering System
    processed_example = (
        "You are a Question Answering System designed to assist users with queries. "
        "Your capabilities include providing technical details, offering implementation guidance, "
        "and staying updated on telecommunications standards.\n\n"
        f"User Query:\n{question}\n\n"
        f"Answer:\n{answer}\n\n"
        f"Context Information:\n{context_info}"
    )
    return processed_example

In [8]:
# Create 'text' column in 'network_data' by applying 'process_data_sample' to each row's 'Questions', 'Answers', and 'Context Info'
network_data['text'] = network_data[['Questions', 'Answers', 'Context Info']].apply(lambda x: process_data_sample(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  network_data['text'] = network_data[['Questions', 'Answers', 'Context Info']].apply(lambda x: process_data_sample(x), axis=1)


In [9]:
for i in range(5):
  print(network_data['text'].iloc[i])

You are a Question Answering System designed to assist users with queries. Your capabilities include providing technical details, offering implementation guidance, and staying updated on telecommunications standards.

User Query:
What is the scope of the technical specification mentioned in the document?

Answer:
The scope of the technical specification is defined in Section 1, and it covers restoration procedures within the 3rd Generation Partnership Project (3GPP) for Core Network and Terminals, Release 18.

Context Information:
The technical specification, titled "3GPP TS 23.007 V18.2.0 (2023-06)," is a document developed within the 3rd Generation Partnership Project (3GPP). It falls under the Technical Specification Group Core Network and Terminals and specifically focuses on "Restoration procedures" as part of Release 18.The document outlines various aspects related to restoration procedures within the 3GPP framework. It covers a wide range of topics, including restoration indicat

In [10]:
# Split data
train_data, test_data = train_test_split(network_data, test_size=0.2, random_state=42)

In [11]:
# Create datasets for training and evaluation using the datasets library
network_train_data = Dataset.from_pandas(train_data)
network_test_data = Dataset.from_pandas(test_data)

In [12]:
network_train_data

Dataset({
    features: ['Questions', 'Answers', 'Context Info', 'text', '__index_level_0__'],
    num_rows: 1016
})

In [13]:
network_test_data

Dataset({
    features: ['Questions', 'Answers', 'Context Info', 'text', '__index_level_0__'],
    num_rows: 255
})

**Pretrained Model:**

In [14]:
model_name = 'mistralai/Mistral-7B-v0.1'

In [15]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [21]:
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Tokenization and Padding
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

In [23]:
# Assess pre-trained model on custom queries
custom_queries = [
    "Is the Emergency Service a subscription service?",
    "What considerations should the HSS follow during emergency registrations?",
    "What are the different restoration indicators discussed in the document?",
    "What procedures are outlined for the restoration of data in the VLR after a failure?",
    "In which section can information about the restoration of data in the GGSN be found?",
    "What is included in the DL NAS TRANSPORT message sent from the AMF to the UE?",
    "How is the SOR-CMCI configured and provided to the UE?",
    "In signal level enhanced network selection, what criteria must be met for the MS to select a PLMN?",
    "What is Minimization of Service Interruption (MINT), and when is it applicable?",
    "What corrective actions does the User Equipment (UE) take in case of a security check failure of System Operator's Roaming (SOR) information during automatic network selection mode?"
]

In [31]:
for query in custom_queries:
    # Tokenize and generate response
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    outputs = pretrained_model(**inputs)

    # Access the logits tensor and apply argmax
    logits = outputs.logits
    predicted_index = torch.argmax(logits[0]).item()

    # Decode the predicted index
    generated_text = tokenizer.decode(predicted_index, skip_special_tokens=True)

    # Print the result
    print(f"\nPre-trained Model Assessment - Custom Query:\n{query}\nGenerated Answer:\n{generated_text}\n")


Pre-trained Model Assessment - Custom Query:
Is the Emergency Service a subscription service?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
What considerations should the HSS follow during emergency registrations?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
What are the different restoration indicators discussed in the document?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
What procedures are outlined for the restoration of data in the VLR after a failure?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
In which section can information about the restoration of data in the GGSN be found?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
What is included in the DL NAS TRANSPORT message sent from the AMF to the UE?
Generated Answer:



Pre-trained Model Assessment - Custom Query:
How is the SOR-CMCI configured and provided to the UE?
Generated Answer:



Pre-trained Model Assessment - Custom 

In [27]:
for query in custom_queries:
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    outputs = pretrained_model(**inputs)
    print(outputs)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          [-3.1055e+00,  1.9102e+00,  3.0566e+00,  ...,  1.0752e+00,
           -1.4131e+00,  1.0336e+01],
          [-3.7227e+00,  2.5684e+00,  2.1113e+00,  ...,  8.0371e-01,
           -8.4229e-01,  1.0086e+01],
          [-1.9473e+00, -4.0527e-01,  7.1240e-01,  ...,  9.1260e-01,
           -5.1611e-01,  9.0156e+00]],

         [[-3.3936e-02, -2.0996e-02, -9.9487e-03,  ...,  1.0156e-01,
           -6.5625e-01, -1.1572e-01],
          [-1.3760e+00, -1.0088e+00, -1.9785e+00,  ..., -1.5391e+00,
           -2.6367e-01, -2.3906e+00],
          [ 2.4395e+00, -1.0391e+00,  3.1250e-01,  ...,  3.4961e-01,
            3.3545e-01,  4.0283e-01],
          ...,
          [-1.4463e+00, -1.6787e+00,  3.2031e-01,  ...,  2.9834e-01,
            3.0664e+00, -3.3813e-01],
          [-2.9414e+00, -1.3887e+00,  1.3369e+00,  ..., -7.5098e-01,
            2.7246e-01, -2.8491e-01],
          [-1.0342e+00,  1.2080e+00,  1.4463e+00,  ..., -1.590

In [28]:
for query in custom_queries:
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    print(inputs)

{'input_ids': tensor([[    1,  1691,   272, 16762,  9588,  5836,   264, 15400,  2372, 28804]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[    1,  1824,  1917,   697,  1023,   272,   382,  1383,  1372,  1938,
         11843, 24113,   697, 28804]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[    1,  1824,   460,   272,  1581, 27243,  4073,  3117,  9951,   297,
           272,  3248, 28804]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[    1,  1824, 15251,   460, 28637,   354,   272, 27243,   302,  1178,
           297,   272,   550, 28758, 28754,  1024,   264,  7719, 28804]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[    1,   560,   690,  4211,   541,  1871,   684,   272, 27243,   302,
          1178,   297,   272,   420, 28777, 15024,   347,  1419, 28804]]), 'attention_mask': ten

In [30]:
for query in custom_queries:
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    outputs = pretrained_model(**inputs)
    logits = outputs.logits
    predicted_index = torch.argmax(logits[0]).item()
    generated_text = tokenizer.decode(predicted_index)
    print(generated_text)













**LoRA and SFT**

In [None]:
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA Config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

In [None]:
# Training Arguments
training_arguments = TrainingArguments(
    output_dir='Mistral-Network-QnA-System',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim='paged_adamw_32bit',
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    save_strategy='epoch',
    logging_steps=10,
    save_steps=10,
    num_train_epochs=1,
    max_steps=200,
    fp16=True,
    warmup_ratio=0.05,
    push_to_hub=False,
)

In [None]:
# SFT Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=Dataset.from_pandas(network_train_data[['text']]),
    peft_config=peft_config,
    dataset_text_field='text',
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=264
)

In [None]:
# Train the model
trainer.train()

In [None]:
import plotly.express as px

# Extract training loss values
train_losses = [entry['loss'] for entry in trainer.state.log_history if 'loss' in entry]

# Create a plot
fig = px.line(x=range(1, len(train_losses) + 1), y=train_losses, title='Training Loss Over Steps',
              labels={'x': 'Steps', 'y': 'Training Loss'})

# Show the plot
fig.show()