In [None]:
!pip install langchain openai crewai faiss-cpu langchain_openai
!pip install -U langchain-community
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib
!pip install -q trl

In [None]:
!python --version

Python 3.11.12


In [None]:
!pip freeze > requirements.txt

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

In [None]:
df = pd.DataFrame(lines,columns=['label','sentence'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 'yes'
  else:
    return 'no'

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.3, random_state=42)

In [None]:
rag_lines = X_train + " the sentence is procedural: " + y_train
print(rag_lines.values)

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
from crewai import Agent, Task, Crew
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from crewai.tools import tool

api_key=""
llm = ChatOpenAI(model="gpt-4o-mini",api_key=api_key)

documents = [Document(line) for line in rag_lines.values]
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)


vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings(api_key=api_key))


summarizer_prompt = PromptTemplate.from_template("""
Summarize the following procedural-related paragraph into one concise sentence:

"{input}"

Return only the summary sentence.
""")
summarizer = Agent(
    role="Summarizer",
    goal="Summarize procedural {input_content} text into a single sentence.",
    backstory="A skilled agent that condenses complex procedural text into short summaries.",
    llm=llm,
    prompt=summarizer_prompt,
    verbose=True,
)


few_shot_examples = []
for i in range(20):
  d = f"{{'input':{rag_lines.values[i]},'output':{y_train.values[i]}}}"
  few_shot_examples.append(d)

classifier_prompt = PromptTemplate.from_examples(
    examples=few_shot_examples,
    suffix="Sentence: {input}\n\nReply with 'Yes' if procedural, otherwise 'No'.",
    input_variables=["input"]
)
classifier = Agent(
    role="Classifier",
    goal="Classify sentence as procedural or not using few-shot examples.",
    backstory="Expert in detecting whether sentences are valid surgical procedures.",
    llm=llm,
    prompt=classifier_prompt,
    verbose=True,
)

contextual_prompt = PromptTemplate.from_template("""
Using the context below, decide if the given sentence is a valid procedural step.

Context:
{context}

Sentence:
"{input}"

Return 'Yes' or 'No' based on the evidence.
""")
retriever = vectorstore.as_retriever()

@tool("contextual_search")
def search_tool(query: str) -> str:
    """Retrieve relevant examples using semantic similarity search.
      Input should be a single sentence or short paragraph.
      Returns a list of closely related sentences from the reference dataset.
      Use this tool to verify, support, or challenge classification decisions by comparing with known labeled examples."""

    docs = retriever.invoke(query)

    formatted_docs = "\n\n".join([
        f"Document {i+1}:\n{'-'*40}\n{doc.page_content}"
        for i, doc in enumerate(docs)
    ])
    return formatted_docs
contextual = Agent(
    role="Contextual Validator",
    goal="Verify classifier output using retrieval-augmented context.",
    backstory="Uses surgical knowledge base to validate classification decisions.",
    llm=llm,
    tools=[search_tool],
    prompt=contextual_prompt,
    verbose=True,
)


validator_prompt = PromptTemplate.from_template("""
Given the results from classifier and context-check, validate final decision.

Classifier output: {input1}
Contextual output: {input2}

If both are 'Yes', return JSON: {{ "is_procedural": true }}
Otherwise return JSON: {{ "is_procedural": false }}
""")
validator = Agent(
    role="Final Validator",
    goal="Finalize and format the decision into a structured JSON. ",
    backstory="Ensures consistency between classifier and context output.",
    llm=llm,
    prompt=validator_prompt,
    verbose=True,
)


task1 = Task(
    agent=summarizer,
    description="Summarize procedural {input_content}.",
    expected_output="A concise single sentence summary of the input paragraph."
)

task2 = Task(
    agent=classifier,
    description="Classify summary as procedural or not using few-shot learning.",
    expected_output="'Yes' if procedural, otherwise 'No'."
)

task3 = Task(
    agent=contextual,
    description="Validate classifier decision using RAG-powered context.",
    expected_output="'Yes' if RAG context supports the classification, otherwise 'No'."
)

task4 = Task(
    agent=validator,
    description="Produce final structured JSON output.",
    expected_output='A JSON like: { "is_procedural": true } or { "is_procedural": false }'
)


crew = Crew(
    agents=[summarizer, classifier, contextual, validator],
    tasks=[task1, task2, task3, task4],
    verbose=True,
)





In [None]:
query = "We always proceed performing a 3-cm utility incision at the 5th intercostal space anteriorly of the latissimus dorsi"
inputs = {"input_content":query}


result = crew.kickoff(inputs=inputs)



In [None]:
import json
json.loads(result.raw)['is_procedural']


In [None]:
preds = []
for query in X_test.values:
  inputs = {"input_content":query}
  result = crew.kickoff(inputs=inputs)
  out = json.loads(result.raw)['is_procedural']
  if out in [True,False]:
    preds.append(out)
  else:
    preds.append(None)

In [None]:
df2 = pd.DataFrame(zip(preds,y_test.values),columns=['pred','label'])

In [None]:
df2.to_csv('my_preds.csv')

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('my_preds.csv',index_col=0)

In [None]:
df

In [None]:
df['pred'] = df['pred'].apply(lambda x : int(x))

In [None]:
df['label'] = df['label'].apply(lambda x : int(x == 'yes'))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(df['label'],df['pred']))

              precision    recall  f1-score   support

           0       0.68      0.31      0.42       157
           1       0.75      0.93      0.83       346

    accuracy                           0.74       503
   macro avg       0.71      0.62      0.63       503
weighted avg       0.73      0.74      0.70       503



In [None]:
df['pred'].value_counts()

In [None]:
df['label'].value_counts()

# Simple Multi Agents

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

In [None]:
df = pd.DataFrame(lines,columns=['label','sentence'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 'yes'
  else:
    return 'no'

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.3, random_state=42)

In [None]:
!export OPENAI_API_KEY=""

In [None]:
rag_lines = X_train + " the sentence is procedural: " + y_train
print(rag_lines.values)

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
from crewai import Agent, Task, Crew
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from crewai.tools import tool

api_key=""
llm = ChatOpenAI(model="gpt-4o-mini",api_key=api_key)



few_shot_examples = []
for i in range(50):
  d = f"{{'input':{rag_lines.values[i]},'output':{y_train.values[i]}}}"
  few_shot_examples.append(d)

classifier_prompt = PromptTemplate.from_examples(
    examples=few_shot_examples,
    suffix="Sentence: {input}\n\nReply with 'Yes' if procedural, otherwise 'No'.",
    input_variables=["input"]
)
classifier = Agent(
    role="Classifier",
    goal="Classify sentence {input_content} as procedural or not using few-shot examples.",
    backstory="Expert in detecting whether sentences are valid surgical procedures.",
    llm=llm,
    prompt=classifier_prompt,
    verbose=True,
)



validator_prompt = PromptTemplate.from_template("""
Given the results from classifier , validate final decision.

Classifier output: {input1}


If both are 'Yes', return JSON: {{ "is_procedural": true }}
Otherwise return JSON: {{ "is_procedural": false }}
""")
validator = Agent(
    role="Final Validator",
    goal="Finalize and format the decision into a structured JSON. ",
    backstory="Ensures consistency between classifier and context output.",
    llm=llm,
    prompt=validator_prompt,
    verbose=True,
)



task1 = Task(
    agent=classifier,
    description="Classify {input_content} as procedural or not using few-shot learning.",
    expected_output="'Yes' if procedural, otherwise 'No'."
)



task2 = Task(
    agent=validator,
    description="Produce final structured JSON output.",
    expected_output='A JSON like: { "is_procedural": true } or { "is_procedural": false }'
)

crew = Crew(
    agents=[ classifier,  validator],
    tasks=[task1, task2,],
    verbose=True,
)





In [None]:
query = "We always proceed performing a 3-cm utility incision at the 5th intercostal space anteriorly of the latissimus dorsi"
inputs = {"input_content":query}


result = crew.kickoff(inputs=inputs)



In [None]:
import json
json.loads(result.raw)['is_procedural']


In [None]:
preds = []
for query in X_test.values:
  inputs = {"input_content":query}
  result = crew.kickoff(inputs=inputs)
  out = json.loads(result.raw)['is_procedural']
  if out in [True,False]:
    preds.append(out)
  else:
    preds.append(None)

In [None]:
df2 = pd.DataFrame(zip(preds,y_test.values),columns=['pred','label'])

In [None]:
df2.to_csv('my_preds2.csv')

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('my_preds2.csv',index_col=0)

In [None]:
df

In [None]:
df['pred'] = df['pred'].apply(lambda x : int(x))

In [None]:
df['label'] = df['label'].apply(lambda x : int(x == 'yes'))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(df['label'],df['pred']))

              precision    recall  f1-score   support

           0       0.74      0.56      0.64       217
           1       0.81      0.91      0.86       458

    accuracy                           0.80       675
   macro avg       0.78      0.73      0.75       675
weighted avg       0.79      0.80      0.79       675



In [None]:
df['pred'].value_counts()

In [None]:
df['label'].value_counts()

# Simple Multi Agents Higher Few shot

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

In [None]:
df = pd.DataFrame(lines,columns=['label','sentence'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 'yes'
  else:
    return 'no'

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.3, random_state=42)

In [None]:
rag_lines = X_train + " the sentence is procedural: " + y_train
print(rag_lines.values)

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
from crewai import Agent, Task, Crew
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from crewai.tools import tool

api_key=""
llm = ChatOpenAI(model="gpt-4o-mini",api_key=api_key)



few_shot_examples = []
for i in range(150):
  d = f"{{'input':{rag_lines.values[i]},'output':{y_train.values[i]}}}"
  few_shot_examples.append(d)

classifier_prompt = PromptTemplate.from_examples(
    examples=few_shot_examples,
    suffix="Sentence: {input}\n\nReply with 'Yes' if procedural, otherwise 'No'.",
    input_variables=["input"]
)
classifier = Agent(
    role="Classifier",
    goal="Classify sentence {input_content} as procedural or not using few-shot examples.",
    backstory="Expert in detecting whether sentences are valid surgical procedures.",
    llm=llm,
    prompt=classifier_prompt,
    verbose=True,
)



validator_prompt = PromptTemplate.from_template("""
Given the results from classifier , validate final decision.

Classifier output: {input1}


If both are 'Yes', return JSON: {{ "is_procedural": true }}
Otherwise return JSON: {{ "is_procedural": false }}
""")
validator = Agent(
    role="Final Validator",
    goal="Finalize and format the decision into a structured JSON. ",
    backstory="Ensures consistency between classifier and context output.",
    llm=llm,
    prompt=validator_prompt,
    verbose=True,
)



task1 = Task(
    agent=classifier,
    description="Classify {input_content} as procedural or not using few-shot learning.",
    expected_output="'Yes' if procedural, otherwise 'No'."
)



task2 = Task(
    agent=validator,
    description="Produce final structured JSON output.",
    expected_output='A JSON like: { "is_procedural": true } or { "is_procedural": false }'
)


crew = Crew(
    agents=[ classifier,  validator],
    tasks=[task1, task2,],
    verbose=True,
)





In [None]:
query = "We always proceed performing a 3-cm utility incision at the 5th intercostal space anteriorly of the latissimus dorsi"
inputs = {"input_content":query}


result = crew.kickoff(inputs=inputs)



In [None]:
import json
json.loads(result.raw)['is_procedural']


In [None]:
preds = []
for query in X_test.values:
  inputs = {"input_content":query}
  result = crew.kickoff(inputs=inputs)
  out = json.loads(result.raw)['is_procedural']
  if out in [True,False]:
    preds.append(out)
  else:
    preds.append(None)

In [None]:
df2 = pd.DataFrame(zip(preds,y_test.values),columns=['pred','label'])

In [None]:
df2.to_csv('my_preds3.csv')

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('my_preds3.csv',index_col=0)

In [None]:
df

In [None]:
df['pred'] = df['pred'].apply(lambda x : int(x))

In [None]:
df['label'] = df['label'].apply(lambda x : int(x == 'yes'))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(df['label'],df['pred']))

              precision    recall  f1-score   support

           0       0.75      0.56      0.64       217
           1       0.81      0.91      0.86       458

    accuracy                           0.80       675
   macro avg       0.78      0.74      0.75       675
weighted avg       0.79      0.80      0.79       675



In [None]:
df['pred'].value_counts()

In [None]:
df['label'].value_counts()

# Multi Agent new version

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

In [None]:
df = pd.DataFrame(lines,columns=['label','sentence'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 'yes'
  else:
    return 'no'

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.3, random_state=42)

In [None]:
rag_lines = X_train + " the sentence is procedural: " + y_train
print(rag_lines.values)

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
len(X_train)

1575

In [None]:
from crewai import Agent, Task, Crew
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from crewai.tools import tool

api_key=""
llm = ChatOpenAI(model="gpt-4o-mini",api_key=api_key)

documents = [Document(line) for line in rag_lines.values]
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)

vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings(api_key=api_key))
few_shot_examples = []
for i in range(1575):
  d = f"{{'input':{rag_lines.values[i]},'output':{y_train.values[i]}}}"
  few_shot_examples.append(d)

classifier_prompt = PromptTemplate.from_examples(
    examples=few_shot_examples,
    suffix="Sentence: {input_content}\n\nReply with 'Yes' if procedural, otherwise 'No'.",
    input_variables=["input"]
)
classifier = Agent(
    role="Classifier",
    goal="Classify sentence {input_content} as procedural or not using few-shot examples.",
    backstory="Expert in detecting whether sentences are valid surgical procedures.",
    llm=llm,
    prompt=classifier_prompt,
    verbose=True,
)

task1 = Task(
    agent=classifier,
    description="Classify based on {input_content} input sentence as procedural or not using few-shot learning.",
    expected_output='A JSON like: { "is_procedural": true } or { "is_procedural": false }'
)




crew = Crew(
    agents=[ classifier],
    tasks=[task1],
    verbose=True,
)





In [None]:
query = "We always proceed performing a 3-cm utility incision at the 5th intercostal space anteriorly of the latissimus dorsi"
inputs = {"input_content":query}


result = crew.kickoff(inputs=inputs)



In [None]:
import json
json.loads(result.raw)['is_procedural']


In [None]:
len(X_test)

675

In [None]:
preds = []
for query in X_test.values:
  inputs = {"input_content":query}
  result = crew.kickoff(inputs=inputs)
  out = json.loads(result.raw)['is_procedural']
  if out in [True,False]:
    preds.append(out)
  else:
    preds.append(None)

In [None]:
df2 = pd.DataFrame(zip(preds,y_test.values),columns=['pred','label'])

In [None]:
df2

In [None]:
df2.to_csv('my_preds4.csv')

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('my_preds4.csv',index_col=0)

In [None]:
df

In [None]:
df['pred'] = df['pred'].apply(lambda x : int(x))

In [None]:
df['label'] = df['label'].apply(lambda x : int(x == 'yes'))

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(df['label'],df['pred']))

              precision    recall  f1-score   support

           0       0.81      0.45      0.58       217
           1       0.78      0.95      0.86       458

    accuracy                           0.79       675
   macro avg       0.80      0.70      0.72       675
weighted avg       0.79      0.79      0.77       675



In [None]:
df['pred'].value_counts()

Unnamed: 0_level_0,count
pred,Unnamed: 1_level_1
1,555
0,120


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,458
0,217


# Fine Tuning QLoRA

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

['label', 'sentence\n']

In [None]:
df = pd.DataFrame(lines,columns=['label','text'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 1
  else:
    return 0

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)

In [None]:
train_df = pd.concat([X_train,y_train],axis=1)
test_df = pd.concat([X_test,y_test],axis=1)
train_df

In [None]:
import os
os.environ["HF_TOKEN"] = ""

In [None]:


import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer


dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
})


def format_text(example):
    prompt = f"SENTENCE: {example['text']}\nIs it procedural?"
    label = "Yes" if example["label"] == 1 else "No"
    return {"text": f"{prompt}\n{label}"}

dataset = dataset.map(format_text)
dataset = dataset.remove_columns([col for col in dataset["train"].column_names if col != "text"])


model_name = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)


training_args = TrainingArguments(
    output_dir="./mistral-spks-finetune",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_total_limit=2,
    logging_steps=10
)




In [None]:

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],

    args=training_args,
)

trainer.train()

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm


def predict(example):
    prompt = f"SENTENCE: {example['text']}\nIs it procedural?"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    output = model.generate(**inputs, max_new_tokens=5)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    prediction = 1 if "Yes" in decoded else 0
    return prediction


y_true = test_df["label"].tolist()
y_pred = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    pred = predict(row)
    y_pred.append(pred)


acc = accuracy_score(y_true, y_pred)
print(f"\n Accuracy: {acc:.4f}\n")

print(" Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Non-Procedural", "Procedural"]))


  0%|          | 0/675 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/675 [00:01<12:22,  1.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/675 [00:01<09:52,  1.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/675 [00:02<07:54,  1.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 4/675 [00:02<06:50,  1.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 5/675 [00:03<06:17,  1.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/675 [00:03<06:02,  1.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/675 [00:04<06:11,  1.80it/s]Setting `pad_t


✅ Accuracy: 0.8948

📊 Classification Report:
                precision    recall  f1-score   support

Non-Procedural       0.89      0.76      0.82       217
    Procedural       0.90      0.96      0.93       458

      accuracy                           0.89       675
     macro avg       0.89      0.86      0.87       675
  weighted avg       0.89      0.89      0.89       675






In [None]:
test = X_test.reset_index()['text'][3]
prompt = f"SENTENCE: {test}\nIs it procedural?"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
output = model.generate(**inputs, max_new_tokens=5)
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
prediction = 1 if "Yes" in decoded else 0
prediction

# Fine Tuning QLoRA New Version

In [None]:
import pandas as pd

In [None]:
with open("SPKS.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [None]:
lines = list(map(lambda x : x.split('@'),lines))
lines.pop(0)

In [None]:
df = pd.DataFrame(lines,columns=['label','text'])

In [None]:
df['label'].value_counts()

In [None]:
def change_label(x):
  if x == '__label__1':
    return 1
  else:
    return 0

In [None]:
df['label'] = df['label'].apply(change_label)

In [None]:
df

In [None]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)

In [None]:
train_df = pd.concat([X_train,y_train],axis=1)
test_df = pd.concat([X_test,y_test],axis=1)
train_df

In [None]:
import os
os.environ["HF_TOKEN"] = ""

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
})


def format_text(example):
    prompt = f"SENTENCE: {example['text']}\nIs it procedural?"
    label = "Yes" if example["label"] == 1 else "No"
    return {"text": f"{prompt}\n{label}"}

dataset = dataset.map(format_text)
dataset = dataset.remove_columns([col for col in dataset["train"].column_names if col != "text"])


model_name = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)




In [None]:
from transformers import EarlyStoppingCallback

In [None]:


training_args = TrainingArguments(
    output_dir="./mistral-spks-finetune",
    num_train_epochs=7,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    logging_steps=10,
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.05,

     eval_strategy ="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_loss"
)


In [None]:

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],

    args=training_args,
     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm


def predict(example):
    prompt = f"SENTENCE: {example['text']}\nIs it procedural?"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
    output = model.generate(**inputs, max_new_tokens=5)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)


    prediction = 1 if "Yes" in decoded else 0
    return prediction


y_true = test_df["label"].tolist()
y_pred = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    pred = predict(row)
    y_pred.append(pred)


acc = accuracy_score(y_true, y_pred)
print(f"\n Accuracy: {acc:.4f}\n")

print(" Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Non-Procedural", "Procedural"]))


  0%|          | 0/675 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/675 [00:00<10:05,  1.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/675 [00:01<07:36,  1.48it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/675 [00:01<06:16,  1.78it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 4/675 [00:02<05:27,  2.05it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 5/675 [00:02<05:06,  2.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/675 [00:03<04:58,  2.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/675 [00:03<05:00,  2.23it/s]Setting `pad_t


✅ Accuracy: 0.8963

📊 Classification Report:
                precision    recall  f1-score   support

Non-Procedural       0.91      0.76      0.82       217
    Procedural       0.89      0.96      0.93       458

      accuracy                           0.90       675
     macro avg       0.90      0.86      0.88       675
  weighted avg       0.90      0.90      0.89       675






In [None]:
test = X_test.reset_index()['text'][3]
prompt = f"SENTENCE: {test}\nIs it procedural?"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
output = model.generate(**inputs, max_new_tokens=5)
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
prediction = 1 if "Yes" in decoded else 0
prediction