In [5]:
import pandas as pd

# Load the raw training split.
# quoting=3 is csv.QUOTE_NONE (quote characters are kept as ordinary text) and
# on_bad_lines='skip' drops rows whose field count does not match the header.
try:
    train = pd.read_csv('train.csv', delimiter=',', quoting=3, on_bad_lines='skip')
except pd.errors.ParserError as e:
    # Nothing was parsed, so there is nothing to save; report the problem
    # instead of failing below with a NameError on an unbound `train`.
    print("Error loading file:", e)
else:
    # Save cleaned file for debugging
    train.to_csv('train_cleaned.csv', index=False)


In [7]:
import pandas as pd

# Load the raw validation split.
# NOTE: the frame is bound to the name `train`, so after this cell `train`
# holds the rows of valid.csv, not those of train.csv.
# quoting=3 is csv.QUOTE_NONE (quote characters are kept as ordinary text) and
# on_bad_lines='skip' drops rows whose field count does not match the header.
try:
    train = pd.read_csv('valid.csv', delimiter=',', quoting=3, on_bad_lines='skip')
except pd.errors.ParserError as e:
    # Nothing was parsed, so there is nothing to save; report the problem
    # instead of writing a stale or unbound frame to valid_cleaned.csv.
    print("Error loading file:", e)
else:
    # Save cleaned file for debugging
    train.to_csv('valid_cleaned.csv', index=False)


In [8]:
# Show the column index first, then the first rows, of the frame named `train`.
for preview in (train.columns, train.head()):
    print(preview)


Index(['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx',
       'utterance', 'selfeval', 'tags'],
      dtype='object')
        conv_id  utterance_idx    context  \
0  hit:3_conv:6              1  terrified   
1  hit:3_conv:6              3  terrified   
2  hit:3_conv:6              5  terrified   
3  hit:4_conv:9              1  surprised   
4  hit:4_conv:9              3  surprised   

                                              prompt  speaker_idx  \
0  Today_comma_as i was leaving for work in the m...            6   
1  Today_comma_as i was leaving for work in the m...            6   
2  Today_comma_as i was leaving for work in the m...            6   
3  I was walking through my hallway a few week ag...            8   
4  I was walking through my hallway a few week ag...            8   

                                           utterance     selfeval tags  
0  Today_comma_as i was leaving for work in the m...  4|5|5_5|5|5  NaN  
1  Yeah_comma_i'm doing alright no

In [9]:
# Read the three cleaned splits back from disk
train_cleaned = pd.read_csv('train_cleaned.csv')
test_cleaned = pd.read_csv('test_cleaned.csv')
valid_cleaned = pd.read_csv('valid_cleaned.csv')

# Print a heading, the info() summary and the first rows for each split
for heading, frame in (
    ("Train Data:", train_cleaned),
    ("Test Data:", test_cleaned),
    ("Validation Data:", valid_cleaned),
):
    print(heading)
    # info() writes its summary itself and returns None, which print() echoes
    print(frame.info())
    print(frame.head())


Train Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84169 entries, 0 to 84168
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   conv_id        84169 non-null  object
 1   utterance_idx  84169 non-null  int64 
 2   context        84169 non-null  object
 3   prompt         84169 non-null  object
 4   speaker_idx    84169 non-null  int64 
 5   utterance      84169 non-null  object
 6   selfeval       84169 non-null  object
 7   tags           755 non-null    object
dtypes: int64(2), object(6)
memory usage: 5.1+ MB
None
        conv_id  utterance_idx      context  \
0  hit:0_conv:1              1  sentimental   
1  hit:0_conv:1              2  sentimental   
2  hit:0_conv:1              3  sentimental   
3  hit:0_conv:1              4  sentimental   
4  hit:0_conv:1              5  sentimental   

                                              prompt  speaker_idx  \
0  I remember going to the fireworks with m

In [10]:
import pandas as pd

# Read each cleaned split
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
valid = pd.read_csv('valid_cleaned.csv')

# Stack the splits in train, test, valid order under a fresh 0..n-1 index
all_splits = [train, test, valid]
combined_data = pd.concat(all_splits, ignore_index=True)

# Persist the stacked frame without the index column
combined_data.to_csv('combined_cleaned_data.csv', index=False)

# Show the first rows
print(combined_data.head())


        conv_id  utterance_idx      context  \
0  hit:0_conv:1              1  sentimental   
1  hit:0_conv:1              2  sentimental   
2  hit:0_conv:1              3  sentimental   
3  hit:0_conv:1              4  sentimental   
4  hit:0_conv:1              5  sentimental   

                                              prompt  speaker_idx  \
0  I remember going to the fireworks with my best...            1   
1  I remember going to the fireworks with my best...            0   
2  I remember going to the fireworks with my best...            1   
3  I remember going to the fireworks with my best...            0   
4  I remember going to the fireworks with my best...            1   

                                           utterance     selfeval tags  
0  I remember going to see the fireworks with my ...  5|5|5_2|2|5  NaN  
1  Was this a friend you were in love with_comma_...  5|5|5_2|2|5  NaN  
2                This was a best friend. I miss her.  5|5|5_2|2|5  NaN  
3         

In [11]:
import pandas as pd
import re

# Read the cleaned splits in train, test, valid order
split_frames = [
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv', 'valid_cleaned.csv')
]
train, test, valid = split_frames

# Stack them into a single frame with a fresh 0..n-1 index
combined_data = pd.concat(split_frames, ignore_index=True)

# Step 1: Preprocess Text
def clean_text(text):
    """Normalise one utterance: lowercase it and strip punctuation.

    The literal token ``_comma_`` (used in the source data in place of a
    comma) is replaced by a single space first. It would otherwise survive
    the punctuation filter, because ``_`` counts as a word character for
    ``\\w``.

    :param text: Raw utterance. ``None`` and float NaN (how pandas represents
        a missing cell) are treated as empty; any other non-string value is
        converted with ``str``.
    :return: Lowercased text without punctuation and without leading or
        trailing whitespace; ``""`` for missing values.
    """
    # Missing cells must not crash the whole .apply() pass.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Swallow the whitespace around the placeholder so exactly one space is left.
    text = re.sub(r'\s*_comma_\s*', ' ', text)
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()  # Remove leading/trailing spaces

# Clean every utterance, then write the result back into the same column
utterances_clean = combined_data['utterance'].apply(clean_text)
combined_data['utterance'] = utterances_clean

# Overwrite the combined CSV with the cleaned text (index column omitted)
combined_data.to_csv('combined_cleaned_data.csv', index=False)

print("Text cleaning completed and combined dataset saved!")



Text cleaning completed and combined dataset saved!


In [12]:
import os
import xml.etree.ElementTree as ET

# Path to the MedQuAD dataset folder
medquad_path = 'MedQuAD-master'

# List everything inside the MedQuAD directory (folders and plain files alike)
folders = os.listdir(medquad_path)

# Print the entry names
print(folders)

# os.listdir returns entries in arbitrary order and includes plain files, so
# folders[0] is not guaranteed to be a directory. Pick the first sub-directory
# in sorted order, then the first .xml file inside it.
collection_dirs = sorted(
    name for name in folders
    if os.path.isdir(os.path.join(medquad_path, name))
)
if not collection_dirs:
    raise FileNotFoundError(f"No sub-directories found in {medquad_path!r}")

sample_dir = os.path.join(medquad_path, collection_dirs[0])
xml_names = sorted(
    name for name in os.listdir(sample_dir)
    if name.lower().endswith('.xml')
)
if not xml_names:
    raise FileNotFoundError(f"No .xml files found in {sample_dir!r}")

sample_file = os.path.join(sample_dir, xml_names[0])

# Parse XML
tree = ET.parse(sample_file)
root = tree.getroot()

# Display the root tag and, when the root has children, its first child
print(root.tag)
if len(root):
    print(root[0].tag, root[0].text)


['10_MPlus_ADAM_QA', '11_MPlusDrugs_QA', '12_MPlusHerbsSupplements_QA', '1_CancerGov_QA', '2_GARD_QA', '3_GHR_QA', '4_MPlus_Health_Topics_QA', '5_NIDDK_QA', '6_NINDS_QA', '7_SeniorHealth_QA', '8_NHLBI_QA_XML', '9_CDC_QA', 'LICENSE.txt', 'QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip', 'readme.txt']
Document
Focus A guide to clinical trials for cancer


In [21]:
import pandas as pd
import xml.etree.ElementTree as ET
import os

# Path to the MedQuAD dataset folder
medquad_path = 'MedQuAD-master'

# Collection folders to scan
folders = ['10_MPlus_ADAM_QA', '11_MPlusDrugs_QA', '12_MPlusHerbsSupplements_QA',
           '1_CancerGov_QA', '2_GARD_QA', '3_GHR_QA', '4_MPlus_Health_Topics_QA',
           '5_NIDDK_QA', '6_NINDS_QA', '7_SeniorHealth_QA', '8_NHLBI_QA_XML', '9_CDC_QA']

# Collect one dict per question/answer pair and build the frame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic.
qa_records = []

for folder in folders:
    folder_path = os.path.join(medquad_path, folder)
    # Sorted so the row order of the output file is reproducible.
    files = sorted(os.listdir(folder_path))
    for file in files:
        xml_file = os.path.join(folder_path, file)

        try:
            root = ET.parse(xml_file).getroot()
        except ET.ParseError as e:
            # One malformed file must not abort the whole pass.
            print(f"Parse error in file {xml_file}: {e}")
            continue

        # The earlier lookup, root.find('question') / root.find('answer'),
        # only matches direct children of the root with those exact lowercase
        # tags. Look for <QAPair> elements at any depth and read their
        # <Question> / <Answer> children instead.
        for qa_pair in root.iter('QAPair'):
            question = (qa_pair.findtext('Question') or '').strip()
            answer = (qa_pair.findtext('Answer') or '').strip()

            # Keep only pairs where both sides have text.
            if question and answer:
                qa_records.append({'question': question, 'answer': answer})

# Explicit columns keep the CSV header even when nothing was collected.
medquad_data = pd.DataFrame(qa_records, columns=['question', 'answer'])

# Save combined MedQuAD data to CSV
medquad_data.to_csv('medquad_combined_cleaned.csv', index=False)

print("MedQuAD dataset parsed and combined with Questions and Answers!")


MedQuAD dataset parsed and combined with Questions and Answers!


In [27]:
import pandas as pd
import xml.etree.ElementTree as ET
import os

# Path to the MedQuAD dataset folder
medquad_path = 'MedQuAD-master'

# Collection folders to scan
folders = ['10_MPlus_ADAM_QA', '11_MPlusDrugs_QA', '12_MPlusHerbsSupplements_QA',
           '1_CancerGov_QA', '2_GARD_QA', '3_GHR_QA', '4_MPlus_Health_Topics_QA',
           '5_NIDDK_QA', '6_NINDS_QA', '7_SeniorHealth_QA', '8_NHLBI_QA_XML', '9_CDC_QA']

# Collect plain dicts and build the DataFrame once at the end; calling
# pd.concat for every pair copies the whole frame each time (quadratic).
qa_records = []

for folder in folders:
    folder_path = os.path.join(medquad_path, folder)
    # Sorted so the row order of the output file is reproducible.
    files = sorted(os.listdir(folder_path))
    for file in files:
        xml_file = os.path.join(folder_path, file)

        try:
            tree = ET.parse(xml_file)
        except ET.ParseError as e:
            # Report the malformed file and carry on with the rest.
            print(f"Parse error in file {xml_file}: {e}")
            continue

        root = tree.getroot()

        # Iterate over all <QAPair> elements
        for qa_pair in root.findall('QAPairs/QAPair'):
            # findtext gives None for a missing element and '' for an empty one.
            question = (qa_pair.findtext('Question') or '').strip()
            answer = (qa_pair.findtext('Answer') or '').strip()

            # Keep only pairs where both sides have text.
            if question and answer:
                qa_records.append({'question': question, 'answer': answer})

# Explicit columns keep the CSV header even when nothing was collected.
medquad_data = pd.DataFrame(qa_records, columns=['question', 'answer'])

# Save combined MedQuAD data to CSV
medquad_data.to_csv('medquad_combined_cleaned.csv', index=False)

print("MedQuAD dataset parsed and combined with Questions and Answers!")


Parse error in file MedQuAD-master\1_CancerGov_QA\0000001_1.xml: XML or text declaration not at start of entity: line 1, column 1
MedQuAD dataset parsed and combined with Questions and Answers!


In [1]:
# Import required libraries
import pandas as pd

# Empathetic dialogue data: keep 'context' and 'utterance', rename them to the
# shared schema, and tag every row with its origin.
empathetic_df = (
    pd.read_csv("combined_cleaned_data.csv")
    .loc[:, ['context', 'utterance']]
    .rename(columns={'context': 'input', 'utterance': 'response'})
    .assign(source='empathetic')
)

# Medical Q&A data: same treatment for 'question' and 'answer'.
medical_df = (
    pd.read_csv("medquad_combined_cleaned.csv")
    .loc[:, ['question', 'answer']]
    .rename(columns={'question': 'input', 'answer': 'response'})
    .assign(source='medical')
)

# Stack both sources, then shuffle with a fixed seed and renumber the rows
stacked_df = pd.concat([empathetic_df, medical_df], ignore_index=True)
combined_df = stacked_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the shuffled knowledge base to disk
combined_df.to_csv("combined_knowledge_base.csv", index=False)

print("Datasets combined successfully and saved as 'combined_knowledge_base.csv'.")


Datasets combined successfully and saved as 'combined_knowledge_base.csv'.


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the combined dataset
combined_data = pd.read_csv('combined_knowledge_base.csv')  # Replace with your file path

# Rows whose 'input' is missing load as float NaN and cannot be embedded as
# text, so exclude them before sampling.
embeddable_data = combined_data.dropna(subset=['input'])

# Randomly sample a fixed number of rows. DataFrame.sample raises when n is
# larger than the population (sampling without replacement), so cap the
# request at the number of rows actually available.
subset_size = 30000  # Change this value to the desired number of rows
sample_size = min(subset_size, len(embeddable_data))
subset_data = embeddable_data.sample(n=sample_size, random_state=42)  # For reproducibility

# Load the Sentence Transformer model
model = SentenceTransformer('all-mpnet-base-v2')  # Replace with other models if needed

# Generate embeddings for the "input" column (cast to str so every item is text)
text_data = subset_data['input'].astype(str).tolist()
embeddings = model.encode(text_data, show_progress_bar=True, batch_size=32)

# Save the raw embedding matrix, and keep a per-row copy next to the metadata
np.save('subset_knowledge_base_embeddings.npy', embeddings)  # Save embeddings as .npy
subset_data['embedding'] = embeddings.tolist()

# Save the subset with embeddings for reference
subset_data.to_csv('subset_knowledge_base_with_embeddings.csv', index=False)

print("Embeddings for the subset generated and saved successfully!")


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

Embeddings for the subset generated and saved successfully!


In [4]:
import faiss

# Width of one embedding vector (second axis of the embedding matrix)
embedding_dim = embeddings.shape[1]

# Build a flat L2 index of that width and add every embedding to it
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Persist the index, then report success
faiss.write_index(index, 'subset_knowledge_base_faiss_index.bin')
print("FAISS index created and saved successfully!")


FAISS index created and saved successfully!


In [6]:
# Embed one sample query (wrapped in a list, so the result has one row)
query = "What are the symptoms of a fever?"
query_embedding = model.encode([query])

# Ask the index for the 5 nearest stored vectors: D = distances, I = row ids
D, I = index.search(query_embedding, k=5)

# Show the matching knowledge-base rows
print("\nQuery Results:")
for idx in I[0]:
    hit = subset_data.iloc[idx]  # positional lookup, fetched once per result
    print("Retrieved Input:", hit['input'])
    print("Retrieved Response:", hit['response'])
    print("Retrieved Source:", hit['source'])
    print("-" * 50)  # Separator for readability



Query Results:
Retrieved Input: What are the symptoms of Typhoid Fever ?
Retrieved Response: Persons with typhoid fever usually have a sustained fever as high as 103° to 104° F (39° to 40° C). They may also feel weak, or have stomach pains, headache, or loss of appetite. In some cases, patients have a rash of flat, rose-colored spots. The only way to know for sure if an illness is typhoid fever is to have samples of stool or blood tested for the presence of Salmonella Typhi.

Typhoid fever’s danger doesn’t end when symptoms disappear: 

Even if your symptoms seem to go away, you may still be carrying Salmonella Typhi. If so, the illness could return, or you could pass the disease to other people. In fact, if you work at a job where you handle food or care for small children, you may be barred legally from going back to work until a doctor has determined that you no longer carry any typhoid bacteria.
                
If you are being treated for typhoid fever, it is important to do the

In [7]:
import faiss
import pandas as pd

# Locations of the saved index and of the metadata CSV that goes with it
faiss_index_path = "subset_knowledge_base_faiss_index.bin"
metadata_path = "subset_knowledge_base_with_embeddings.csv"

# Read both artifacts back from disk
faiss_index = faiss.read_index(faiss_index_path)
metadata_df = pd.read_csv(metadata_path)

print("FAISS index and metadata loaded successfully!")


FAISS index and metadata loaded successfully!


In [8]:
# Sanity check: the index width and the query vector width should agree
index_dim = faiss_index.d
query_dim = query_embedding.shape[1]
print("FAISS index dimension:", index_dim)
print("Query embedding shape:", query_dim)


FAISS index dimension: 768
Query embedding shape: 768


In [5]:
pip install tensorflow


Note: you may need to restart the kernel to use updated packages.Collecting tensorflow
  Using cached tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Using cached tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Using cached flatbuffers-24.12.23-py2.py3-none-any.whl.metadata (876 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Using cached google_pasta-0.2.0-py3-no

In [7]:
pip install tf-keras


Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 1.5 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 1.5 MB/s eta 0:00:01
   ------------------------ --------------- 1.0/1.7 MB 1.7 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.7 MB/s eta 0:00:01
   ------------------------------------ --- 1.6/1.7 MB 1.5 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 1.2 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.18.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import faiss
import pandas as pd
import numpy as np
import traceback
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM

# Step 1: Sentence encoder used to embed incoming queries
embed_model = SentenceTransformer('all-mpnet-base-v2')

# Step 2: Retrieval artifacts -- both files must already exist on disk
faiss_index_path = "subset_knowledge_base_faiss_index.bin"
metadata_path = "subset_knowledge_base_with_embeddings.csv"

faiss_index = faiss.read_index(faiss_index_path)  # vector index
metadata_df = pd.read_csv(metadata_path)          # rows looked up by position

print("FAISS index and metadata loaded successfully!")

# Step 3: T5 tokenizer and TensorFlow seq2seq model, read from a local folder
# only (local_files_only=True, so no download is attempted)
local_model_path = r"C:\Users\Kaushik\Desktop\baymax_personal"  # Folder containing the model files
offline_only = dict(local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(local_model_path, legacy=True, **offline_only)
model = TFAutoModelForSeq2SeqLM.from_pretrained(local_model_path, from_pt=False, **offline_only)

# Step 4: Define the RAG pipeline function
def generate_response(query, top_k=5):
    """
    Generates a response for a given query using the RAG pipeline.

    The query is embedded with ``embed_model``, the ``top_k`` nearest entries
    are fetched from ``faiss_index``, the matching ``metadata_df`` rows are
    formatted into a context string, and ``model`` generates an answer from
    the query plus that context.

    :param query: Query string
    :param top_k: Number of top documents to retrieve
    :return: Tuple ``(response, retrieved_context)``. If any step raises,
        the first item is an error message that includes the traceback and
        the second item is ``None``.
    """
    try:
        # Step 1: Convert query to a (1, dim) float32 matrix, the layout the
        # index search expects.
        query_embedding = embed_model.encode(query, convert_to_tensor=False)
        query_matrix = np.asarray([query_embedding], dtype="float32")

        # Step 2: Search FAISS index
        distances, indices = faiss_index.search(query_matrix, top_k)

        # Step 3: Retrieve relevant context from metadata.
        # FAISS pads the result with -1 when it has fewer than top_k hits.
        # A plain "idx < len" check lets -1 through, and iloc[-1] would then
        # silently return the LAST row, so check both bounds.
        row_count = len(metadata_df)
        context_blocks = []
        for idx in indices[0]:
            idx = int(idx)
            if not 0 <= idx < row_count:
                continue
            row = metadata_df.iloc[idx]  # fetch the row once, not once per field
            context_blocks.append(
                f"Input: {row['input']}\nResponse: {row['response']}\nSource: {row['source']}"
            )
        retrieved_context = "\n".join(context_blocks)

        # Step 4: Generate response with T5. The prompt is truncated to 512
        # tokens, so context beyond that limit is dropped.
        input_text = f"Query: {query}\nContext: {retrieved_context}\nAnswer:"
        inputs = tokenizer(input_text, return_tensors="tf", max_length=512, truncation=True)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return response, retrieved_context
    except Exception as e:
        # Best-effort API: report the failure to the caller instead of raising.
        return f"An error occurred: {e}\n{traceback.format_exc()}", None

# Step 5: Test the RAG pipeline
if __name__ == "__main__":
    print("RAG pipeline setup complete!")

    # Run a single example query through the pipeline
    example_query = "What is the role of artificial intelligence in healthcare?"
    response, context = generate_response(example_query, top_k=5)

    # Print each result under its own heading
    for heading, payload in (
        ("\nGenerated Response:", response),
        ("\nRetrieved Context:", context),
    ):
        print(heading)
        print(payload)


FAISS index and metadata loaded successfully!



All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at C:\Users\Kaushik\Desktop\baymax_personal.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


RAG pipeline setup complete!

Generated Response:
Use a service such as Meals on Wheels, which will bring meals right to your home. For more information, check your local phone book, or contact the Meals on Wheels organization at 1 (888) 998-6325.

Retrieved Context:
Input: Do you have information about Patient Safety
Response: Summary : You can help prevent medical errors by being an active member of your health care team. Research shows that patients who are more involved with their care tend to get better results. To reduce the risk of medical errors, you can       - Ask questions if you have doubts or concerns. Take a relative or friend to your doctor appointment to help you ask questions and understand answers.    - Make sure you understand what will happen if you need surgery    - Tell your health care providers about all the medicines you take, including over-the-counter drugs and dietary supplements. Tell them if you have any allergies or bad reactions to anesthesia. Make sure 

In [11]:
if __name__ == "__main__":
    print("Testing RAG pipeline...")

    # Queries to run through the pipeline, in order
    queries = [
        "What are the benefits of AI in healthcare?",
        "Explain the role of machine learning in modern technology.",
        "How does natural language processing work?",
    ]

    for query in queries:
        # Announce the query before generating, so the heading comes first
        print(f"\nQuery: {query}")
        response, context = generate_response(query, top_k=3)

        # heading / value / heading / value, one item per line
        print("\nGenerated Response:", response, "\nRetrieved Context:", context, sep="\n")

    print("\nRAG pipeline setup and testing complete!")


Testing RAG pipeline...

Query: What are the benefits of AI in healthcare?

Generated Response:
Use a service such as Meals on Wheels, which will bring meals right to your home. For more information, check your local phone book, or contact the Meals on Wheels organization at 1 (888) 998-6325

Retrieved Context:
Input: what research (or clinical trials) is being done for Alzheimer's Disease ?
Response: Clinical research is medical research involving people. It includes clinical studies, which observe and gather information about large groups of people. It also includes clinical trials, which test a medicine, therapy, medical device, or intervention in people to see if it is safe and effective. Clinical trials are the best way to find out whether a particular intervention actually slows, delays, or prevents Alzheimers disease. Trials may compare a potential new treatment with a standard treatment or placebo (mock treatment). Or, they may study whether a certain behavior or condition affe

In [14]:
import pandas as pd
import json

# Read the combined knowledge base into `df`
combined_path = "combined_knowledge_base.csv"  # Update this path if needed
df = pd.read_csv(filepath_or_buffer=combined_path)

# Prepare input and output columns with refined preprocessing
def _field_text(value):
    """Return *value* as stripped text, or "" when it is missing or falsy."""
    # A missing CSV cell arrives as float NaN. NaN is truthy, so a bare
    # `value or ''` keeps it and str() turns it into the literal text "nan".
    if value is None or value is pd.NA or (isinstance(value, float) and value != value):
        return ""
    return str(value or '').strip()


def preprocess_row(row):
    """Build the (input, output) training pair for one knowledge-base row.

    :param row: Mapping-like row (pandas Series or dict) with a 'source' key
        and, optionally, 'input' and 'response'.
    :return: ``(input_text, output_text)``. The input is prefixed with
        "Context: " for source 'empathetic' and "Question: " for source
        'medical'. ``("", "")`` is returned for any other source, or when
        either field is missing/empty, so the caller's empty-string filter
        drops the row.
    :raises KeyError: If the row has no 'source' entry.
    """
    source = row['source']
    input_text = _field_text(row.get('input', ''))
    output_text = _field_text(row.get('response', ''))

    # Without this guard an empty input would become "Context: " or
    # "Question: " and slip past the empty-string filter.
    if not input_text or not output_text:
        return "", ""

    if source == 'empathetic':
        # Empathetic dialogue rows: the input is a context label
        return f"Context: {input_text}", output_text
    if source == 'medical':
        # Medical rows: the input is a question
        return f"Question: {input_text}", output_text

    # Unexpected or undefined source types are discarded
    return "", ""

# Run preprocess_row on every row and unzip the resulting (input, output) pairs
processed_pairs = df.apply(preprocess_row, axis=1)
df['input'], df['output'] = zip(*processed_pairs)

# Keep only rows where both the input and the output are non-empty
has_text = (df['input'] != "") & (df['output'] != "")
df = df[has_text]

# 80% of the rows (fixed seed) go to training; the remaining rows are validation
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

# Save datasets to JSON format
def save_to_json(dataframe, path):
    """Write the 'input' and 'output' columns of *dataframe* to *path*.

    The file holds a JSON array with one object per row, indented by four
    spaces.
    """
    pairs = dataframe.loc[:, ['input', 'output']]
    with open(path, 'w') as handle:
        handle.write(json.dumps(pairs.to_dict(orient='records'), indent=4))

# Write each split to its own JSON file
for split_frame, json_name in ((train_df, "train_data.json"), (val_df, "val_data.json")):
    save_to_json(split_frame, json_name)

print("Preprocessing complete! Train and validation datasets saved as 'train_data.json' and 'val_data.json'.")


Preprocessing complete! Train and validation datasets saved as 'train_data.json' and 'val_data.json'.


In [1]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
from datasets import load_dataset
import tensorflow as tf

# Paths and configuration.
# These are raw strings, so a backslash is already literal: writing "\\"
# inside r"..." puts TWO backslashes into the path. Single separators are
# used here, matching the model path used by the RAG cell.
train_data_path = r"C:\Users\Kaushik\Desktop\baymax_personal\train_data.json"
val_data_path = r"C:\Users\Kaushik\Desktop\baymax_personal\val_data.json"
# Folder holding the starting checkpoint.
# NOTE(review): the original comment said "pre-trained T5-small", while the
# output folder is named "fine_tuned_flan_t5" -- confirm which checkpoint this is.
model_path = r"C:\Users\Kaushik\Desktop\baymax_personal"
output_dir = r"C:\Users\Kaushik\Desktop\baymax_personal\fine_tuned_flan_t5"

# Load the JSON files as the "train" and "validation" splits
dataset = load_dataset("json", data_files={"train": train_data_path, "validation": val_data_path})

# Define the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = TFT5ForConditionalGeneration.from_pretrained(model_path)

# Preprocess the dataset
def preprocess_data(examples):
    """Tokenise a batch of examples into model inputs and labels.

    :param examples: Batch mapping with "input" and "output" lists.
    :return: Dict of plain lists: "input_ids", "attention_mask" and "labels".
        Both sides are padded/truncated to 128 tokens, and every label
        position equal to the pad token id is replaced with -100.
    """
    # None becomes an empty string; everything else is stringified, and the
    # input side additionally gets the "summarize: " prefix.
    inputs = []
    for x in examples["input"]:
        inputs.append("" if x is None else "summarize: " + str(x))
    targets = ["" if y is None else str(y) for y in examples["output"]]

    # Identical tokenizer settings for both sides
    tokenizer_options = dict(
        max_length=128, truncation=True, padding="max_length", return_tensors="np"
    )
    tokenized_inputs = tokenizer(inputs, **tokenizer_options)
    tokenized_targets = tokenizer(targets, **tokenizer_options)

    # Overwrite pad ids in the target ids with -100 (in place)
    labels = tokenized_targets["input_ids"]
    pad_positions = labels == tokenizer.pad_token_id
    labels[pad_positions] = -100

    return {
        "input_ids": tokenized_inputs["input_ids"].tolist(),
        "attention_mask": tokenized_inputs["attention_mask"].tolist(),
        "labels": labels.tolist(),
    }

# Tokenise both splits in batches; the original columns are removed so only
# the fields returned by preprocess_data remain.
train_split = dataset["train"]
tokenized_train = train_split.map(
    preprocess_data,
    batched=True,
    remove_columns=train_split.column_names,
)

val_split = dataset["validation"]
tokenized_val = val_split.map(
    preprocess_data,
    batched=True,
    remove_columns=val_split.column_names,
)

# Convert datasets to TensorFlow Datasets
def create_tf_dataset(tokenized_dataset):
    """Turn a tokenised dataset into a tf.data.Dataset of (features, labels).

    ``features`` is a dict with "input_ids" and "attention_mask"; ``labels``
    is the "labels" field. Every field becomes an int32 constant.
    """
    def as_int32(field):
        # Gather one field from every example into a single constant
        return tf.constant([example[field] for example in tokenized_dataset], dtype=tf.int32)

    features = {
        "input_ids": as_int32("input_ids"),
        "attention_mask": as_int32("attention_mask"),
    }
    labels = as_int32("labels")
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Training data: shuffle with a 1000-element buffer, then batch by 8
train_dataset = create_tf_dataset(tokenized_train)
train_dataset = train_dataset.shuffle(1000).batch(8)

# Validation data: batch by 8, no shuffling
val_dataset = create_tf_dataset(tokenized_val).batch(8)

# Debugging: print one training batch to check its structure
for batch in train_dataset.take(1):
    print(batch)

# Define optimizer and compile the model
# NOTE(review): the labels reach Keras as the target tensor, not inside the
# feature dict -- confirm the model's forward pass gets the decoder inputs it
# needs with this setup.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)

# Train the model. Save and report success only if training actually
# finished; otherwise a model that was never fine-tuned would be written to
# the output folder and announced as "Fine-tuning completed!".
try:
    model.fit(train_dataset, validation_data=val_dataset, epochs=3)
except Exception as e:
    print("Error during training:", str(e))
    print("Training did not finish; the model was NOT saved to:", output_dir)
else:
    # Save the fine-tuned model together with its tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Fine-tuning completed! Model saved at:", output_dir)

# Generate answers for questions after training
def generate_answer(query: str):
    """Generate an answer for *query* with the current ``model``.

    The query gets the same "summarize: " prefix that preprocess_data puts
    on the training inputs.

    :param query: Question text.
    :return: Decoded answer string with special tokens removed.
    """
    # Tokenize the prefixed query
    input_ids = tokenizer("summarize: " + query, return_tensors="np").input_ids

    # Beam search with 4 beams. `temperature` is not passed: it only applies
    # when sampling is enabled, and this call does not sample.
    outputs = model.generate(
        input_ids, max_length=150, num_beams=4, early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Ask one question and print what the model answers
query = "What is neurotoxicity?"
answer = generate_answer(query=query)
print("Answer:", answer)





You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565





All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at C:\\Users\\Kaushik\\Desktop\\baymax_personal.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


({'input_ids': <tf.Tensor: shape=(8, 128), dtype=int32, numpy=
array([[21603,    10,  1193, ...,     0,     0,     0],
       [21603,    10,  1193, ...,     0,     0,     0],
       [21603,    10,  1193, ...,     0,     0,     0],
       ...,
       [21603,    10, 11860, ...,     0,     0,     0],
       [21603,    10,  1193, ...,     0,     0,     0],
       [21603,    10,  1193, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(8, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(8, 128), dtype=int32, numpy=
array([[   34,    47,     3, ...,  -100,  -100,  -100],
       [    3,    23,   317, ...,  -100,  -100,  -100],
       [  165,     3,     9, ...,  -100,  -100,  -100],
       ...,
       [  366,     3,     9, ..., 16935,    13,     1],
  



Answer: Neurotoxicity is a neurotoxicity that occurs when a person's nervous system is unable to function properly.
