In [1]:
# train_neo_model.py

import torch
from datasets import load_dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Single checkpoint id shared by the tokenizer and the model so they cannot drift apart.
BASE_CHECKPOINT = "EleutherAI/gpt-neo-125M"

# Load the training text file as the "train" split.
dataset = load_dataset("text", data_files={"train": "data1.txt"})

# Tokenizer first; EOS is reused as the pad token so fixed-length padding
# (requested later during tokenization) has a token to pad with.
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

# Pretrained causal language model to fine-tune.
model = GPTNeoForCausalLM.from_pretrained(BASE_CHECKPOINT)

# Tokenize the dataset
def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch.

    Every sequence is truncated and padded to exactly 128 tokens, using the
    module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of strings when
            used with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that input.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["text"], **encode_options)

# Tokenize every example and drop the raw text column, which the model cannot consume.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Collator configured for causal language modeling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Checkpoints, the final model and the tokenizer all go to the same directory.
OUTPUT_DIR = "./neo_outputs"

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Half precision needs a GPU; requesting it unconditionally makes this
    # cell fail on a CPU-only kernel, so enable it only when CUDA is present.
    fp16=torch.cuda.is_available(),
    save_steps=200,
    save_total_limit=2,
    logging_steps=50,
    logging_dir="./neo_logs",
    report_to="none",
)

# Trainer setup
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=`; kept as-is for compatibility with the
# installed version -- confirm before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be
# reloaded together from OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Fine-tuning complete. Model and tokenizer saved to {OUTPUT_DIR}")


  trainer = Trainer(


Step,Training Loss
50,2.5765
100,2.5188
150,2.4961
200,2.5589
250,2.5702
300,2.5419
350,2.5632
400,2.5399
450,2.489
500,2.4963


✅ Fine-tuning complete. Model and tokenizer saved to ./neo_outputs


In [4]:
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle

# Input corpus (a single UTF-8 text file) and the directory that receives
# the FAISS index plus the pickled chunk metadata.
FILE_PATH = "data1.txt"
INDEX_DIR = "./vector_index"
os.makedirs(INDEX_DIR, exist_ok=True)  # no-op when the directory already exists

# Sentence-embedding model used to vectorize every chunk.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# 🔹 Function to split text into chunks (~100 words)
def chunk_text(text, chunk_size=100):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens);
    each chunk is re-joined with single spaces. The last chunk may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        List of chunk strings, in document order (empty for blank input).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# 🔹 Read and chunk the single file
with open(FILE_PATH, "r", encoding="utf-8") as f:
    text = f.read()
chunks = chunk_text(text)

print(f"📄 Total chunks: {len(chunks)}")

# Fail early with a clear message: with no chunks there is nothing to embed,
# and the index dimension below could not be derived from the embeddings.
if not chunks:
    raise ValueError(f"No text found in {FILE_PATH!r}; nothing to index.")

# 🔹 Generate embeddings
embeddings = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
# FAISS works on C-contiguous float32 matrices; make that explicit instead of
# relying on the encoder's output dtype.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# 🔹 Store in FAISS (exact L2 search over the raw vectors)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# 🔹 Save FAISS index and text chunks
faiss.write_index(index, os.path.join(INDEX_DIR, "faiss_index.idx"))

with open(os.path.join(INDEX_DIR, "chunks.pkl"), "wb") as f:
    pickle.dump(chunks, f)

# One source label per chunk, derived from FILE_PATH so the metadata stays
# correct if the input file is changed (previously hard-coded to "data1.txt").
with open(os.path.join(INDEX_DIR, "sources.pkl"), "wb") as f:
    pickle.dump([os.path.basename(FILE_PATH)] * len(chunks), f)

print(f"✅ Embeddings created and stored in {INDEX_DIR}/")


📄 Total chunks: 26124


Batches:   0%|          | 0/817 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


✅ Embeddings created and stored in ./vector_index/


In [8]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

# Retrieval configuration
INDEX_DIR = "./vector_index"
TOP_K = 3  # default number of chunks returned per query

# Query encoder and the persisted FAISS index.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index(f"{INDEX_DIR}/faiss_index.idx")

# Chunk texts and their source labels, stored next to the index.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f"{INDEX_DIR}/chunks.pkl", "rb") as chunk_file:
    chunks = pickle.load(chunk_file)

with open(f"{INDEX_DIR}/sources.pkl", "rb") as source_file:
    sources = pickle.load(source_file)

def retrieve_top_chunks(query, top_k=TOP_K):
    """Print and return the stored chunks nearest to ``query``.

    The query is embedded with the module-level ``embedder`` and searched
    against the module-level FAISS ``index``; hits are mapped back to text via
    ``chunks`` and labelled via ``sources``.

    Args:
        query: Question text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        List of matching chunk strings in the order FAISS returned them. It
        may hold fewer than ``top_k`` items when the index has fewer vectors.
    """
    # Embed the user query
    query_vector = embedder.encode([query])

    # Search in FAISS; the distances are not used here.
    _, indices = index.search(np.array(query_vector), top_k)

    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours.
    # Indexing a Python list with -1 would silently return the LAST chunk, so
    # those placeholders are dropped.
    hit_ids = [i for i in indices[0] if i >= 0]
    top_chunks = [chunks[i] for i in hit_ids]
    top_sources = [sources[i] for i in hit_ids]

    print("\n🔎 Top Relevant Chunks:\n")
    for rank, (chunk, source) in enumerate(zip(top_chunks, top_sources), start=1):
        print(f"Chunk {rank} from [{source}]:\n{chunk}\n")

    return top_chunks
# Sample question used to sanity-check retrieval.
query = "What are the symptoms of lung cancer?"
top_chunks = retrieve_top_chunks(query, top_k=TOP_K)




🔎 Top Relevant Chunks:

Chunk 1 from [data1.txt]:
of cell type in patients with cancer of the lungs. In order to evaluate the determinants of cell type in patients with primary lung cancer, we compared smoking characteristics in 1,939 patients (1,474 men and 465 women). Patients with squamous cell carcinomas, adenocarcinomas, or small-cell carcinomas were eligible. This study did not consider smoking as a risk factor for lung cancer, as all subjects had a confirmed diagnosis. We were interested in smoking history and the pattern of smoking among those whose risk was 100 percent. Among these patients, we confirmed that a larger subset of nonsmoking individuals developed adenocarcinomas

Chunk 2 from [data1.txt]:
surgically treated 185 patients with non-small cell lung cancer who were 70 years old or older. The operative mortality rate was 3%, and the 5-year survival rate was 48%. The mortality and prognosis were similar to those in younger patients. The number of elderly patients who s

In [10]:
import os
import torch
import pickle
import faiss
import numpy as np
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from sentence_transformers import SentenceTransformer

# Locations of the fine-tuned checkpoint and of the persisted vector index.
MODEL_DIR = "./neo_outputs"
INDEX_DIR = "./vector_index"

# Tokenizer of the fine-tuned checkpoint; EOS doubles as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)
tokenizer.pad_token = tokenizer.eos_token

# Put the model on the GPU when one is available, otherwise stay on the CPU.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPTNeoForCausalLM.from_pretrained(MODEL_DIR)
model = model.to(target_device)

# Query encoder and the persisted FAISS index.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index(os.path.join(INDEX_DIR, "faiss_index.idx"))

# Chunk texts and source labels stored next to the index.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(INDEX_DIR, "chunks.pkl"), "rb") as chunk_file:
    chunks = pickle.load(chunk_file)

with open(os.path.join(INDEX_DIR, "sources.pkl"), "rb") as source_file:
    sources = pickle.load(source_file)

def retrieve_top_chunks(query, top_k=3):
    """Return the stored chunks nearest to ``query``.

    The query is embedded with the module-level ``embedder`` and searched
    against the module-level FAISS ``index``; hit ids are mapped back to text
    through ``chunks``.

    Args:
        query: Question text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        List of chunk strings in the order FAISS returned them. It may hold
        fewer than ``top_k`` items when the index has fewer vectors.
    """
    query_vector = embedder.encode([query])
    # Only the ids are needed; the distances are discarded.
    _, indices = index.search(np.array(query_vector), top_k)
    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours.
    # chunks[-1] would silently return the last chunk, so drop those ids.
    return [chunks[i] for i in indices[0] if i >= 0]

def generate_answer(query, context_chunks):
    """Generate an answer to ``query`` grounded in ``context_chunks``.

    Builds a "Context / Question / Answer:" prompt, runs the module-level
    ``model`` on it and returns only the newly generated text.

    Args:
        query: The user's question.
        context_chunks: Iterable of retrieved text chunks, joined with
            newlines to form the context section of the prompt.

    Returns:
        The generated continuation (prompt excluded), stripped of
        surrounding whitespace.
    """
    context = "\n".join(context_chunks)
    full_prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Let the tokenizer build the attention mask. Deriving it from
    # `ids != pad_token_id` would also mask genuine EOS tokens, because the
    # pad token is set to EOS.
    # NOTE(review): truncation presumably drops tokens from the end of the
    # prompt, so an over-long context would cut off the question -- confirm.
    encoded = tokenizer(
        full_prompt, return_tensors="pt", max_length=1024, truncation=True
    ).to(model.device)
    input_ids = encoded["input_ids"]

    outputs = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],
        max_new_tokens=150,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count instead of cutting the decoded string at len(full_prompt):
    # decoding does not always reproduce the prompt text exactly (whitespace
    # clean-up, truncation), which would shift that character offset.
    generated_ids = outputs[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Interactive question loop; typing "exit" (any casing) ends the session.
while True:
    query = input("\n🧠 Ask a medical question (or type 'exit'): ").strip()
    wants_exit = query.lower() == "exit"
    if wants_exit:
        break

    # Show the retrieved context before generating from it.
    top_chunks = retrieve_top_chunks(query)
    print("\n🔎 Top Chunks:")
    for position, chunk in enumerate(top_chunks, start=1):
        print(f"Chunk {position}:\n{chunk}\n")

    print("🤖 Generating Answer...\n")
    answer = generate_answer(query, top_chunks)
    print("🧠 Answer:\n", answer)



🧠 Ask a medical question (or type 'exit'):  what are symptoms of lung cancer?



🔎 Top Chunks:
Chunk 1:
of cell type in patients with cancer of the lungs. In order to evaluate the determinants of cell type in patients with primary lung cancer, we compared smoking characteristics in 1,939 patients (1,474 men and 465 women). Patients with squamous cell carcinomas, adenocarcinomas, or small-cell carcinomas were eligible. This study did not consider smoking as a risk factor for lung cancer, as all subjects had a confirmed diagnosis. We were interested in smoking history and the pattern of smoking among those whose risk was 100 percent. Among these patients, we confirmed that a larger subset of nonsmoking individuals developed adenocarcinomas

Chunk 2:
not correspond to previous pathological classifications. Atrophy was not related to the duration of tumour symptoms, ageing, clinical type of myopathy or histological type of lung tumour, and was statistically different from that seen in controls. Qualitatively, the presence of weight loss, muscle wasting and metastatic 


🧠 Ask a medical question (or type 'exit'):  exit
