In [None]:
%pip install torch tiktoken sentencepiece blobfile datasets

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub (needed for gated checkpoints).
# The access token is never written into the notebook: it is read from the
# HF_TOKEN environment variable, and when that is unset `login()` receives
# token=None and asks for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Pull the GPT-2 Medium checkpoint from the Hub: tokenizer first, then weights.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Both Auto* classes are pointed at the same repository id; the generator keeps
# the original load order (tokenizer, then model) and binds the same two names.
tokenizer, model = (
    auto_cls.from_pretrained("openai-community/gpt2-medium")
    for auto_cls in (AutoTokenizer, AutoModelForCausalLM)
)



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load the dataset
dataset = load_dataset("ParthKadam2003/NCERT_Dataset")

# Decide which split is used for evaluation. When the dataset ships without a
# "validation" split, carve 10% out of "train" (the held-out part is then named
# "test"); otherwise use the existing "validation" split. The seed makes the
# split reproducible across kernel restarts.
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    eval_split = "test"
    print("Dataset split into 90% train and 10% validation.")
else:
    eval_split = "validation"

# Load the Llama 3.2 3B Instruct model and tokenizer
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Use GPU if available
    torch_dtype=torch.float32
)
# The tokenizer has no dedicated padding token configured here, so EOS is reused.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


# Tokenize the dataset (using Explanation column)
def tokenize_function(examples):
    """Tokenize a batch of "Explanation" texts for causal-LM fine-tuning.

    Args:
        examples: Batch mapping produced by ``Dataset.map(batched=True)``;
            only the "Explanation" column is read.

    Returns:
        The tokenizer output (sequences truncated/padded to 128 tokens) with an
        extra "labels" entry: a copy of ``input_ids`` in which every padding
        position is replaced by -100 so it is ignored by the loss.
    """
    tokens = tokenizer(examples["Explanation"],
                       truncation=True,
                       padding="max_length",
                       max_length=128)
    # Padding shares its id with EOS, so mask by attention_mask rather than by
    # token id: only real tokens (mask == 1) contribute to the loss.
    tokens["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens


tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["Explanation"])

# Set training arguments
training_args = TrainingArguments(
    output_dir="llama-3_2Instruct-science-finetuned",
    # evaluation_strategy="epoch",  # left disabled: no evaluation runs during training
    per_device_train_batch_size=4,  # Adjust for your system
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),  # Mixed precision training if GPU available
    save_strategy="epoch",
    logging_dir="./logs",
    push_to_hub=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets[eval_split],  # Held-out split chosen above
)

# Start training
trainer.train()

# Keep the matching tokenizer next to the checkpoints so the fine-tuned model
# can later be reloaded together with the vocabulary it was trained on.
tokenizer.save_pretrained(training_args.output_dir)


In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint
import torch

# Load the tokenizer (ensure it is saved in the main directory).
# It must be the tokenizer of the base model that was fine-tuned: a tokenizer
# from a different model family produces token ids the checkpoint was never
# trained on. NOTE(review): assumes the checkpoints in this directory come from
# the Llama-3.2-3B-Instruct fine-tuning run; change this if they do not.
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
output_dir = "./llama-3_2Instruct-science-finetuned"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # same padding convention as training
tokenizer.save_pretrained(output_dir)  # Save tokenizer to the main directory

# Load the fine-tuned model. Prefer the checkpoint this cell was written for and
# fall back to the most recent checkpoint in the output directory, because the
# step number in the folder name changes with dataset size and batch size.
checkpoint_dir = os.path.join(output_dir, "checkpoint-1725")
if not os.path.isdir(checkpoint_dir):
    checkpoint_dir = get_last_checkpoint(output_dir)
    if checkpoint_dir is None:
        raise FileNotFoundError(
            f"No checkpoint-* directory found in {output_dir!r}; run the training cell first."
        )
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)

# Set the model to evaluation mode
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

print(f"Fine-tuned model loaded from {checkpoint_dir} and ready for testing!\n")

# Infinite test loop
while True:
    user_question = input("Ask your science question (or type 'exit' to quit): ").strip()
    if user_question.lower() == "exit":
        print("Goodbye!")
        break
    if not user_question:
        # Nothing was typed; ask again instead of generating from an empty prompt.
        continue

    # Prepare the input (input_ids plus attention_mask, moved to the model's device)
    input_text = f"Q: {user_question}\nA:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate the answer
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,  # Budget for the answer only; prompt length does not eat into it
            num_return_sequences=1,
            temperature=0.7,  # Controls randomness (0.7 is balanced)
            top_p=0.9,        # Nucleus sampling
            do_sample=True,   # Enable sampling for creativity
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, so an "A:" inside the question or
    # the answer itself cannot truncate what is displayed.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)
    print("\nAnswer:", answer.strip(), "\n")


In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint
import torch

# Load the tokenizer (ensure it is saved in the main directory).
# It must be the tokenizer of the base model that was fine-tuned: a tokenizer
# from a different model family produces token ids the checkpoint was never
# trained on. NOTE(review): assumes the checkpoints in this directory come from
# the Llama-3.2-3B-Instruct fine-tuning run; change this if they do not.
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
output_dir = "./llama-3_2Instruct-science-finetuned"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # same padding convention as training
tokenizer.save_pretrained(output_dir)  # Save tokenizer to the main directory

# Load the fine-tuned model. Prefer the checkpoint this cell was written for and
# fall back to the most recent checkpoint in the output directory, because the
# step number in the folder name changes with dataset size and batch size.
checkpoint_dir = os.path.join(output_dir, "checkpoint-4992")
if not os.path.isdir(checkpoint_dir):
    checkpoint_dir = get_last_checkpoint(output_dir)
    if checkpoint_dir is None:
        raise FileNotFoundError(
            f"No checkpoint-* directory found in {output_dir!r}; run the training cell first."
        )
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)

# Set the model to evaluation mode
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

print(f"Fine-tuned model loaded from {checkpoint_dir} and ready for testing!\n")

# Infinite test loop
while True:
    user_question = input("Ask your science question (or type 'exit' to quit): ").strip()
    if user_question.lower() == "exit":
        print("Goodbye!")
        break
    if not user_question:
        # Nothing was typed; ask again instead of generating from an empty prompt.
        continue

    # Prepare the input (input_ids plus attention_mask, moved to the model's device)
    input_text = f"Q: {user_question}\nA:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate the answer
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,  # Budget for the answer only; prompt length does not eat into it
            num_return_sequences=1,
            temperature=0.7,  # Controls randomness (0.7 is balanced)
            top_p=0.9,        # Nucleus sampling
            do_sample=True,   # Enable sampling for creativity
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, so an "A:" inside the question or
    # the answer itself cannot truncate what is displayed.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)
    print("\nAnswer:", answer.strip(), "\n")

: 