In [None]:
import pandas as pd
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# STEP 1: Load your data and filter
import pandas as pd

# Read only the four columns the QA pipeline uses (this also sidesteps dtype
# warnings from unrelated columns), then filter in a single chain:
#   1. drop rows where any of those four columns is missing;
#   2. keep rows whose subject, compared case-insensitively, is physics or
#      chemistry.
df = (
    pd.read_csv(
        '/home/ri_rishna_urandugp/.vscode/Transformers/NCERT_dataset.csv',
        usecols=['Explanation', 'Question', 'Answer', 'subject'],
    )
    .dropna(subset=['Explanation', 'Question', 'Answer', 'subject'])
    .loc[lambda frame: frame['subject'].str.lower().isin(['physics', 'chemistry'])]
)

print(f"Total rows after filtering: {len(df)}")


In [None]:
# Checkpoint identifier shared by the tokenizer and the model, so both are
# loaded from the same pretrained checkpoint.
model_name = "google/flan-t5-small"  # Better QA performance than base t5-small
# Tokenizer used to encode the training text and to decode generated answers.
tokenizer = T5Tokenizer.from_pretrained(model_name)
# TensorFlow T5 conditional-generation model that the later cells fine-tune.
model = TFT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Token budgets: prompts are truncated/padded to 128 tokens, answers to 64.
max_input_length = 128
max_target_length = 64

# Build the training text pairs from the filtered DataFrame. The prompt layout
# ("explanation: ... question: ...") is the same one the inference helpers in
# this notebook use, so training and generation see identical formatting.
# astype(str) guards against non-string cells (e.g. purely numeric answers).
inputs = (
    "explanation: " + df['Explanation'].astype(str)
    + " question: " + df['Question'].astype(str)
).tolist()
targets = df['Answer'].astype(str).tolist()

# Encode the prompts as fixed-length TensorFlow tensors.
input_encodings = tokenizer(
    inputs, max_length=max_input_length, truncation=True, padding='max_length', return_tensors='tf'
)

# Encode the answers with a plain tokenizer call. T5 uses one shared
# vocabulary for source and target text, so the deprecated
# `tokenizer.as_target_tokenizer()` context manager is not needed here.
target_encodings = tokenizer(
    targets, max_length=max_target_length, truncation=True, padding='max_length', return_tensors='tf'
)


In [None]:
# Targets were padded to a fixed length. Replace every pad position with -100,
# the label value the Hugging Face loss ignores, so the model is not trained
# (and the reported loss is not dominated) by predicting padding.
labels = target_encodings['input_ids']
labels = tf.where(
    tf.equal(labels, tokenizer.pad_token_id),
    -100 * tf.ones_like(labels),
    labels,
)

# A shuffle buffer at least as large as the dataset gives a uniform shuffle on
# every epoch; a small fixed buffer only shuffles locally.
num_examples = int(labels.shape[0])

dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask']
    },
    labels
)).shuffle(max(num_examples, 1)).batch(4)  # Batch size 4; adjust as you like


In [None]:
# Adam optimizer with a fixed learning rate of 3e-5; passed to model.compile()
# below for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

class CustomProgressCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints compact training progress.

    Prints a banner at the start of each epoch, the running loss on every
    10th batch, and the loss at the end of each epoch. Epoch and batch
    numbers are shown 1-based.
    """

    @staticmethod
    def _format_loss(logs):
        """Return the 'loss' entry of ``logs`` formatted to 4 decimals.

        ``logs`` defaults to None in every hook signature and is not
        guaranteed to contain 'loss', so fall back to "n/a" instead of
        raising TypeError/KeyError in the middle of training.
        """
        loss = (logs or {}).get('loss')
        return "n/a" if loss is None else f"{loss:.4f}"

    def on_epoch_begin(self, epoch, logs=None):
        """Announce the start of an epoch (``epoch`` is 0-based)."""
        print(f"\nStart of epoch {epoch + 1}")

    def on_batch_end(self, batch, logs=None):
        """Print the loss on every 10th batch (``batch`` is 0-based)."""
        if batch % 10 == 0:  # Print every 10 batches
            print(f"  Completed batch {batch + 1}: loss = {self._format_loss(logs)}")

    def on_epoch_end(self, epoch, logs=None):
        """Print the loss reported at the end of the epoch."""
        print(f"End of epoch {epoch + 1}: loss = {self._format_loss(logs)}")

# No `loss` argument is given to compile().
# NOTE(review): this relies on the Hugging Face TF model supplying its own
# training loss when labels are provided; confirm for the installed version.
model.compile(optimizer=optimizer)

# Train for 8 epochs (more passes to compensate for a small dataset).
# verbose=0 turns off Keras' own progress output, so the only progress lines
# come from the custom callback.
progress_callback = CustomProgressCallback()
model.fit(dataset, epochs=8, verbose=0, callbacks=[progress_callback])


In [None]:
# Write the fine-tuned model first, then the tokenizer, into the same
# directory so the pair can be reloaded together with from_pretrained().
output_dir = 'my_qa_model_tf'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Training complete! Model and tokenizer saved in 'my_qa_model_tf/'")

In [None]:
# Load model and tokenizer
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

# Reload the tokenizer and model from the 'my_qa_model_tf' directory; this
# rebinds the module-level `tokenizer` and `model` names that answer_qa()
# reads.
tokenizer = T5Tokenizer.from_pretrained('my_qa_model_tf')
model = TFT5ForConditionalGeneration.from_pretrained('my_qa_model_tf')

def answer_qa(explanation, question, max_length=64):
    """Generate an answer to ``question`` using ``explanation`` as context.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        explanation: Context passage the answer should be drawn from.
        question: Question to answer.
        max_length: Upper bound on the generated answer length in tokens.
            Without it, generate() falls back to the library default of 20
            tokens, which cuts off longer answers.

    Returns:
        The decoded answer string with special tokens removed.
    """
    input_text = f"explanation: {explanation} question: {question}"
    # Truncate over-long prompts to 512 tokens and keep the attention mask so
    # generate() knows which positions hold real tokens.
    encoded = tokenizer(
        input_text, return_tensors='tf', max_length=512, truncation=True
    )
    outputs = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: answer one question from a very short context and print the result.
sample_explanation = "static electricity."
sample_question = "What is the phenomenon that causes a spark when touching a doorknob?"
print(answer_qa(sample_explanation, sample_question))

In [None]:
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
import tensorflow as tf

class QASystem:
    """Question-answering wrapper around a saved T5 tokenizer/model pair.

    The tokenizer and model are loaded once in ``__init__``. ``answer_qa``
    builds an ``explanation: ... question: ...`` prompt, runs beam-search
    generation and returns the decoded answer.
    """

    def __init__(self, model_path='my_qa_model_tf'):
        """Load the tokenizer and model stored under ``model_path``.

        Raises:
            RuntimeError: If either object fails to load. The original
                exception is chained as ``__cause__`` so the root cause stays
                visible in the traceback.
        """
        try:
            self.tokenizer = T5Tokenizer.from_pretrained(model_path)
            self.model = TFT5ForConditionalGeneration.from_pretrained(model_path)
        except Exception as e:
            raise RuntimeError(f"Error loading model: {str(e)}") from e

    def answer_qa(self, explanation, question, max_length=128, num_beams=4):
        """Answer ``question`` using ``explanation`` as context.

        Args:
            explanation: Context passage; must be non-empty.
            question: Question text; must be non-empty.
            max_length: Maximum length of the generated answer in tokens.
            num_beams: Beam width used for beam-search decoding.

        Returns:
            The stripped answer string. If tokenization, generation or
            decoding fails, a string starting with
            ``"Error generating answer: "`` is returned instead of raising.

        Raises:
            ValueError: If ``explanation`` or ``question`` is empty.
        """
        # Validate before entering the try block: a caller error must reach
        # the caller as ValueError rather than being folded into the
        # best-effort error string below.
        if not explanation or not question:
            raise ValueError("Both explanation and question must be provided")

        try:
            # Prepare input text
            input_text = f"explanation: {explanation} question: {question}"

            # Tokenize a single prompt. No padding is needed for one sequence;
            # prompts longer than 512 tokens are truncated.
            inputs = self.tokenizer(
                input_text,
                return_tensors='tf',
                max_length=512,
                truncation=True
            )

            # Deterministic beam search. The attention mask is passed
            # explicitly so the model only attends to real tokens. Sampling
            # knobs (temperature/top_k/top_p) are omitted because they have
            # no effect unless do_sample=True.
            outputs = self.model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

            # Decode and clean answer
            answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return answer.strip()

        except Exception as e:
            # Deliberate best-effort: report runtime failures as text so a
            # batch of questions is not aborted by one bad generation.
            return f"Error generating answer: {str(e)}"

# Instantiate the QA wrapper with its default model directory.
qa_system = QASystem()

# A fuller context passage than a one-word explanation, to give the model
# something to extract answers from.
explanation = """
Static electricity is a phenomenon that occurs when electric charges accumulate on surfaces.
When you walk across a carpet, your body builds up electrons through friction.
When touching a metal doorknob, these excess charges suddenly transfer to the metal,
causing a spark and sometimes a small shock. This is known as electrostatic discharge.
"""

questions = [
    "What is the phenomenon that causes a spark when touching a doorknob?",
    "How does static electricity build up?",
    "What happens when you touch the doorknob?"
]

# Ask every question against the same context, printing each Q/A pair.
for question in questions:
    print(f"\nQuestion: {question}")
    answer = qa_system.answer_qa(explanation, question)
    print(f"Answer: {answer}")

# Exercise the empty-input path; a ValueError, if raised, is reported rather
# than left to stop the notebook.
try:
    print("\nTesting with empty input:")
    empty_result = qa_system.answer_qa("", "")
    print(empty_result)
except ValueError as err:
    print(f"Caught error: {err}")