In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('path_to_TeleQnA_dataset.csv')

# Preprocess the dataset: removing missing or malformed data.
# Rebinding `df` (rather than inplace=True) keeps every step a plain
# expression that returns a new frame.
df = df.dropna(subset=['question', 'answer'])

# Cast to str before stripping: on an object column the .str accessor yields
# NaN for any non-string cell (e.g. a purely numeric answer), which would
# silently bring back the missing values that were just dropped.
df = df.assign(
    question=df['question'].astype(str).str.strip(),
    answer=df['answer'].astype(str).str.strip(),
)

# A cell that held only whitespace is now ''. Written out as-is it becomes an
# empty CSV field, which read_csv parses as missing by default, so drop those
# rows here rather than leaving them in the "cleaned" file.
df = df[(df['question'] != '') & (df['answer'] != '')]

# Save the cleaned dataset (without the index column).
df.to_csv('cleaned_TeleQnA_dataset.csv', index=False)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Single checkpoint identifier shared by the tokenizer and the seq2seq model,
# so the two are always loaded from the same place.
model_name = "microsoft/phalcon-b-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Question/answer dataset. Later code indexes its 'train' and 'validation'
# entries, so whatever this path resolves to must provide both.
dataset = load_dataset('path_to_TeleQnA_dataset')

def preprocess_function(examples):
    """Tokenize questions as model inputs and answers as training labels.

    Uses the module-level ``tokenizer``. Both sides are truncated to at most
    512 tokens; no padding is applied here.

    Args:
        examples: Mapping with ``'question'`` and ``'answer'`` entries. When
            used through ``dataset.map(..., batched=True)`` each entry holds
            a batch of values rather than a single one.

    Returns:
        The tokenizer output for the questions, with the answers' token ids
        stored under the additional ``'labels'`` key.
    """
    encoded = tokenizer(examples['question'], max_length=512, truncation=True)
    answer_ids = tokenizer(examples['answer'], max_length=512, truncation=True)['input_ids']
    encoded['labels'] = answer_ids
    return encoded

from transformers import DataCollatorForSeq2Seq

# Tokenize every split; batched=True hands preprocess_function a batch of
# examples per call instead of one row at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# preprocess_function truncates but never pads, so the examples in a batch
# have different lengths. Without an explicit collator the Trainer cannot
# stack them into tensors. This collator pads inputs per batch and pads
# labels with the ignore index so padding does not contribute to the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
)

trainer.train()


In [None]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

# Load tokenizer, retriever and model from ONE checkpoint. Previously the
# tokenizer and retriever came from the rag-token checkpoint while the model
# came from the rag-sequence one; keeping them aligned guarantees that the
# retriever configuration and vocabularies match the model that is loaded.
rag_checkpoint = "facebook/rag-sequence-base"

# Load the RAG tokenizer and retriever
rag_tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)
# NOTE(review): `passages_path` appears to be honoured only together with
# index_name="custom" (plus an `index_path`); with "exact" the canonical
# index is presumably used instead — confirm which one is intended.
rag_retriever = RagRetriever.from_pretrained(
    rag_checkpoint,
    index_name="exact",
    passages_path="path_to_passages",
)

# Load the RAG sequence model, wired to the retriever above
rag_model = RagSequenceForGeneration.from_pretrained(rag_checkpoint, retriever=rag_retriever)

# Tokenize the input (a one-element batch, returned as PyTorch tensors)
input_text = "What are the key features of 5G technology?"
inputs = rag_tokenizer([input_text], return_tensors="pt")

# Generate the response and decode it back to text
outputs = rag_model.generate(input_ids=inputs['input_ids'])
response = rag_tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response)
