In [1]:
from datasets import Dataset
import os

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text


STATIC="static/"
file_paths = [STATIC+i for i in os.listdir(STATIC)]
texts = [load_data(file) for file in file_paths]

combined_text = " ".join(texts)
dataset = Dataset.from_dict({"text": [combined_text]})


In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the pre-trained model and tokenizer
model_name = "xlm-roberta-base"  # You can use 'bert-base-multilingual-cased' as well
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    questions = examples["text"]  # Use the same text for questions as this is unsupervised
    inputs = tokenizer(questions, padding="max_length", truncation=True, return_tensors="pt")
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)




Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [3]:
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 1
})

In [4]:
# Load the model
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors='pt')
    outputs = model(**inputs)
    answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start:answer_end]))
    return answer

# Example usage
question = "ਕੰਨਕ ਦੀ ਬੀਮਾਰੀ ਕੀ ਹੈ?"  # Example question in Punjabi
context = combined_text  # Using the entire text as context
print(answer_question(question, context))


In [9]:
from transformers import pipeline

# Load a question generation model
question_generator = pipeline("question-generation")

# Example of splitting large text into chunks
text = combined_text

# Define a function to split the text into chunks
def split_text(text, max_length=512):
    sentences = text.split("। ")  # Split by sentence (assuming '।' is the sentence delimiter)
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length <= max_length:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
    
    # Add the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Split the text
text_chunks = split_text(text)

# Generate questions for each chunk
qa_pairs = []
for chunk in text_chunks:
    questions_and_answers = question_generator(chunk)
    for qa in questions_and_answers:
        qa_pairs.append({"question": qa["question"], "answer": qa["answer"]})

# Print the generated question-answer pairs
for pair in qa_pairs:
    print(f"Question: {pair['question']}")
    print(f"Answer: {pair['answer']}\n")


KeyError: "Unknown task question-generation, available tasks are ['audio-classification', 'automatic-speech-recognition', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"

In [10]:
from transformers import pipeline

# Load the text-to-text generation model
question_generator = pipeline("text2text-generation", model="t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [12]:
# Example text from your Punjabi dataset
text = "ਪੰਜਾਬ ਵਿੱਚ ਖੇਤੀਬਾੜੀ ਪ੍ਰਧਾਨ ਆਰਥਿਕ ਸਰਗਰਮੀ ਹੈ। ਖੇਤੀਬਾੜੀ ਵਿੱਚ ਮੁੱਖ ਤੌਰ ਤੇ ਗੰਦਮ, ਧਾਨ ਅਤੇ ਮੱਕੀ ਦੀ ਕਾਸ਼ਤ ਕੀਤੀ ਜਾਂਦੀ ਹੈ।"

# Define the prompt for question generation
prompt = f"generate questions: {text}"

# Generate multiple questions using beam search
generated_questions = question_generator(prompt, max_length=100, num_return_sequences=5, num_beams=5)

# Display the questions
for i, q in enumerate(generated_questions):
    print(f"Question {i+1}: {q['generated_text']}")

Question 1:            ,          ,            ,              
Question 2:                  ,          ,                     
Question 3:                  ,         ,                      
Question 4:            ,          ,                           
Question 5:            ,         ,                            
