<a href="https://colab.research.google.com/github/LoQiseaking69/LoQiseaking69/blob/main/Copy_of_MUNNINsVOICE_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install tqdm




In [None]:
# Make Google Drive available to this runtime; every dataset/model path used
# later in the notebook lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Script 1: Data Preprocessing

import pandas as pd
from transformers import BertTokenizer

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/final_dataset_with_additional_data_combined_tidy.csv')

# pandas reads empty cells as float NaN, which the tokenizer rejects, so rows
# with a missing question or answer are removed before tokenization. The index
# is reset so that row i of `df` always corresponds to entry i of the lists
# built below.
df = df.dropna(subset=['Question', 'Answer']).reset_index(drop=True)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every (question, answer) pair is padded/truncated to this many tokens.
MAX_SEQ_LENGTH = 128

# Perform tokenization
# `astype(str)` guards against columns pandas inferred as numeric.
questions = df['Question'].astype(str).values
answers = df['Answer'].astype(str).values

# One entry per dataset row; each entry is a tensor of shape (1, MAX_SEQ_LENGTH)
# because a single pair is encoded with return_tensors='pt'.
input_ids, attention_masks, token_type_ids = [], [], []
for q, a in zip(questions, answers):
    # Calling the tokenizer directly is the documented replacement for the
    # legacy `encode_plus` method and produces the same encoding.
    encoded_dict = tokenizer(
        q, a,
        add_special_tokens=True,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])

# Calculate start_positions and end_positions for answers
def _find_token_span(sequence_ids, span_ids, search_from=0):
    """Locate `span_ids` as a contiguous run inside `sequence_ids`.

    Args:
        sequence_ids: list of token ids for one encoded example.
        span_ids: list of token ids to look for.
        search_from: first index of `sequence_ids` that may start a match.

    Returns:
        A `(start, end)` tuple of inclusive indices for the first match, or
        None when `span_ids` is empty or does not occur in full.
    """
    span_len = len(span_ids)
    if span_len == 0:
        return None
    for start in range(search_from, len(sequence_ids) - span_len + 1):
        if sequence_ids[start:start + span_len] == span_ids:
            return start, start + span_len - 1
    return None


start_positions, end_positions, kept_rows = [], [], []
for row, (input_id, segment_ids, answer) in enumerate(zip(input_ids, token_type_ids, answers)):
    answer_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(answer))

    # Each encoding is a (1, seq_len) tensor, so index row 0 to get the flat
    # token sequence. (Calling nonzero() on the 2-D tensor and taking element
    # [0] yields *row* indices, which are always 0.)
    sequence_ids = input_id[0].tolist()
    segment_flags = segment_ids[0].tolist()

    # The answer was encoded as the second segment of the pair, which the
    # tokenizer marks with token_type_id == 1. Starting the search there stops
    # an answer token that also appears in the question from matching first.
    answer_offset = segment_flags.index(1) if 1 in segment_flags else 0

    # Match the whole answer, not only its first token; answers that were
    # truncated away (or are empty) have no valid span and are skipped.
    span = _find_token_span(sequence_ids, answer_ids, answer_offset)
    if span is None:
        continue

    start_positions.append(span[0])
    end_positions.append(span[1])
    kept_rows.append(row)

print(f"Located answer spans for {len(kept_rows)} of {len(input_ids)} rows")
if not kept_rows:
    raise ValueError("No answer span could be located in any row; nothing to train on.")

# Remove rows where no span was found. The tensor lists are filtered together
# with the DataFrame so that they stay aligned with the position labels.
input_ids = [input_ids[row] for row in kept_rows]
attention_masks = [attention_masks[row] for row in kept_rows]
token_type_ids = [token_type_ids[row] for row in kept_rows]

df = df.iloc[kept_rows].copy()
df['start_positions'] = start_positions  # plain ints, so the column stays integer-typed
df['end_positions'] = end_positions

# Add these to DataFrame if you want to save them for later use.
# They are stored as plain lists of ints so the CSV holds the token ids
# themselves rather than the repr of a tensor object.
df['input_ids'] = [encoded[0].tolist() for encoded in input_ids]
df['attention_mask'] = [encoded[0].tolist() for encoded in attention_masks]
df['token_type_ids'] = [encoded[0].tolist() for encoded in token_type_ids]

# You can also save this DataFrame if you'd like
df.to_csv('/content/drive/MyDrive/final_dataset_with_positions.csv', index=False)



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Script 2: DataLoader Creation

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

BATCH_SIZE = 16
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42  # fixed so the train/validation split is reproducible across runs


def _as_batch_tensor(rows):
    """Concatenate a list of per-example tensors along dim 0.

    A tensor is returned unchanged, so re-running this cell after the lists
    have already been concatenated does not flatten the data.
    """
    if torch.is_tensor(rows):
        return rows
    return torch.cat(rows, dim=0)


# Convert lists of tensors to single tensors
input_ids = _as_batch_tensor(input_ids)
attention_masks = _as_batch_tensor(attention_masks)
token_type_ids = _as_batch_tensor(token_type_ids)

# Convert start_positions and end_positions to tensors.
# They are class indices for a cross-entropy loss and therefore must be int64;
# the explicit cast also fails loudly if any NaN label is still present.
start_positions = torch.tensor(df['start_positions'].astype('int64').tolist(), dtype=torch.long)
end_positions = torch.tensor(df['end_positions'].astype('int64').tolist(), dtype=torch.long)

if not (len(input_ids) == len(start_positions) == len(end_positions)):
    raise ValueError(
        f"Encoded inputs ({len(input_ids)}) and position labels "
        f"({len(start_positions)}/{len(end_positions)}) are out of sync; "
        "re-run the preprocessing cell."
    )

# Prepare DataLoader (built once, with the labels included)
dataset = TensorDataset(input_ids, attention_masks, token_type_ids, start_positions, end_positions)
train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)


In [None]:
from transformers import BertForQuestionAnswering
import torch.optim as optim
import torch
from tqdm import tqdm  # Import the tqdm library for the progress bar

import os

MODEL_SAVE_PATH = "/content/drive/MyDrive/MUNNINsModelversions/MUNNINVv1.pt"

# Train on the GPU when the runtime provides one; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the BERT model for Question Answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device)

# Initialize the optimizer (created after the move so it tracks the on-device parameters)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Define number of epochs
n_epochs = 3

# Training loop: one optimisation pass and one validation pass per epoch
for epoch in range(n_epochs):
    model.train()

    # Initialize tqdm for the progress bar
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", position=0, leave=True)

    for batch in progress_bar:
        # Batches come out of the DataLoader on the CPU; move them next to the model.
        b_input_ids, b_attention_mask, b_token_type_ids, b_start_positions, b_end_positions = (
            tensor.to(device) for tensor in batch
        )
        optimizer.zero_grad()
        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids,
                        start_positions=b_start_positions, end_positions=b_end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Update tqdm progress bar
        progress_bar.set_postfix({'loss': loss.item()})

    # Report the loss on the held-out split so over-fitting is visible.
    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids, b_attention_mask, b_token_type_ids, b_start_positions, b_end_positions = (
                tensor.to(device) for tensor in batch
            )
            outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, token_type_ids=b_token_type_ids,
                            start_positions=b_start_positions, end_positions=b_end_positions)
            val_loss_total += outputs.loss.item()
    # max(..., 1) avoids a division by zero when the validation split is empty.
    print(f"Epoch {epoch + 1} validation loss: {val_loss_total / max(len(validation_dataloader), 1):.4f}")

# Save the entire model at the end of all epochs.
# It is moved back to the CPU first so that the saved file loads on machines
# without a GPU and so that later cells can feed it CPU tensors.
model.to("cpu")
# Create the target folder if needed; otherwise a missing directory would
# discard the trained weights at the very last step.
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
torch.save(model, MODEL_SAVE_PATH)




Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   2%|▏         | 2/105 [00:55<45:15, 26.36s/it, loss=4.97]  

In [None]:
# Script 4: Model Interaction

# Interact with the model
def answer_question(question, max_length=128):
    """Return the text span the model selects for `question`.

    Args:
        question: the user's question as a string.
        max_length: maximum number of tokens fed to the model; longer input
            is truncated.

    Returns:
        The decoded answer span, or an empty string when the predicted end
        position lies before the predicted start position.

    NOTE(review): only the question is encoded here, so the predicted span can
    only be drawn from the question's own tokens. Confirm whether a context
    passage was meant to be supplied as the second segment.
    """
    # No padding: a single example needs none, and leaving it out keeps the
    # argmax below from selecting a [PAD] position.
    encoded_question = tokenizer(
        question,
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )

    # Run on whichever device the model currently lives on.
    model_device = next(model.parameters()).device
    model_inputs = {name: tensor.to(model_device) for name, tensor in encoded_question.items()}

    with torch.no_grad():
        output = model(**model_inputs)

    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))
    if answer_end < answer_start:
        # Start and end are predicted independently and can be inconsistent.
        return ""

    tokens = tokenizer.convert_ids_to_tokens(encoded_question['input_ids'][0].tolist())
    # Merge word pieces ("play", "##ing" -> "playing") instead of joining raw tokens.
    return tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end + 1])


# Disable dropout so that the same question always yields the same answer.
model.eval()

# Keep answering until the user submits an empty line, "quit" or "exit",
# or interrupts the cell.
while True:
    try:
        question = input("Enter your question: ")
    except (EOFError, KeyboardInterrupt):
        break
    if question.strip().lower() in {"", "quit", "exit"}:
        break
    answer = answer_question(question)
    print(f"Answer: {answer}")
