<a href="https://colab.research.google.com/github/HelloAlgorithmBob/NLP_project/blob/main/English_NLP_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.9 MB/s[0m eta [36m0:00:0

In [2]:
from transformers import pipeline

# Load the pipeline for question answering
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased")

# Reading passage
passage = """
There are 40 packafes in the Guangzhou. There are 30 packages in the Beijing.
"""

# List of questions
questions = [
    "How many total packages in beijing?",
]

# Answer the questions using the model
for question in questions:
    result = qa_pipeline(question=question, context=passage)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['score']}")
    print("=" * 50)


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


Question: How many total packages in beijing?
Answer: 30
Confidence: 0.6522623300552368


In [7]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load pre-trained DistilBERT model and tokenizer
model_name = "distilbert-base-cased"
model = DistilBertForQuestionAnswering.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Reading passage
passage = """
There are 40 packages in Guangzhou. There are 30 packages in Beijing.
"""

# List of questions and corresponding answers
questions = [
    {"question": "How many total packages in Beijing?", "answer": "30"}
    # Add more questions and answers to your dataset
]

# Tokenize the data and prepare input tensors
input_ids = []
attention_masks = []
start_positions = []
end_positions = []

for example in questions:
    encoded = tokenizer.encode_plus(
        text=passage,
        question=example["question"],
        truncation="only_second",  # Passages are truncated
        padding="max_length",
        max_length=128,  # Adjust as needed
        return_tensors="pt",
    )
    input_ids.append(encoded["input_ids"])
    attention_masks.append(encoded["attention_mask"])

    answer_tokens = tokenizer.encode(example["answer"], add_special_tokens=False)

    # Find indices of answer tokens in input_ids
    ans_token_indices = [i for i, token in enumerate(encoded["input_ids"].squeeze().tolist()) if token in answer_tokens]

    # Set start and end positions
    start_positions.append(ans_token_indices[0])
    end_positions.append(ans_token_indices[-1])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
start_positions = torch.tensor(start_positions)
end_positions = torch.tensor(end_positions)

# Create a TensorDataset
dataset = TensorDataset(input_ids, attention_masks, start_positions, end_positions)

# DataLoaders
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Keyword arguments {'question': 'How many total packages in Beijing?'} not recognized.


In [8]:
# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, start_positions, end_positions = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

Epoch 1/3: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]


Epoch 1/3, Average Loss: 4.8054


Epoch 2/3: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


Epoch 2/3, Average Loss: 4.3688


Epoch 3/3: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


Epoch 3/3, Average Loss: 3.8826


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json')