In [None]:
# Install the necessary libraries
!pip install transformers
!pip install datasets

In [51]:
# Import libraries
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Fetch the AG News corpus that the text classifier is trained on
dataset = load_dataset(path="ag_news")

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
# Hold out 20% of the AG News training split; the fixed random_state keeps
# the partition reproducible between runs
full_train_split = dataset["train"]
train_dataset, test_dataset = train_test_split(
    full_train_split,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Maximum sequence length handed to the tokenizer for every example
max_len = 128

# BERT tokenizer belonging to the `bert-base-uncased` checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Training hyperparameters: mini-batch size and optimiser learning rate
batch_size, learning_rate = 32, 2e-5

In [7]:
# Define a custom preprocessing function
def preprocess_text_and_label(text, label):
    """Tokenize one example and pair the encoded tensors with its label.

    Uses the notebook-level ``tokenizer`` and ``max_len``: the text is padded
    or truncated to exactly ``max_len`` tokens.

    Args:
        text: Raw text of a single example.
        label: Class label of the example; returned unchanged.

    Returns:
        dict with keys ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
        outputs with the leading batch dimension removed) and ``'label'``.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # squeeze(0) removes only the batch dimension. A bare squeeze() would also
    # collapse the sequence dimension whenever it has length 1, producing a
    # 0-d tensor that can no longer be batched with the other examples.
    return {
        'input_ids': encoded['input_ids'].squeeze(0),
        'attention_mask': encoded['attention_mask'].squeeze(0),
        'label': label,
    }

In [8]:
# Show which container type the split produced for the training portion
train_dataset_type = type(train_dataset)
print(train_dataset_type)


<class 'dict'>


In [9]:
# Tokenize every training example individually, keeping each text paired
# with its label
train_texts = train_dataset["text"]
train_labels = train_dataset["label"]
train_tokenized_dataset = list(
    map(preprocess_text_and_label, train_texts, train_labels)
)


In [10]:
# Assemble the per-example tensors into batch-first tensors covering the
# whole training set
def _batched(field):
    """Concatenate one tensor field of all tokenized examples along a new batch axis."""
    rows = [example[field].unsqueeze(0) for example in train_tokenized_dataset]
    return torch.cat(rows, dim=0)


input_ids = _batched('input_ids')
attention_mask = _batched('attention_mask')
labels = torch.tensor(
    [example['label'] for example in train_tokenized_dataset],
    dtype=torch.long,
)


In [11]:
# Bundle the training tensors into one dataset ...
tensor_dataset = TensorDataset(input_ids, attention_mask, labels)
# ... and serve it in shuffled mini-batches (only a training loader is built here)
train_dataloader = DataLoader(
    dataset=tensor_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [12]:
# Pre-trained BERT with a sequence-classification head sized for 4 classes
# (adjust `num_labels` to the number of classes in your dataset)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Set up the optimiser
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated. eps=1e-6 and weight_decay=0.0 reproduce the defaults of the
# transformers implementation, so the training hyperparameters are unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)



In [14]:
# Define the device (CPU or GPU)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device
model.to(device)

# Number of full passes over the training data.
# NOTE: this loop has no validation pass and performs no early stopping, so
# the never-read early-stopping bookkeeping variables were dropped.
num_epochs = 6

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Batch-local names on purpose: reusing `input_ids`, `attention_mask`
        # and `labels` here would overwrite the full training tensors that the
        # TensorDataset cell reads, breaking any re-run of that cell.
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # .item() converts the loss to a Python float before accumulating
        total_loss += loss.item()

    # Mean training loss per batch for this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')


Epoch 1/6, Loss: 0.2211
Epoch 2/6, Loss: 0.1307
Epoch 3/6, Loss: 0.0886
Epoch 4/6, Loss: 0.0583
Epoch 5/6, Loss: 0.0388
Epoch 6/6, Loss: 0.0291


In [15]:
# Reload AG News and keep its official test split for evaluation
dataset = load_dataset("ag_news")
test_dataset = dataset["test"]

# Peek at the first five test examples to see how records are structured
for position, example in enumerate(test_dataset, start=1):
    if position > 5:
        break

    print(f"Example {position}:")
    print(example)


Example 1:
{'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.", 'label': 2}
Example 2:
{'text': 'The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\\privately funded suborbital space flight, has officially announced the first\\launch date for its manned rocket.', 'label': 3}
Example 3:
{'text': 'Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.', 'label': 3}
Example 4:
{'text': "Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colo

In [16]:
# Preprocess the text and label for the test_dataset
test_tokenized_dataset = [
    preprocess_text_and_label(example["text"], example["label"])
    for example in test_dataset
]

# Create a DataLoader for the test dataset (each batch is a dict keyed like
# the preprocessing output: 'input_ids', 'attention_mask', 'label')
test_dataloader = DataLoader(test_tokenized_dataset, batch_size=batch_size)

# Evaluate the model on the test dataset
model.eval()
all_predictions = []
all_labels = []

# No gradients are needed for evaluation
with torch.no_grad():
    for batch in test_dataloader:
        # Batch-local names on purpose: assigning to `input_ids`,
        # `attention_mask` or `labels` here would overwrite the notebook-level
        # training tensors of the same names.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)

        logits = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
        ).logits
        # Predicted class = index of the largest logit
        batch_predictions = torch.argmax(logits, dim=-1)

        # Collect predictions and ground-truth labels as plain Python ints
        all_predictions.extend(batch_predictions.tolist())
        all_labels.extend(batch['label'].tolist())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.9438


In [17]:
# Persist the fine-tuned classifier under the Drive folder below
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive — confirm
drive_model_dir = '/content/drive/MyDrive/text_classification_model'
model.save_pretrained(drive_model_dir)


In [18]:
# Restore the classifier from the checkpoint stored on Drive
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/text_classification_model'
)


In [19]:
# Also write the classifier to a folder in the current working directory
local_model_dir = './text_classification_model'
model.save_pretrained(local_model_dir)


In [20]:
# Inference and predictions
# Load the trained model for inference. A freshly loaded model is not on the
# GPU, while the inference cells move their inputs to `device`; place the
# model on that same device and put it in evaluation mode right away.
model = (
    BertForSequenceClassification
    .from_pretrained('./text_classification_model')
    .to(device)
    .eval()
)

In [24]:
# Analyze sentiment
# NOTE(review): `model` here is the classifier fine-tuned above on AG News
# with num_labels=4. Negative/Neutral/Positive are not classes it was trained
# on, and class indices 2 and 3 both end up in the "Positive" branch, so the
# printed label is not a real sentiment prediction. A model trained on
# sentiment data is needed for that.
text_to_analyze = "This is a fantastic product!"
inputs = tokenizer(
    text_to_analyze,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
inputs = {key: val.to(device) for key, val in inputs.items()}

# The model must live on the same device as its inputs; it was reloaded from
# disk in an earlier cell and never moved, which fails when `device` is a GPU.
model.to(device)
model.eval()

with torch.no_grad():
    logits = model(**inputs).logits
predicted_sentiment = torch.argmax(logits, dim=-1).item()

if predicted_sentiment == 0:
    sentiment_label = "Negative"
elif predicted_sentiment == 1:
    sentiment_label = "Neutral"
else:
    sentiment_label = "Positive"

print(f"Predicted Sentiment: {sentiment_label}")


Predicted Sentiment: Positive


In [31]:
# Human-readable names for the AG News label ids 0-3, in label order.
# `class_labels` was never defined in this notebook, so the lookup below
# raised NameError on a fresh kernel.
class_labels = ["World", "Sports", "Business", "Sci/Tech"]

# Tokenize the text for classification
text_to_classify = "The world cup has been facinating."
inputs = tokenizer(
    text_to_classify,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Make sure the model sits on the same device as the inputs and is in
# evaluation mode before running it
model.to(device)
model.eval()

# Perform classification
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the largest logit -> class name
predicted_label = torch.argmax(logits, dim=-1).item()
predicted_class = class_labels[predicted_label]

print(f"Predicted Class: {predicted_class}")


Predicted Class: Sports


In [48]:
# Text generation with a pre-trained GPT-2 checkpoint.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Encode the prompt and keep the resulting tensor on the CPU
prompt = "Hello, world"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cpu')

# Attend to every prompt token, and pad with the end-of-sequence token id
attention_mask = torch.ones_like(input_ids)
pad_token_id = tokenizer.eos_token_id

# Options forwarded to `generate`
generation_options = dict(
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    attention_mask=attention_mask,
    pad_token_id=pad_token_id,
)

# Generate a continuation without tracking gradients
with torch.no_grad():
    generated = model.generate(input_ids, **generation_options)

# Turn the first (only) returned sequence back into text
generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(f"Generated Text:\n{generated_text}")

Generated Text:
Hello, world.

I'm sorry, but I'm not sure what to do. I don't know what I should do, and I can't do anything. But I know that I have to. And I want to be there for you. So I'll be here. You'll see. It's not like I've been here for a long time. Maybe I was here before. Or maybe I just didn't want you to know. Either way, I think I need to


In [50]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Summarisation with the pre-trained `facebook/bart-large-cnn` checkpoint.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Passage to condense
long_text = """
Natural Language Processing (NLP) models are a subset of artificial intelligence (AI) that focuses on the interaction between computers and human language. These models aim to enable machines to understand, interpret, and generate human language in a way that is both meaningful and contextually relevant. NLP has a wide range of applications, from sentiment analysis and text classification to machine translation and chatbots. One of the key breakthroughs in NLP is the development of transformer-based models, such as BERT and GPT, which have achieved remarkable results in various language understanding and generation tasks. These models have opened up new possibilities in language-related AI applications, making NLP a rapidly evolving field with exciting opportunities for research and innovation.
"""

# Encode the passage, truncating anything beyond 1024 tokens
input_ids = tokenizer.encode(
    long_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Beam-search settings for the summary (50 to 150 tokens long)
summary_options = {
    "max_length": 150,
    "min_length": 50,
    "length_penalty": 2.0,
    "num_beams": 4,
    "early_stopping": True,
}

# Produce the summary without tracking gradients
with torch.no_grad():
    summary_ids = model.generate(input_ids, **summary_options)

# Convert the first returned sequence back to text and show it
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(f"Generated Summary:\n{summary}")


Generated Summary:
Natural Language Processing (NLP) models aim to enable machines to understand, interpret, and generate human language. NLP has a wide range of applications, from sentiment analysis and text classification to machine translation and chatbots. One of the key breakthroughs in NLP is the development of transformer-based models.
