In [None]:
import shutil
import time

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Read the dataset CSV from the Colab working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/domain_dataset.csv")


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.2 MB/s[0m eta [36m0:00:0

In [None]:
# Class names; a name's position in this list is its integer class id.
# NOTE(review): "Finance " ends with a space - check that this is intentional and
# consistent with the label text in the CSV.
domains = [
    "Agriculture",
    "Education",
    "Health",
    "Finance ",
    "Sports",
]

In [None]:
# Hold out 20% of the rows, then halve that hold-out into validation and test.
# Both splits use the same fixed random_state so the partition is repeatable.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Load the pretrained RoBERTa tokenizer and a sequence-classification model with
# one output class per entry in `domains`.
checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(domains),
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(domains))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_data(df):
    """Tokenize a split's texts and encode its labels as class indices.

    Args:
        df: DataFrame with a "Text" column (raw strings) and a "labels" column
            holding domain names from the module-level ``domains`` list.

    Returns:
        A ``(tokenized, labels)`` pair: the output of the module-level
        ``tokenizer`` (called with padding and truncation enabled, returning
        PyTorch tensors) and a ``torch.long`` tensor holding, for each row,
        the index of its label in ``domains``.

    Raises:
        ValueError: if a label does not match any entry of ``domains``.
    """
    # Match on whitespace-stripped names. ``domains`` contains "Finance " with a
    # trailing space, so an exact ``domains.index(label)`` lookup fails as soon
    # as the CSV and the list disagree on surrounding whitespace.
    label_to_id = {name.strip(): idx for idx, name in enumerate(domains)}

    # Validate labels before tokenizing so bad data fails fast.
    label_ids = []
    for label in df["labels"]:
        key = str(label).strip()
        if key not in label_to_id:
            raise ValueError(f"{label!r} is not in domains {sorted(label_to_id)}")
        label_ids.append(label_to_id[key])

    tokenized = tokenizer(list(df["Text"]), padding=True, truncation=True, return_tensors="pt")
    labels = torch.tensor(label_ids, dtype=torch.long)
    return tokenized, labels

In [None]:
# Encode the three splits in order: train, validation, test.
(
    (train_encodings, train_labels),
    (val_encodings, val_labels),
    (test_encodings, test_labels),
) = (tokenize_data(split) for split in (train_df, val_df, test_df))

In [None]:
def _as_tensor_dataset(encodings, labels):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)


train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

In [None]:
batch_size = 8

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)

In [None]:
# Training hyperparameters: number of passes over the training set and the
# initial learning rate handed to the optimizer.
num_epochs, learning_rate = 10, 2e-5


In [None]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated.
# eps and weight_decay are set explicitly to the values the transformers
# implementation defaulted to, so the update rule stays close to the original
# (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Linear decay from the initial learning rate to 0 over every optimizer step of
# the run (batches per epoch x epochs), with no warmup phase.
total_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)


In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model and displays its structure.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Fine-tune the model for `num_epochs` passes over `train_loader`, printing
# in-place batch progress plus the wall-clock time and mean loss of each epoch.
for epoch in range(num_epochs):
    start_time = time.time()  # Record the start time for each epoch
    model.train()  # training mode (dropout layers active)
    total_loss = 0.0
    num_batches = len(train_loader)  # loop-invariant, so computed once per epoch

    for batch_idx, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        optimizer.zero_grad()
        # Passing `labels` lets the model output carry the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

        total_loss += loss.item()

        # Calculate training progress as a percentage
        progress = (batch_idx + 1) / num_batches * 100

        # "\r" rewinds to the line start so each update overwrites the previous one.
        print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{num_batches}, Progress: {progress:.2f}%", end="\r")

    epoch_time = time.time() - start_time  # Time taken for the epoch
    # Report the mean batch loss; max() guards against an empty loader.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"\nEpoch {epoch+1}/{num_epochs} completed in {epoch_time:.2f} seconds, average loss: {avg_loss:.4f}")



Epoch 1/10, Batch 26/26, Progress: 100.00%
Epoch 1/10 completed in 17.01 seconds
Epoch 2/10, Batch 26/26, Progress: 100.00%
Epoch 2/10 completed in 17.14 seconds
Epoch 3/10, Batch 26/26, Progress: 100.00%
Epoch 3/10 completed in 17.39 seconds
Epoch 4/10, Batch 26/26, Progress: 100.00%
Epoch 4/10 completed in 17.50 seconds
Epoch 5/10, Batch 26/26, Progress: 100.00%
Epoch 5/10 completed in 17.58 seconds
Epoch 6/10, Batch 26/26, Progress: 100.00%
Epoch 6/10 completed in 17.75 seconds
Epoch 7/10, Batch 26/26, Progress: 100.00%
Epoch 7/10 completed in 17.91 seconds
Epoch 8/10, Batch 26/26, Progress: 100.00%
Epoch 8/10 completed in 18.05 seconds
Epoch 9/10, Batch 26/26, Progress: 100.00%
Epoch 9/10 completed in 18.20 seconds
Epoch 10/10, Batch 26/26, Progress: 100.00%
Epoch 10/10 completed in 18.32 seconds


In [None]:
# Accuracy on the validation split: evaluation mode, no gradient tracking.
model.eval()
correct_preds = total_preds = 0

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class per example.
        predicted_labels = logits.argmax(dim=1)

        correct_preds += (predicted_labels == labels).sum().item()
        total_preds += len(labels)

val_accuracy = correct_preds / total_preds
print("Validation Accuracy: {:.4f}".format(val_accuracy))


Validation Accuracy: 0.9600


In [None]:
# Accuracy on the held-out test split: evaluation mode, no gradient tracking.
model.eval()
correct_preds = total_preds = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class per example.
        predicted_labels = logits.argmax(dim=1)

        correct_preds += (predicted_labels == labels).sum().item()
        total_preds += len(labels)

test_accuracy = correct_preds / total_preds
print("Test Accuracy: {:.4f}".format(test_accuracy))

Test Accuracy: 1.0000


In [None]:
# Preview the first five training rows.
train_df.head(5)

Unnamed: 0,labels,Text
38,Sports,Safin slumps to shock Dubai loss\n\nMarat Safi...
143,Health,Eye conditions are remarkably common. Those wh...
84,Finance,"ALEXANDRIA , Va. , May 16 -- Kenneth Bower of ..."
55,Finance,"The new facility , to be known as Technopolis ..."
218,Agriculture,Telangana Agriculture Minister S Niranjan Redd...


In [None]:
# Sample passage used to try the fine-tuned classifier on unseen text.
# Adjacent literals are concatenated into a single string, one sentence per line.
input_text = (
    "In a recent game, the star athlete and captain of the local football team suffered a significant injury. "
    "The player was rushed to the hospital for immediate medical attention after a collision with an opponent resulted in a suspected ankle fracture. "
    "The team's fans and management are anxiously awaiting updates on the player's condition and recovery timeline, with hopes for a speedy return to the field.."
)


In [None]:
# Tokenize the single sample with the same padding/truncation settings used
# for the dataset splits, returning PyTorch tensors.
input_encodings = tokenizer(
    input_text,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Put the sample tensors and the model on the same device before inference.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
input_encodings = {name: tensor.to(device) for name, tensor in input_encodings.items()}

# .to() and .eval() both return the module, so the cell still displays the model.
model.to(device).eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Forward pass on the sample without building an autograd graph.
with torch.no_grad():
    input_ids, attention_mask = (
        input_encodings["input_ids"],
        input_encodings["attention_mask"],
    )
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

In [None]:
# Index of the highest-scoring class, converted to a plain Python int.
predicted_class_index = logits.argmax(dim=1).item()

In [None]:
# Translate the predicted class index back into its domain name via `domains`.
predicted_label = domains[predicted_class_index]

In [None]:
# Show the predicted domain name for the sample passage.
print("Predicted Label: {}".format(predicted_label))

Predicted Label: Sports


In [None]:
# Write the tokenizer files next to the model weights/config so the
# "roberta_model" directory can later be reloaded on its own with
# from_pretrained (previously only the model was saved there).
# The model is saved last so the cell has no return value to display.
tokenizer.save_pretrained("roberta_model")
model.save_pretrained("roberta_model")

In [None]:
# Also dump the raw parameter tensors (state dict) to a single file.
torch.save(obj=model.state_dict(), f="roberta.bin")

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Saved state dict inside the Colab runtime.
colab_file_path = '/content/roberta.bin'

# Destination directory in Google Drive (Drive must already be mounted).
drive_file_path = '/content/drive/MyDrive'

# Copy the file from Colab to Google Drive.
# shutil.copy raises if the source or destination is missing, whereas a failed
# `!cp` only prints to the cell output and the notebook carries on.
# Because the destination is a directory, the file keeps its base name; the
# returned destination path is displayed as the cell output.
shutil.copy(colab_file_path, drive_file_path)