In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report

In [3]:
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabert", num_labels=9)

Downloading (…)okenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
df = pd.read_csv('/content/drive/MyDrive/FinallyLemmas.csv')


In [7]:
batch_size = 32
max_length = 256
lemmaText_list = df['lemmaText'].tolist()
inputs = tokenizer(lemmaText_list, padding='max_length', truncation=True, return_tensors="pt", max_length=max_length)
label_list = torch.tensor(df['label'].tolist())

In [8]:
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], label_list)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
num_epochs = 3
learning_rate = 2e-5

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}")



Epoch 1/3, Loss: 0.1906
Epoch 2/3, Loss: 0.0635
Epoch 3/3, Loss: 0.0381


In [12]:
model.eval()
test_predictions = []
true_labels = []

with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        _, predicted = torch.max(logits, 1)

        test_predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, test_predictions)
report = classification_report(true_labels, test_predictions)

print(f"Accuracy: {accuracy:.4f}")
print(report)


Accuracy: 0.9932
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6447
           1       1.00      0.99      0.99      6447
           2       1.00      0.98      0.99      6447
           3       1.00      1.00      1.00      6447
           4       1.00      0.99      1.00      6447
           5       0.98      1.00      0.99      6447
           6       1.00      1.00      1.00      6447
           7       0.98      0.99      0.99      6447
           8       0.99      1.00      0.99      6447

    accuracy                           0.99     58023
   macro avg       0.99      0.99      0.99     58023
weighted avg       0.99      0.99      0.99     58023



In [13]:
model.eval()
test_predictions = []
true_labels = []

with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        _, predicted = torch.max(logits, 1)

        test_predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, test_predictions)
report = classification_report(true_labels, test_predictions)

print(f"Accuracy: {accuracy:.4f}")
print(report)


Accuracy: 0.9932
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6447
           1       1.00      0.99      0.99      6447
           2       1.00      0.98      0.99      6447
           3       1.00      1.00      1.00      6447
           4       1.00      0.99      1.00      6447
           5       0.98      1.00      0.99      6447
           6       1.00      1.00      1.00      6447
           7       0.98      0.99      0.99      6447
           8       0.99      1.00      0.99      6447

    accuracy                           0.99     58023
   macro avg       0.99      0.99      0.99     58023
weighted avg       0.99      0.99      0.99     58023



In [14]:
# Specify the directory paths on Google Drive to save the model and tokenizer
model_directory = "/content/drive/MyDrive/model_directory"
tokenizer_directory = "/content/drive/MyDrive/tokenizer_directory"

# Save the model and tokenizer
model.save_pretrained(model_directory)
tokenizer.save_pretrained(tokenizer_directory)

print("Model and tokenizer saved successfully to Google Drive!")


Model and tokenizer saved successfully to Google Drive!


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model and tokenizer from Google Drive
model_directory = "/content/drive/MyDrive/model_directory"
tokenizer_directory = "/content/drive/MyDrive/tokenizer_directory"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_directory)
model = AutoModelForSequenceClassification.from_pretrained(model_directory)

# Text to classify
text_to_classify = "فاز نادي الاتحاد على غريمه نادي الهلال في مباراة يوم امس بدوري المحترفين"
# Tokenize the text
inputs = tokenizer(text_to_classify, padding=True, truncation=True, return_tensors="pt")

# Make sure the model is in evaluation mode
model.eval()

# Perform inference
with torch.no_grad():
    # Forward pass
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
    logits = outputs.logits

    # Get predicted class
    predicted_class = torch.argmax(logits, dim=1).item()

# In a multi-class classification task, you may also want to retrieve class labels
class_names = ['Finance',
'Medical' ,
'Culture',
'Politics',
'Religion' ,
'Tech',
'Sports',
'Diverse',
 'Economy']
# Get the predicted class label
predicted_label = class_names[predicted_class]

print(f"Predicted Class Label: {predicted_label}")


Predicted Class Label: Sports
