In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from datasets import Dataset
import torch

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
df = pd.read_csv('Arabic Poem Comprehensive Dataset (APCD).csv', encoding_errors="ignore")
# Tokenize the text
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix")
model = AutoModelForCausalLM.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix")
model_fine_tuned = AutoModelForCausalLM.from_pretrained("fine-tuned-bert-base-arabic-camelbert-mix", local_files_only=True)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [3]:
model_fine_tuned

BertLMHeadModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [5]:
model_fine_tuned.push_to_hub('selma_model_20k_vv')

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ellzo/selma_model_20k_vv/commit/d24d34b51358613a86783fa6e3b6e969601d643a', commit_message='Upload model', commit_description='', oid='d24d34b51358613a86783fa6e3b6e969601d643a', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
for col in df.columns:
  if col!="البيت":
    df.drop(col,axis=1,inplace=True)

In [6]:
df.dropna()
df['البيت']=df['البيت'].astype(str).values
df

Unnamed: 0,البيت
0,خَليلَيَّ لا تَستَعجِلا أَن تَزَوَّدا وَأَن...
1,فَما لَبَثٌ يَوماً بِسابِقٍ مَغنَمٍ وَلا سُ...
2,وَإِن تُنظِراني اليَومَ أَقضِ لُبانَةً وَتَ...
3,لَعَمرُكَ ما نَفسٌ بِجِدٍ رَشيدَةٍ تُؤامِرُ...
4,وَإِن ظَهَرَت مِنهُ قَوارِصُ جَمَّةٌ وَأَفر...
...,...
1831765,هي أغلى ما أنشأ اللَّه في الدنيا وأحلى قصيد...
1831766,هي أغرودة الأغاريد تنساب كحلم يغشى الجفون ا...
1831767,هي شلال بهجة وبهاء يتداعى وجداً ويخفق حسنا
1831768,هي حلم الهوى ومنطلقي الباقي يدك الحدود سجنا...


In [15]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df.sample(100, random_state=42), test_size=0.2, random_state=42)
df_train=Dataset.from_pandas(df_train)
df_test=Dataset.from_pandas(df_test)

print("Train set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Train set shape: (80, 2)
Test set shape: (20, 2)


In [16]:
def tokenize_function(examples):
    return tokenizer(examples["البيت"], padding="max_length", truncation=True, max_length=200)

tokenized_datasets_train = df_train.map(tokenize_function, batched=True)
tokenized_datasets_test = df_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [17]:
tokenized_datasets_train

Dataset({
    features: ['البيت', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 80
})

### Fine-tuning CAMeL-Lab/bert-base-arabic-camelbert-mix Model

In [21]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="selma_model_20k", push_to_hub=True, num_train_epochs=1, evaluation_strategy="epoch")

In [22]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False#, mlm_probability=0.15
)

In [23]:
trainer = Trainer(
    model=model_fine_tuned,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    data_collator=data_collator,
)

In [11]:
#torch.set_num_threads(10)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1e-05


TrainOutput(global_step=10, training_loss=0.0002634365577250719, metrics={'train_runtime': 53.6731, 'train_samples_per_second': 1.491, 'train_steps_per_second': 0.186, 'total_flos': 8225100288000.0, 'train_loss': 0.0002634365577250719, 'epoch': 1.0})

In [None]:
trainer.save_model('model_selma_torch_20k_gg.model')

In [None]:
trainer.push_to_hub("selma_model_20k_gg")

In [34]:
tokenizer.push_to_hub("selma_tokenizer_20k")

CommitInfo(commit_url='https://huggingface.co/ellzo/selma_tokenizer_20k/commit/cbe1e3b0624d7f5711417bbe773196848a4df33b', commit_message='Upload tokenizer', commit_description='', oid='cbe1e3b0624d7f5711417bbe773196848a4df33b', pr_url=None, pr_revision=None, pr_num=None)