# Mount Google Drive to access the data

In [None]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is mounted; the data
# and model paths used later in this notebook all start with this prefix.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Install dependencies

In [None]:
!pip install arabert
!pip install camel-tools
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
!pip install evaluate
!pip install rouge_score

# Create a Hugging Face dataset

### 1. Import the required modules

In [3]:
from datasets import Dataset
from csv import DictReader
from arabert.preprocess import ArabertPreprocessor
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar

### 2. Define helper functions

In [None]:
# Load the stop-word list once into a set for O(1) membership checks.
# The file is opened with an explicit UTF-8 encoding (it presumably holds
# Arabic text, so relying on the platform default would be fragile) and inside
# a `with` block so the handle is closed as soon as the content is read.
with open('/content/drive/MyDrive/AIC-ICMTC/dataset/arabic_stopwords.txt', encoding='utf-8') as stopwords_file:
  stopwords = set(stopwords_file.read().split())

# AraBERT preprocessor configured for the bert-base-arabertv2 checkpoint.
preprocessor = ArabertPreprocessor('aubmindlab/bert-base-arabertv2')

def filter_stopwords(text):
  """Return `text` with every whitespace-separated token found in `stopwords` removed.

  The surviving tokens are re-joined with single spaces, so the original
  spacing of `text` is not preserved.
  """
  kept_tokens = []
  for token in text.split():
    if token in stopwords:
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)

def preprocess_text(text):
  """Run `text` through the full cleaning pipeline and return the result.

  Steps, applied in this order:
    1. filter_stopwords
    2. preprocessor.preprocess (the AraBERT preprocessor)
    3. normalize_unicode
    4. normalize_alef_ar
    5. normalize_alef_maksura_ar
    6. normalize_teh_marbuta_ar
  """
  cleaned = preprocessor.preprocess(filter_stopwords(text))

  # The camel-tools normalizers are chained; each receives the previous output.
  normalizers = (
      normalize_unicode,
      normalize_alef_ar,
      normalize_alef_maksura_ar,
      normalize_teh_marbuta_ar,
  )
  for normalize in normalizers:
    cleaned = normalize(cleaned)

  return cleaned

### 3. Read the data

In [5]:
# Column-oriented container: one list per field, filled row by row from the CSV.
dataset = {'text': [], 'summary': []}

# encoding='utf-8' makes decoding independent of the platform default, and
# newline='' is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly.
with open('/content/drive/MyDrive/AIC-ICMTC/dataset/ArabicMogalad_Ndeef.csv', encoding='utf-8', newline='') as file:
  reader = DictReader(file)

  for row in reader:
    # Plain indexing is enough here; the row dict is discarded after each iteration.
    dataset['text'].append(row['Text'].strip())
    dataset['summary'].append(row['Summary'].strip())

### 4. Convert to a Hugging Face dataset and split it into train/test sets

In [6]:
# Wrap the column dict in a Dataset and hold out 20% of the rows for evaluation.
# A fixed seed makes the shuffle, and therefore the train/test membership,
# identical on every run so that metrics are comparable between runs.
dataset = Dataset.from_dict(dataset)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the dataset object as a sanity check on the result of the previous cell.
dataset

# Fine-tuning AraGPT2

### 1. Import the required modules

In [8]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import GPT2TokenizerFast
from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel
from transformers import DataCollatorForSeq2Seq
from evaluate import load
from numpy import where
from numpy import count_nonzero
from numpy import mean

### 2. Initialize the model

In [None]:
# Hub identifier shared by the model weights and the tokenizer.
model_name = 'aubmindlab/aragpt2-base'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# The collator's `model` argument expects the model object itself (it is
# inspected for model-specific helpers), not the checkpoint name string.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ROUGE metric used by compute_metrics during evaluation.
rouge = load('rouge')

### 3. Define helper functions

In [10]:
def prepare_for_model(examples, max_input_length=1024, max_target_length=128 * 10):
  """Tokenize a batch of documents and their reference summaries.

  Args:
    examples: batch mapping with 'text' and 'summary' lists of strings.
    max_input_length: token limit for the prompt. Defaults to 1024, the GPT-2
      context size; the previous limit of 10240 allowed sequences longer than
      the model's position embeddings (TODO confirm against
      model.config.n_positions for this checkpoint).
    max_target_length: token limit for the summary.

  Returns:
    The tokenizer output for the prompts, with the summary token ids stored
    under the 'labels' key.
  """
  inputs = ['summarize: ' + doc for doc in examples['text']]
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  labels = tokenizer(text_target=examples['summary'], max_length=max_target_length, truncation=True)

  # NOTE(review): labels are tokenized independently of the prompt, so their
  # length generally differs from input_ids. A causal LM head normally expects
  # labels aligned token-for-token with input_ids — confirm this trains as
  # intended, or build prompt+summary sequences with the prompt masked out.
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

def compute_metrics(eval_pred):
  """Compute ROUGE scores and the mean generated length for an evaluation run.

  Args:
    eval_pred: pair of (predictions, labels) token-id arrays.

  Returns:
    Dict of metric name -> value rounded to 4 decimals, including 'gen_len',
    the mean number of non-padding tokens per prediction.
  """
  predictions, labels = eval_pred
  # Some models return a tuple whose first element holds the token ids.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # -100 marks ignored positions and is not a valid token id; replace it with
  # the pad id before decoding, in the predictions as well as in the labels.
  predictions = where(predictions != -100, predictions, tokenizer.pad_token_id)
  labels = where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # The default ROUGE tokenizer keeps only [a-z0-9] characters, which would
  # strip Arabic text entirely, and its stemmer is English-only. Split on
  # whitespace instead and disable stemming.
  result = rouge.compute(
      predictions=decoded_preds,
      references=decoded_labels,
      use_stemmer=False,
      tokenizer=lambda text: text.split(),
  )

  prediction_lens = [count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result['gen_len'] = mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

### 4. Tokenize the dataset and specify the training arguments

In [None]:
# Apply prepare_for_model to the dataset; batched=True passes batches of
# examples to the function instead of one example at a time.
tokenized_dataset = dataset.map(prepare_for_model, batched=True)

In [15]:
# Hyper-parameters and checkpointing policy, gathered in one place so they are
# easy to scan and tweak before the arguments object is built.
training_kwargs = dict(
    output_dir='/content/drive/MyDrive/AIC-ICMTC/models/AraGPT2',
    overwrite_output_dir=True,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    # Evaluation predictions come from generation, which compute_metrics decodes.
    predict_with_generate=True,
)
training_args = Seq2SeqTrainingArguments(**training_kwargs)

# Everything the trainer needs: model, data splits, batching and metrics.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

### 5. Train the model

In [None]:
# Run fine-tuning with the trainer configured in the previous cell; checkpoints
# are written to the output_dir given in the training arguments.
trainer.train()

# Testing

In [None]:
from transformers import pipeline

In [None]:
# The model is a causal language model (GPT2LMHeadModel), so the matching
# pipeline task is 'text-generation'; 'text2text-generation' is meant for
# encoder-decoder models.
generation_pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer)

# NOTE(review): the prompt is an empty string — put the text to summarize here.
# NOTE(review): top_p presumably only takes effect when sampling is enabled;
# with beam search alone it is likely ignored — confirm.
result = generation_pipeline(
    '',
    pad_token_id=tokenizer.eos_token_id,
    num_beams=10,
    max_length=200,
    top_p=0.9,
    repetition_penalty = 3.0,
    no_repeat_ngram_size = 3
)

In [None]:
# Show the generated text of the first (and only) returned sequence.
result[0]['generated_text']