# Run this cell once then delete it

### For google colab

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

!pip install camel-tools
!mkdir /content/drive/MyDrive/camel_tools

os.environ['CAMELTOOLS_DATA'] = '/content/drive/MyDrive/camel_tools'
!export | camel_data -i all

### For a local environment

In [None]:
import os

!pip install camel-tools
!mkdir /camel_tools

os.environ['CAMELTOOLS_DATA'] = '/camel_tools'
!export | camel_data -i all

# Must run before starting

### For google colab

In [1]:
!pip install camel-tools

from google.colab import drive
drive.mount('/content/drive')

import os
os.environ['CAMELTOOLS_DATA'] = '/content/drive/MyDrive/camel_tools'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting camel-tools
  Downloading camel_tools-1.5.2-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.3/124.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting docopt (from camel-tools)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill (from camel-tools)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=3.0.2 (from camel-tools)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
Collecting emoji (from camel-tools)
  Downloading emoji-2.5.1.tar.gz (356 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### For a local environment

In [None]:
!pip install camel-tools

import os
os.environ['CAMELTOOLS_DATA'] = '/camel_tools'

# Data Reading

In [2]:
from json import loads

In [3]:
# Load the labeled validation set: a JSON Lines file in which every record
# provides a 'paragraph' and its 'summary'. The two lists stay index-aligned.
paragraphs = []
summaries = []

# The encoding is given explicitly because open() otherwise falls back to a
# platform-dependent default, which can fail on or garble the Arabic text
# on systems whose locale is not UTF-8.
with open('/content/drive/MyDrive/AIC-ICMTC/dataset/labeled_validation_dataset.jsonl', encoding='utf-8') as file:
  for line in file:
    # Tolerate blank lines (e.g. a trailing newline at end of file), which
    # would otherwise make loads() raise.
    if not line.strip():
      continue
    entry = loads(line)
    paragraphs.append(entry['paragraph'])
    summaries.append(entry['summary'])

# Preprocessing

### Unicode Normalization, Orthographic Normalization, and Dediacritization

In [4]:
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar

In [5]:
# Clean every paragraph in place. The steps run in this order:
# unicode -> alef -> alef maksura -> teh marbuta -> diacritic removal.
for i, raw_text in enumerate(paragraphs):
  text = normalize_unicode(raw_text)
  text = normalize_alef_ar(text)
  text = normalize_alef_maksura_ar(text)
  text = normalize_teh_marbuta_ar(text)
  paragraphs[i] = dediac_ar(text)

### Word Tokenization

In [6]:
from camel_tools.tokenizers.word import simple_word_tokenize

In [7]:
# One token list per paragraph, in the same order as `paragraphs`.
tokenized = list(map(simple_word_tokenize, paragraphs))

### POS Tagging

In [8]:
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tagger.default import DefaultTagger

In [9]:
# POS tag values whose tokens are dropped by the filtering step below.
pos_tags_to_delete = [
    'punc',
    'noun_prop',
    'prep',
    'digit',
    'foreign',
    'part_det',
    'pron_dem',
    'adj',
    'part_verb',
    'pron',
    'abbrev',
    'conj',
    'aux',
    'adp',
    'adv',
    'det',
    'conj_sub',
    'pron_rel',
    'adv_rel',
    'adv_interrog',
]

In [10]:
# Build the disambiguator via its pretrained() constructor (presumably loads
# the default model from the CAMELTOOLS_DATA directory; confirm), then wrap
# it in a tagger configured for the 'pos' feature.
mle = MLEDisambiguator.pretrained()
tagger = DefaultTagger(mle, 'pos')

In [11]:
# For each tokenized paragraph, keep only the tokens whose POS tag is not
# listed in pos_tags_to_delete. `filtered` stays index-aligned with `tokenized`.
filtered = []

for tokens in tokenized:
  pos_tags = tagger.tag(tokens)
  kept = [
      token
      for j, token in enumerate(tokens)
      if pos_tags[j] not in pos_tags_to_delete
  ]
  filtered.append(kept)

In [12]:
# Sanity check: show the tokens that survived POS filtering for the first paragraph.
filtered[0]

['وتحت',
 'عنوان',
 'الكارثه',
 'التحدي',
 'يبدا',
 'الكاتب',
 'عرض',
 'الكتاب',
 'يوضح',
 'كانت',
 'فرحه',
 'بنصرها',
 'عام',
 'ارتاحت',
 'لاعتقادها',
 'وقتا',
 'جدا',
 'قبل',
 'يفيق',
 'العرب',
 'صدمه',
 'القوات',
 'فاجاتها',
 'بعد',
 'شهر',
 'واحد',
 'نهايه',
 'حرب',
 'بهجوم',
 'مواقعها',
 'وكان',
 'اعلانا',
 'بدايه',
 'حرب',
 'نوع',
 'حرب',
 'الاستنزاف',
 'استمرت',
 'تم',
 'وقف',
 'اطلاق',
 'النار',
 'بين',
 'الطرفين',
 'وفاه',
 'وتولي',
 'حكم',
 'واستعداده',
 'للحرب',
 'ويتعرض',
 'الكاتب',
 'ايضا',
 'وبصوره',
 'قبل',
 'ينتقل',
 'الكتاب',
 'حرب',
 'يعرض',
 'الخطط',
 'والاستعدادات',
 'الاستعدادات',
 'يبدا',
 'بعرض',
 'وقائع',
 'الحرب',
 'بدايه',
 'الضربه',
 'وانهيار',
 'خط',
 'واختراقه',
 'ويتوقف',
 'الكاتب',
 'عند',
 'يوم',
 'ويقول',
 'اليوم',
 'كان',
 'اسوا',
 'هزيمه',
 'تاريخ',
 'الجيش',
 'ينتقل',
 'المؤلف',
 'الجبهه',
 'يعود',
 'يوميات',
 'الحرب',
 'يعرض',
 'للثغره',
 'عرف',
 'بعمليه',
 'المزرعه',
 'يوم',
 'والمساعدات',
 'بدايه',
 'الضغوط',
 'الرئيس',
 'ينتقل',
 'الكاتب',
 'للاح

### Transliteration

In [13]:
from camel_tools.utils.charmap import CharMapper

In [14]:
# Load the built-in 'ar2bw' character mapper (Arabic script to Buckwalter
# transliteration, going by the CAMeL Tools mapper naming).
ar2bw = CharMapper.builtin_mapper('ar2bw')

In [15]:
# Join each filtered token list with single spaces, then pass the resulting
# string through the ar2bw mapper; one output string per paragraph.
transliterated = list(map(ar2bw, map(' '.join, filtered)))

In [16]:
# Sanity check: show the transliterated text of the first paragraph.
transliterated[0]

"wtHt EnwAn AlkArvh AltHdy ybdA AlkAtb ErD AlktAb ywDH kAnt frHh bnSrhA EAm ArtAHt lAEtqAdhA wqtA jdA qbl yfyq AlErb Sdmh AlqwAt fAjAthA bEd $hr wAHd nhAyh Hrb bhjwm mwAqEhA wkAn AElAnA bdAyh Hrb nwE Hrb AlAstnzAf Astmrt tm wqf ATlAq AlnAr byn AlTrfyn wfAh wtwly Hkm wAstEdAdh llHrb wytErD AlkAtb AyDA wbSwrh qbl yntql AlktAb Hrb yErD AlxTT wAlAstEdAdAt AlAstEdAdAt ybdA bErD wqA}E AlHrb bdAyh AlDrbh wAnhyAr xT wAxtrAqh wytwqf AlkAtb End ywm wyqwl Alywm kAn AswA hzymh tAryx Aljy$ yntql Alm&lf Aljbhh yEwd ywmyAt AlHrb yErD llvgrh Erf bEmlyh AlmzrEh ywm wAlmsAEdAt bdAyh AlDgwT Alr}ys yntql AlkAtb llAHdAv jrt wAEfA' Alfryq mnSbh kr}ys lArkAn AlqwAt wtwly Alfryq bdlA AlAtjAh AlmwAfqh Tlb wqf ATlAq AlnAr wAlxlAf b$An AlAmr bdAyh Alhjwm lqnAh wAlEmlyAt Almlk qrr dxwl AlHrb Dd ywm yErD AlkAtb AlmErkh bAlAstylA' mdynh tTwrAt AlmErkh Hlwl ywm wAlE$ryn kAn AlAsrA}ylywn AsrwA nHw AlAf frd AlqwAt Aglbhm wHdAt AlAmdAd wAltmwyn"