In [1]:
!pip install transformers==4.15.0 datasets==1.17.0 seqeval==1.2.2 sentencepiece==0.1.96

Collecting transformers==4.15.0
  Using cached transformers-4.15.0-py3-none-any.whl (3.4 MB)
Collecting datasets==1.17.0
  Using cached datasets-1.17.0-py3-none-any.whl (306 kB)
Collecting seqeval==1.2.2
  Using cached seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sentencepiece==0.1.96
  Using cached sentencepiece-0.1.96-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.53.tar.gz (880 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp39-cp39-win_amd64.whl (2.0 MB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
     ------------------------------------ 132.9/132.9 kB 340.6 kB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-win_amd64.whl (30 kB)
Collecting pyarrow!=4.0.0,>=3.

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, Dataset, DatasetDict

import numpy as np
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True ,max_length=512)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(p):
    global model_name, current_epoch

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [custom_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [custom_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    metric_results = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
    return metric_results

def compute_results(trainer, tokenized_ds, metric, custom_labels):
    predictions, labels, _ = trainer.predict(tokenized_ds)
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [custom_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [custom_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return results

# Prepare the Dataset

Next, you are going to prepare the dataset for the fine-tuning phase.

You've to provide a list of all of the NER labels in the `custom_labels` list.




We're following the popular `BIO` format. Where `O` means `not-tagged`, while any tag should represented by two labels, one for the Beggining of the labelling like `B-person`, while the other for tagging any following labelled word like `I-Person`.

Example:
If we have an example:

`I went to United States and Brazil last week`

And we have two tags for `location` and `time`

The tagged example will looks like

I `(O)` went `(O)` to `(O)` United `(B-location)` States `(I-location)` and `(O)` Brazil `(B-location)` last `(B-time)` week `(I-time)`

if we want to represent it in our training data, we'll seperate the texts and tags into different two lists like this

```
train_texts = [
    ['I', 'went', 'to', 'United', 'Stated', 'and', 'Brazil', 'last', 'week'],
    # ['anther', 'example', 'words']
]

train_tags = [
    ['O', 'O', 'O', 'B-location', 'I-location', 'O', 'B-location', 'B-time', 'I-time'],
    # ['O', 'O', 'O']
]

```

In [9]:
# marefa-ner base checkpoint
base_checkpoint = "marefa-nlp/marefa-ner"
task = "ner"
label_all_tokens = True
seed = 101

# where to save the new model and its logs
new_model_path = f"./finetuned-ner"
logs_path = f"./logs"

# seqeval metric
metric = load_metric("seqeval")

## all of the tags in your dataset
custom_labels = ["O", "B_plc", "I_plc", "B_rawy","I_rawy", "B_Crime", "I_Crime", "B_Hell","I_Hell","B_Time", "I_Time", "B_Day", "I_Day", "B_Month", "I_Month", "B_Mon", "I_Mon", "B_matn", "I_matn", "B_fnarmetn", "I_fnarmetn", "B_qpart", "I_qpart", "B_wem", "I_wem", "B_pbuh", "I_pbuh", "B_prophets", "I_prophets", "B_Sect", "I_Sect", "B_cmt", "I_cmt", "B_mlk_clean", "I_mlk_clean"]

In [12]:

import pandas as pd
train = pd.read_csv('/content/train.csv', encoding='utf-8')
test = pd.read_csv('/content/test.csv')

print(train.shape)
print(test.shape)

(9368, 3)
(2342, 4)


In [13]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [14]:
tags = pd.read_csv('tags.csv')
print(tags.shape)
tags.head()

(17, 3)


Unnamed: 0,Name_entity,Meaning,label
0,plc,Names of places,1
1,rawy,Narrator,3
2,Crime,List of Crimes,2
3,Hell,Hell,11
4,Time,Time,0


In [15]:
!pip install camel_tools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting camel_tools
  Downloading camel_tools-1.4.1-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting camel-kenlm
  Downloading camel-kenlm-2021.12.27.tar.gz (418 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.2/418.2 KB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: camel-kenlm, docopt, emoji
  Building wheel for cam

In [16]:
import numpy as np
from camel_tools.utils.normalize import normalize_teh_marbuta_ar, normalize_alef_ar
import re

In [17]:
tags

Unnamed: 0,Name_entity,Meaning,label
0,plc,Names of places,1
1,rawy,Narrator,3
2,Crime,List of Crimes,2
3,Hell,Hell,11
4,Time,Time,0
5,Day,Day,9
6,Month,Months,10
7,Mon,Money,6
8,matn,The real content of the hadith (المتن,5
9,fnarmetn,First narrator,4


In [18]:
normalized_train = pd.DataFrame()
normalized_train['normalized_text'] = train['hadith_text'].apply(normalize_teh_marbuta_ar).apply(normalize_alef_ar)
normalized_train['normalized_subtext'] = train['subtext'].apply(normalize_teh_marbuta_ar).apply(normalize_alef_ar)
normalized_train['Named_entity'] = train['Named_entity']
normalized_train.head(7)

Unnamed: 0,normalized_text,normalized_subtext,Named_entity
0,حدثنا محمد بن المثنى ، حدثنا اسحاق بن يوسف ، ح...,الظهر,0
1,حدثنا ابو اليمان ، اخبرنا شعيب ، عن الزهري ، ق...,والمروه,1
2,حدثنا عبد الله بن عبد الوهاب ، حدثنا خالد بن ا...,عشاء,0
3,حدثنا موسى بن اسماعيل ، حدثنا جرير بن حازم ، ح...,الربا,2
4,حدثنا عبد الله بن يوسف ، اخبرنا مالك ، عن اسحا...,مالك,3
5,حدثنا محمد بن المثنى ، حدثنا خالد بن الحارث ، ...,محمد بن المثنى,3
6,حدثنا عبدان ، قال اخبرني ابي ، عن شعبه ، عن قت...,يزيد بن زريع,3


In [19]:
normalized_test = pd.DataFrame()
normalized_test['IDs'] = test['IDs']
normalized_test['normalized_text'] = test['hadith_text'].apply(normalize_teh_marbuta_ar).apply(normalize_alef_ar)
normalized_test['normalized_subtext'] = test['subtext'].apply(normalize_teh_marbuta_ar).apply(normalize_alef_ar)
normalized_test.head(7)

Unnamed: 0,IDs,normalized_text,normalized_subtext
0,0,حدثنا خلاد بن يحيى ، حدثنا نافع بن عمر ، عن اب...,نافع بن عمر
1,1,حدثنا عمر بن حفص ، حدثنا ابي ، حدثنا الاعمش ، ...,صفيه
2,2,حدثنا الصلت بن محمد ، حدثنا عبد الواحد ، حدثنا...,ابن عباس
3,3,حدثنا عبد الله بن يوسف ، اخبرنا مالك ، عن نافع...,نافع
4,4,حدثني عبد الله بن محمد ، حدثنا اسحاق الازرق ، ...,الظهر
5,5,حدثنا حفص بن عمر ، حدثنا شعبه ، قال اخبرني عبد...,قزعه
6,6,حدثنا مسدد ، حدثنا يزيد بن زريع ، قال حدثني ال...,ابي عثمان


In [20]:
import string

clean = lambda txt: txt.translate(str.maketrans('', '', '؟ً؟،'+string.punctuation))

normalized_train['normalized_text'][0]

cleaned_train = pd.DataFrame()
cleaned_train['cleaned_text'] = normalized_train['normalized_text'].apply(clean)
cleaned_train['cleaned_subtext'] = normalized_train['normalized_subtext'].apply(clean)
cleaned_train['Named_entity'] = normalized_train['Named_entity']

cleaned_train['cleaned_text'][0]

cleaned_test = pd.DataFrame()
cleaned_test['IDs'] = normalized_test['IDs']
cleaned_test['cleaned_text'] = normalized_test['normalized_text'].apply(clean)
cleaned_test['cleaned_subtext'] = normalized_test['normalized_subtext'].apply(clean)

txts = cleaned_train['cleaned_text'].drop_duplicates()
len(txts)

tokens = []

for txt in txts:
  tokens.extend(txt.split())
vocab = set(tokens)
len(vocab)

token2id = {token:id for id, token in enumerate(vocab, 2)}
# token2id['<UNK>']=1
# token2id['<UNK>']

count=0
for k, v in token2id.items():
  print(k, '\t', v)
  count+=1
  if count==15: break

id2token = {id:token for token, id in token2id.items()}
len(id2token)==len(token2id)

count=0
for k, v in id2token.items():
  print(k, '\t', v)
  count+=1
  if count==15: break

def tokenize(sent, unk=1):
  tokenized_sent = []
  for word in sent.split():
    tokenized_sent.append(token2id.get(word, unk))
  return tokenized_sent

tokenized_train = cleaned_train['cleaned_text'].apply(tokenize).drop_duplicates()
len(tokenized_train)

len(cleaned_test)

tokenized_train = cleaned_test['cleaned_text'].apply(tokenize).drop_duplicates()
len(cleaned_test)

tag2id = {tag:id for id, tag in zip(tags['Name_entity'].values, tags['label'].values)}
tag2id

عاثت 	 2
جزور 	 3
معتكف 	 4
اجري 	 5
فافزع 	 6
بالارض 	 7
بناءه 	 8
بكار 	 9
محمودا 	 10
ثلاثه 	 11
لصبح 	 12
ثوبه 	 13
يسالوني 	 14
نجيح 	 15
ساقطه 	 16
2 	 عاثت
3 	 جزور
4 	 معتكف
5 	 اجري
6 	 فافزع
7 	 بالارض
8 	 بناءه
9 	 بكار
10 	 محمودا
11 	 ثلاثه
12 	 لصبح
13 	 ثوبه
14 	 يسالوني
15 	 نجيح
16 	 ساقطه


{1: 'plc',
 3: 'rawy',
 2: 'Crime',
 11: 'Hell',
 0: 'Time',
 9: 'Day',
 10: 'Month',
 6: 'Mon',
 5: 'matn',
 4: 'fnarmetn',
 12: 'qpart',
 7: 'wem',
 8: 'pbuh',
 14: 'prophets',
 16: 'Sect',
 13: 'cmt',
 15: 'mlk_clean'}

In [21]:
txts = cleaned_train['cleaned_text'].drop_duplicates().reset_index(drop=True)
len(txts)

def find_indices(list_to_check, item_to_find):
    indices = []
    for idx, value in enumerate(list_to_check):
        if value == item_to_find:
            indices.append(idx)
    return indices

a_list = [1,2,3,4,1,2,1,2,3,4]
find_indices(a_list, 1)

data = []

for d in range(len(txts)):
  hadith_df = cleaned_train[cleaned_train['cleaned_text']==txts[d]]
  hadith, sub, ne = hadith_df['cleaned_text'].values[0], hadith_df['cleaned_subtext'].values, hadith_df['Named_entity'].values
  data.append([hadith, sub, ne])

len(data)

labels = []

for sen, sub, ne in data:
  sub_ne = list(zip(sub, ne))
  tokens = sen.split()
  label = np.zeros(len(tokens))
  for toks, tag_id in sub_ne:
    for tok in toks.split():
      ids = find_indices(tokens, tok)
      label[ids] = tag_id
  labels.append(label)

len(labels)

hadith = data[0][0]
subtext = data[0][1]
named_entity = data[0][2]
label = labels[0]

print(f'hadith:\n\t{hadith}', f'subtext:\n\t{subtext}', f'named_entity:\n\t{named_entity}', f'label:\n\t{label}', sep='\n\n')

def encoding_text(hadith , sub_text , sub_lable):
    pattern = []
    idx = 0 
    while idx < len(hadith):
      founded = False
      for k, v in enumerate(sub_text):
        if hadith[idx]==v[0]:
          founded = True
          L= [f"B_{tag2id[sub_lable[k]]}"]
          L.extend([f"I_{tag2id[sub_lable[k]]}"]*(len(v)-1))
          pattern.extend(L)
          idx+=len(v)
          break

      if not founded:
        pattern.extend('O')
        idx+=1 
        
    return pattern

def extend_sub(sub):
    sub_tokens = [s.split() for s in sub]
    return sub_tokens

def sub_mapping(sub_tokens,ne):
    sub_lable = []
    sub_text = []
    for idx, sub_token in enumerate(sub_tokens):
        sub_text.append(sub_token)
        sub_lable.append(ne[idx])
    return sub_text,sub_lable

sub_tokens = extend_sub(data[1][1])
sub_tokens

sub_text,sub_lable = sub_mapping(sub_tokens,data[1][2])
print(sub_text,sub_lable)

encoding_text(data[1][0].split() , sub_text , sub_lable )

data[1][0]

# train and dev text samples
train_hadith = []
# train_patten_tag
train_tags = []

for item in range(len(data)):
    sub_text,sub_lable = sub_mapping(extend_sub(data[item][1]),data[item][2])
    pattern = encoding_text(data[item][0].split() ,sub_text,sub_lable)
    train_hadith.append(data[item][0].split())
    train_tags.append(pattern)


hadith:
	حدثنا محمد بن المثنى  حدثنا اسحاق بن يوسف  حدثنا سفيان الثوري  عن عبد العزيز بن رفيع  قال سالت انس بن مالك  اخبرني بشيء عقلته عن النبي صلى الله عليه وسلم اين صلى الظهر يوم الترويه قال بمنى  قلت فاين صلى العصر يوم النفر قال بالابطح  افعل كما يفعل امراؤك

subtext:
	['الظهر' 'انس بن مالك' 'محمد بن المثنى' 'العصر' 'انس بن مالك' 'بالابطح'
 'اين صلى الظهر يوم الترويه قال بمنى  قلت فاين صلى العصر يوم النفر قال بالابطح  افعل كما يفعل امراؤك'
 'سفيان الثوري' 'اسحاق بن يوسف']

named_entity:
	[0 3 3 0 4 1 5 3 3]

label:
	[0. 3. 3. 3. 0. 3. 3. 3. 0. 3. 3. 0. 0. 0. 3. 0. 5. 0. 4. 3. 4. 0. 0. 0.
 0. 0. 5. 0. 0. 0. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.
 5.]
[['والمروه'], ['الصفا'], ['عائشه'], ['بالبيت'], ['بالصفا']] [1, 1, 7, 1, 1]


In [22]:
len(train_hadith)

1625

In [23]:
len(train_tags)

1625

In [24]:
train_hadith[0]

['حدثنا',
 'محمد',
 'بن',
 'المثنى',
 'حدثنا',
 'اسحاق',
 'بن',
 'يوسف',
 'حدثنا',
 'سفيان',
 'الثوري',
 'عن',
 'عبد',
 'العزيز',
 'بن',
 'رفيع',
 'قال',
 'سالت',
 'انس',
 'بن',
 'مالك',
 'اخبرني',
 'بشيء',
 'عقلته',
 'عن',
 'النبي',
 'صلى',
 'الله',
 'عليه',
 'وسلم',
 'اين',
 'صلى',
 'الظهر',
 'يوم',
 'الترويه',
 'قال',
 'بمنى',
 'قلت',
 'فاين',
 'صلى',
 'العصر',
 'يوم',
 'النفر',
 'قال',
 'بالابطح',
 'افعل',
 'كما',
 'يفعل',
 'امراؤك']

In [25]:
train_texts = train_hadith[:int(0.8*(len(train_hadith)))]
dev_texts = train_hadith[int(0.8*(len(train_hadith))):]

In [26]:
train_tag = train_tags[:int(0.8*(len(train_tags)))]
dev_tag = train_tags[int(0.8*(len(train_tags))):]

In [27]:
# train and dev text samples
# train_texts = [
#     ['حدثنا', ' محمد ', 'بن', ' المثنى ', '،', ' حدثنا', ' إسحاق', ' بن', ' يوسف ', '،', ' حدثنا', ' سفيان', ' الثوري ', '، ', 'عن ', 'عبد', ' العزيز', ' بن', ' رفيع', ' ، ', 'قال ', 'سألت ', 'أنس ', 'بن', ' مالك ', '، ', 'أخبرني ', '   ', 'عقلته', '، ', 'عن ', 'النبي ', 'صلى ', 'الله ', 'عليه ', 'وسلم', '، ', 'أين', ' صلى', ' الظهر', ' يوم ', 'التروية', '؟', '،', ' قال ', 'بمنى ', '،', ' قلت ', 'فأين', ' صلى ', 'العصر', ' يوم', ' النفر', '؟', '،', ' قال ', 'بالأبطح ', '، ', 'افعل', ' كما', ' يفعل ', 'أمراؤك'],
#    ['برايان', 'ميلر', 'هو', 'سياسي', 'أسترالي', '،','ولد','في','30','يناير','1921','في','أستراليا','6','يونيو','2014','.'],
#    ['وقبل', 'الدخول', 'في', 'تفاصيل', 'معركتيه', 'الأخيرتين', '–', 'شبه', 'المتزامنتين', '–', 'مع', 'جون', 'برينان', 'مدير', 'وكالة', 'الاستخبارات', 'المركزية', '(CIA', ')', 'والموظفة', 'السابقة', 'لدى', 'البيت', 'الأبيض', 'وحملته', 'الانتخابية', 'الرئاسية', 'أوماروسا', 'مانيغولت', 'نيومان', 'ترامب', 'في', 'تنفيذ', 'حملته', 'الانتخابية', 'ومن', 'ثم', 'أجندته', 'الرئاسية', '.'],    
# ]

# dev_texts = [
#     ['ماساكي', 'فوجيتا', '(بالكانا:ふじた', 'まさあき)', 'هو', 'لاعب', 'كرة', 'قدم', 'و', 'سياسي', 'ياباني', '،', 'ولد', 'في', '3', 'يناير', '1922', 'في', 'اليابان', '27', 'مايو', '1996', '.'],
#     ['توماس', 'مور', 'هو', 'نقابي', 'و', 'سياسي', 'أسترالي', '،', 'ولد', 'في', '14', 'فبراير', '1881', 'في', 'أستراليا', '،', 'وتوفي', 'في', '13', 'يناير', '1961', 'أستراليا', '.'],
# ]


In [28]:
# train and dev tags
# train_tags = [
#     ['O','B-person','I-person','I-person','O','O','B-person','I-person','I-person','O','O','B-person','I-person','O','O','B-First_narrator','I-First_narrator','I-First_narrator','I-First_narrator'],
#     ['B-person','I-person','O','B-job','I-job','O','O','O','B-time','I-time','I-time','O','B-location','I-location','I-location','I-location','O'],
#     ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-person', 'I-person', 'B-job', 'B-organization', 'I-organization', 'I-organization', 'I-organization', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'O', 'O', 'O', 'B-person', 'I-person', 'I-person', 'I-person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
# ]

# dev_tags = [
#     ['B-person', 'I-person', 'O', 'O', 'O', 'B-job', 'I-job', 'I-job', 'O', 'B-job', 'B-nationality', 'O', 'O', 'O', 'B-time', 'I-time', 'I-time', 'O', 'B-location', 'B-time', 'I-time', 'I-time', 'O'],
#     ['B-person', 'I-person', 'O', 'B-job', 'O', 'B-job', 'B-nationality', 'O', 'O', 'O', 'B-time', 'I-time', 'I-time', 'O', 'B-location', 'O', 'O', 'O', 'B-time', 'I-time', 'I-time', 'B-location', 'O'],
# ]

In [29]:
train_texts[0]

['حدثنا',
 'محمد',
 'بن',
 'المثنى',
 'حدثنا',
 'اسحاق',
 'بن',
 'يوسف',
 'حدثنا',
 'سفيان',
 'الثوري',
 'عن',
 'عبد',
 'العزيز',
 'بن',
 'رفيع',
 'قال',
 'سالت',
 'انس',
 'بن',
 'مالك',
 'اخبرني',
 'بشيء',
 'عقلته',
 'عن',
 'النبي',
 'صلى',
 'الله',
 'عليه',
 'وسلم',
 'اين',
 'صلى',
 'الظهر',
 'يوم',
 'الترويه',
 'قال',
 'بمنى',
 'قلت',
 'فاين',
 'صلى',
 'العصر',
 'يوم',
 'النفر',
 'قال',
 'بالابطح',
 'افعل',
 'كما',
 'يفعل',
 'امراؤك']

In [30]:
train_tag[1]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_wem',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_plc',
 'B_plc',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_plc',
 'B_plc',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_plc',
 'B_plc',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_plc',
 'B_plc',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B_plc',
 'B_plc',
 'O',
 'O',
 'O',
 'O',
 'B_wem',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O'

In [31]:
## convert to Dataset
datasets = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": train_texts,
        "ner_tags": [ [ custom_labels.index(r) for r in rec ] for rec in train_tag ]
    }),
    "dev": Dataset.from_dict({
        "tokens": dev_texts,
        "ner_tags": [ [ custom_labels.index(r) for r in rec ] for rec in dev_tag ]
    }),
})

In [32]:
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1300
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 325
    })
})

## Fine-Tuning

In [33]:
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(base_checkpoint, num_labels=len(custom_labels), ignore_mismatched_sizes=True)

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.08G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at marefa-nlp/marefa-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([19, 1024]) in the checkpoint and torch.Size([35, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([19]) in the checkpoint and torch.Size([35]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# prepare dataset
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)



  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
# configure your fine-tuning process

args = TrainingArguments(
    new_model_path,
    logging_dir=logs_path,
    evaluation_strategy = "epoch",
    logging_strategy= "epoch",
    save_strategy= "epoch",
    learning_rate= 2e-5,
    load_best_model_at_end= True,
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 16,
    num_train_epochs= 2,
    weight_decay= 0.01,
    push_to_hub= False,
)

data_collator = DataCollatorForTokenClassification(tokenizer)



In [36]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
train_result = trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running training *****
  Num examples = 1300
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 164


In [None]:
## evaluate the model
dev_results = compute_results(trainer, tokenized_datasets["dev"], metric, custom_labels)

In [None]:
# save the last trained weights
trainer.save_model(f"{new_model_path}/best")
tokenizer.add_tokens(custom_labels)
tokenizer.save_pretrained(f"{new_model_path}/best")