In [None]:
import pandas as pd 
from collections import Counter
from nltk.tokenize import word_tokenize
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from farasa.pos import FarasaPOSTagger 
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm

tqdm.monitor_interval = 0  # avoid warning


import json


In [None]:
# What should the preprocessor do?
# 1 - English tokens and numbers inside the Arabic text don't need to be translated -> mark them with a
# special token so the model can learn where they belong in the output
# 2 - Normalize string formats: "ي\ى"...
# 3 - Tokenize the input text

In [None]:
# read data
train_data = pd.read_csv("train_data.csv")
train_sequences = pd.read_pickle("./postaggedtrain.pkl")

val_data = pd.read_csv("val_data.csv")
val_sequences = pd.read_pickle("./postaggedval.pkl")

test_data = pd.read_csv("test_data.csv")
test_sequences = pd.read_pickle("./postaggedtest.pkl")

# Arabic Preprocessing

In [6]:
def new_arabic_preprocessing(data,sequences):
    """Attach token and POS-tag sequences to ``data``.

    Args:
        data: DataFrame that receives the new columns; it is modified in place.
        sequences: Series whose values are iterables of items exposing
            ``.tokens`` and ``.tags``; only the first entry of each is used.

    Returns:
        The same ``data`` object, now carrying two extra columns:
        ``processed_arabic_instruction`` and ``pos_tag``. Both hold lists
        framed by the ``<s>`` / ``</s>`` sentence markers.
    """
    def _framed(items, field):
        # First entry of `field` for every item, wrapped in sentence markers.
        inner = [getattr(item, field)[0] for item in items]
        return ["<s>"] + inner + ["</s>"]

    data["processed_arabic_instruction"] = sequences.apply(lambda items: _framed(items, "tokens"))
    data["pos_tag"] = sequences.apply(lambda items: _framed(items, "tags"))

    return data

In [None]:
# Attach the segmented tokens and POS tags to every split.
# The helper's signature is (data, sequences): each split must be passed
# together with its own sequences, otherwise the call raises TypeError.
train_data = new_arabic_preprocessing(train_data, train_sequences)
val_data   = new_arabic_preprocessing(val_data, val_sequences)
test_data  = new_arabic_preprocessing(test_data, test_sequences)

# Token frequencies, counted on the training split only.
tokenCounter = Counter(token for sublist in train_data["processed_arabic_instruction"] for token in sublist)

# Rows that contain at least one token seen at most once in the training split.
# Both augmentation passes below start from this same selection.
has_rare_token = train_data["processed_arabic_instruction"].apply(
    lambda x: any(tokenCounter.get(y, 0) <= 1 for y in x)
)

# NOTE(review): both passes read and write an "arabic" column that is not created
# anywhere in the visible notebook (the tokens live in "processed_arabic_instruction").
# Confirm the column exists in train_data.csv and holds token lists.
# NOTE(review): rows are selected with a "<= 1" threshold but tokens are replaced
# with a "> 3" threshold - confirm this asymmetry is intended.

# Replacing very rare words with UNK to improve generalization and decrease
# overfitting on noisy texts.
# .copy() makes the subset an independent frame so the assignment below does not
# trigger SettingWithCopyWarning; get(y, 0) keeps unseen tokens from comparing
# None with an int.
to_be_ambiguated = train_data[has_rare_token].copy()
to_be_ambiguated["arabic"] = to_be_ambiguated["arabic"].apply(
    lambda x: [y if tokenCounter.get(y, 0) > 3 else "<UNK>" for y in x]
)

# Because of the segmenter, "ال" / "ة" are the most frequent tokens, so a second
# copy of the same rows has exactly those replaced with UNK, encouraging
# robustness by making the model deal more with UNK tokens.
# NOTE(review): the original comment said the examples are picked "randomly", but
# the selection is the same deterministic rare-token filter as above.
to_be_ambiguated_2 = train_data[has_rare_token].copy()
to_be_ambiguated_2["arabic"] = to_be_ambiguated_2["arabic"].apply(
    lambda x: ["<UNK>" if y == "ال+" or y == "ة+" else y for y in x]
)

# Append both augmented copies and renumber the rows 0..n-1.
train_data = pd.concat([train_data, to_be_ambiguated, to_be_ambiguated_2], ignore_index=True)

In [None]:
# Arabic vocabulary: every distinct token counted in the training split.
arabic_tokens = set(tokenCounter)

1857

# English Preprocessing

In [None]:
def English_preprocessing(data):
    """Lower-case, tokenize and frame the English instructions.

    Args:
        data: DataFrame with an ``instruction`` column of strings; it is
            modified in place.

    Returns:
        The same ``data`` object with a ``processed_eng_instruction`` column:
        the ``word_tokenize`` output of the lower-cased instruction, wrapped
        in the ``<s>`` / ``</s>`` sentence markers.
    """
    def _framed(words):
        return ["<s>"] + words + ["</s>"]

    tokenized = data["instruction"].str.lower().map(word_tokenize)
    data["processed_eng_instruction"] = tokenized.apply(_framed)

    return data

# Tokenize the English side of every split (train, then val, then test).
train_data, val_data, test_data = map(English_preprocessing, (train_data, val_data, test_data))

# English vocabulary: the union of the per-row token sets of the training split.
all_en_sequences = [set(tokens) for tokens in train_data["processed_eng_instruction"]]
english_tokens = set().union(*all_en_sequences)

# Save Tokens, Preprocessed Sequences for Next stage

In [None]:
def write_data_pkl_file(data, name):
    """Pickle the columns needed by the next stage to ``./{name}.pkl``.

    Keeps ``arabic_instruction``, ``processed_eng_instruction`` and ``pos_tag``
    and stores the first two under the shorter names ``arabic`` and ``english``.
    ``data`` itself is left untouched.

    Args:
        data: DataFrame containing the three columns listed above.
        name: File stem of the pickle, written relative to the working directory.

    NOTE(review): the saved ``arabic`` column comes from ``arabic_instruction``,
    not from ``processed_arabic_instruction`` - confirm that is the intended input
    for the next stage.
    """
    kept_columns = ["arabic_instruction", "processed_eng_instruction", "pos_tag"]
    short_names = {
        "arabic_instruction": "arabic",
        "processed_eng_instruction": "english"
    }
    preprocessed_data = data[kept_columns].rename(columns=short_names)

    # Pickle instead of CSV: a CSV round-trip would turn the token lists into strings.
    preprocessed_data.to_pickle(f"./{name}.pkl")

# Persist the preprocessed splits for the next stage.
for split_frame, file_stem in (
    (train_data, "preprocessed_train_data"),
    (val_data, "preprocessed_val_data"),
    (test_data, "preprocessed_test_data"),
):
    write_data_pkl_file(split_frame, file_stem)

# Persist both vocabularies as JSON lists.
# The tokens are sorted before writing: iterating a set of strings yields a
# different order on every interpreter run, so the files (and any token index
# derived from list position) would otherwise not be reproducible.
for vocab_path, vocab in (
    ("arabic_tokens.json", arabic_tokens),
    ("english_tokens.json", english_tokens),
):
    with open(vocab_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Arabic tokens readable in the file.
        json.dump(sorted(vocab), f, ensure_ascii=False, indent=2)
