In [None]:
import pandas as pd 
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
import pandas as pd
from Preprocessing import Preprocessor
import json
import re

In [2]:
# What should the preprocessor do?
# 1 - English tokens and numbers inside the Arabic text don't need to be translated -> replace them with a
# special token, so the model only has to learn at which output position they belong
# 2 - Normalize string formats: "ي\ى"...
# 3 - Tokenize the input text

In [16]:
# Load every split as a pair: the raw instruction table (CSV) and the matching
# POS-tagged token sequences (pickle).
train_data, train_sequences = (
    pd.read_csv("train_data.csv"),
    pd.read_pickle("./postaggedtrain.pkl"),
)

# The validation split is currently not loaded. Its sources were "val_data.csv" and
# "./preprocessed_val_data.pkl"; val_data / val_sequences do not exist in this notebook.

test_data, test_sequences = (
    pd.read_csv("test_data.csv"),
    pd.read_pickle("./postaggedtest.pkl"),
)

# Arabic Preprocessing

In [18]:
def arabic_preprocessing(data,sequences):
    """Attach the processed Arabic token, POS-tag and foreign-token columns to ``data``.

    Args:
        data: DataFrame of one split. It is modified in place: the columns
            "processed_arabic_instruction", "pos_tag" and "eng_in_ar" are (re)written.
        sequences: per-row collections whose items expose ``tokens`` and ``tags``;
            only the first entry of each is used. Assigned to ``data`` by index.

    Returns:
        The same ``data`` object, with token and tag lists wrapped in "<s>" / "</s>".
    """
    normalizer = Preprocessor()
    ar_col = "processed_arabic_instruction"

    def _first_of(attribute):
        # Builds a picker that keeps element 0 of `attribute` for every item of a sequence.
        return lambda tagged: [getattr(item, attribute)[0] for item in tagged]

    data[ar_col] = sequences.apply(_first_of("tokens"))
    data["pos_tag"] = sequences.apply(_first_of("tags"))

    data[ar_col] = data[ar_col].apply(lambda toks: normalizer.normalize_and_convert_digits(toks))

    # The first and last tokens are excluded from the non-Arabic scan
    # (presumably boundary tokens produced by the tagger -- TODO confirm).
    data["eng_in_ar"] = data[ar_col].apply(
        lambda toks: normalizer.extract_non_arabic_in_list(toks[1:-1])
    )

    # Masking needs both the token list and the tokens found above, hence the row-wise apply.
    data[ar_col] = data.apply(
        lambda row: normalizer.mask_non_arabic_tokens(row[ar_col], row["eng_in_ar"]),
        axis=1,
    )

    # Lower-cased only after masking, so masking sees the tokens in their original case.
    data["eng_in_ar"] = data["eng_in_ar"].apply(lambda toks: [tok.lower() for tok in toks])

    # Frame both aligned sequences with the sentence boundary markers.
    for column in (ar_col, "pos_tag"):
        data[column] = data[column].apply(lambda seq: ["<s>"] + seq + ["</s>"])
    return data

In [34]:
# Display the raw Arabic instruction stored at row label 9798 (it mixes Arabic with Latin-script tokens and digits).
train_data.loc[9798,"arabic_instruction"]

'ما هو عنوان github للمستودع الذي يحتوي على ملفات تستخدمها المعرف Solution ID12؟'

In [19]:
# Add the processed Arabic token, POS-tag and "eng_in_ar" columns to each loaded split.
# arabic_preprocessing writes the columns onto the frame it receives and returns that same frame.
train_data = arabic_preprocessing(train_data,train_sequences)
test_data  = arabic_preprocessing(test_data, test_sequences)

In [20]:
# train_data = arabic_preprocessing(train_data,train_sequences)
# val_data   = arabic_preprocessing(val_data, val_sequences)
# test_data  = arabic_preprocessing(test_data, test_sequences)

# Frequency of every token across the preprocessed Arabic training sequences.
tokenCounter = Counter(
    token for sublist in train_data["processed_arabic_instruction"] for token in sublist
)

# Thresholds (unchanged from the original cell, now named):
#  - a row is selected for augmentation when it holds at least one token seen <= ROW_RARE_MAX_COUNT times;
#  - inside a selected row, every token seen <= UNK_MAX_COUNT times is replaced by "<UNK>".
ROW_RARE_MAX_COUNT = 1
UNK_MAX_COUNT = 3

# Computed once and reused for both augmented copies (the original recomputed the same filter twice).
# Counter[...] yields 0 for an unseen token instead of None, so the comparisons can never fail.
has_rare_token = train_data["processed_arabic_instruction"].apply(
    lambda tokens: any(tokenCounter[token] <= ROW_RARE_MAX_COUNT for token in tokens)
)

# Replacing very rare words with UNK to improve generalization and decrease overfitting on noisy texts.
# .copy() detaches the selected rows from train_data, so the column assignment below is an explicit
# write to a new frame rather than a write through a slice (no SettingWithCopyWarning).
to_be_ambiguated = train_data[has_rare_token].copy()
to_be_ambiguated["processed_arabic_instruction"] = to_be_ambiguated["processed_arabic_instruction"].apply(
    lambda tokens: [token if tokenCounter[token] > UNK_MAX_COUNT else "<UNK>" for token in tokens]
)

# Because of the segmenter, "ال" / "ة" are the most frequent tokens; a second copy of the rows replaces
# them with UNK to encourage robustness by making the model deal more with UNK tokens.
# NOTE(review): the rows are the same rare-token rows as above, not a random sample.
to_be_ambiguated_2 = train_data[has_rare_token].copy()
to_be_ambiguated_2["processed_arabic_instruction"] = to_be_ambiguated_2["processed_arabic_instruction"].apply(
    lambda tokens: ["<UNK>" if token in ("ال+", "ة+") else token for token in tokens]
)

# Append both augmented copies to the original rows and renumber the index from 0.
# NOTE: this cell is not idempotent -- running it again appends the augmented rows a second time,
# so re-run from the Arabic preprocessing cell when repeating it.
train_data = pd.concat([train_data, to_be_ambiguated, to_be_ambiguated_2], ignore_index=True)

In [21]:
# Persist the Arabic vocabulary for the next stage.
# tokenCounter was built before the augmentation step wrote "<UNK>" into the training sequences,
# so "<UNK>" is added explicitly; otherwise the vocabulary would miss a token present in train_data.
arabic_tokens = set(tokenCounter.keys()) | {"<UNK>"}
with open("arabic_tokens.json", "w", encoding="utf-8") as f:
    # Set iteration order for strings changes between Python processes; sorting keeps the
    # file (and any token index derived from it) identical from run to run.
    json.dump(sorted(arabic_tokens), f, ensure_ascii=False, indent=2)

# English Preprocessing

In [None]:
def English_preprocessing(data):
    """Build the tokenized English target column "processed_eng_instruction" on ``data``.

    Args:
        data: DataFrame with an "instruction" column (English text) and an "eng_in_ar"
            column (list of tokens per row). It is modified in place.

    Returns:
        The same ``data`` object; "processed_eng_instruction" holds, per row, a token list
        framed by "<s>" / "</s>".
    """
    helper = Preprocessor()
    eng_merger = MWETokenizer([
        ("<", "ENG", ">")
    ],"")
    target = "processed_eng_instruction"

    def _mask_row(row):
        # Rows without any token shared with the Arabic side keep their text untouched.
        if not len(row["eng_in_ar"]) > 0:
            return row[target]
        # One alternation of the row's (escaped) tokens, optionally followed by the
        # Preprocessor's `non_arab` pattern; what the mask call substitutes is defined
        # in Preprocessor (presumably the "<ENG>" placeholder -- TODO confirm).
        alternatives = '|'.join(map(re.escape, row["eng_in_ar"]))
        return helper.mask_non_arabic_text(
            row[target],
            re.compile(r'(' + alternatives + r')' + helper.non_arab + "?",
                       re.VERBOSE | re.MULTILINE)
        )

    def _frame_and_mask_digits(tokens):
        # Add the boundary markers, then turn every token that contains a digit into "<ENG>".
        framed = ["<s>"] + tokens + ["</s>"]
        return [re.sub(".*[0-9]+.*","<ENG>",token) for token in framed]

    data[target] = data["instruction"].str.lower()
    data[target] = data.apply(_mask_row, axis=1)

    # word_tokenize splits a placeholder into "<", "ENG", ">"; the MWE pass glues it back together.
    tokenized = data[target].map(word_tokenize).map(eng_merger.tokenize)
    data[target] = tokenized.apply(_frame_and_mask_digits)

    return data

# Add the tokenized English column to every split that this notebook actually loads.
# The validation split is not processed here: val_data is never created (its load and its Arabic
# preprocessing are commented out), so English_preprocessing(val_data) raised NameError on a
# fresh kernel, and the frame would also lack the "eng_in_ar" column the function reads.
train_data = English_preprocessing(train_data)
test_data = English_preprocessing(test_data)



In [None]:
# Collect the English vocabulary from the training targets only.
all_en_sequences = list(train_data["processed_eng_instruction"].map(set))
english_tokens = set().union(*all_en_sequences)

with open("english_tokens.json", "w", encoding="utf-8") as f:
    # Set iteration order for strings changes between Python processes; sorting keeps the
    # file (and any token index derived from it) identical from run to run.
    json.dump(sorted(english_tokens), f, ensure_ascii=False, indent=2)

# Save Tokens, Preprocessed Sequences for Next stage

In [None]:
# Tokenizer that glues split special tokens ("<", "s", ">" etc.) back into "<s>", "<ENG>" and "</s>".
mwe = MWETokenizer([("<","s",">"),("<","ENG",">"),("<","/s",">")],"")
# It was applied to the validation split only. val_data is never created in this notebook
# (its load is commented out), so that step raised NameError on a fresh kernel and is dropped.
# When the validation split is loaded again, map mwe.tokenize over its
# "processed_eng_instruction" column here.

In [None]:
def write_data_pkl_file(data, name):
    """Pickle the model-ready columns of ``data`` to ``./<name>.pkl``.

    Args:
        data: DataFrame holding "processed_arabic_instruction",
            "processed_eng_instruction" and "pos_tag". It is not modified.
        name: file stem of the pickle written to the current working directory.

    The written frame has the columns "arabic", "english" and "pos_tag", in that order.
    """
    column_aliases = {
        "processed_arabic_instruction": "arabic",
        "processed_eng_instruction": "english",
    }
    kept_columns = [*column_aliases, "pos_tag"]
    export_frame = data[kept_columns].rename(columns=column_aliases)

    # Pickle rather than CSV: a CSV round trip would turn each token list into a string.
    export_frame.to_pickle(f"./{name}.pkl")

# Export the splits that were actually loaded and preprocessed in this notebook.
# The validation export is omitted: val_data is never created (its load is commented out),
# so write_data_pkl_file(val_data, ...) raised NameError on a fresh kernel.
write_data_pkl_file(train_data, "preprocessed_train_data")
write_data_pkl_file(test_data, "preprocessed_test_data")
