In [None]:
!pip install pandas
!pip install gensim
!pip install conllu
!pip install transformers

In [None]:
from conllu import parse_incr
from gensim.models import Word2Vec as w2v
import pandas as pd
from transformers import AutoTokenizer


In [3]:
# Parse the treebank file into a list with one entry per sentence.
path = 'PerDT/fa_perdt-ud-dev.conllu'

# `parse_incr` reads lazily from the open handle, so the sentences are
# materialised into a list inside the `with` block, before the file is closed.
with open(path, "r", encoding="utf-8") as data_file:
    sentence_models = list(parse_incr(data_file))

In [9]:
# Tokenise the raw text of every sentence with the pretrained tokenizer.
model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# The raw sentence string is taken from each sentence's 'text' metadata entry.
bert_sentences = [
    tokenizer.tokenize(sentence_model.metadata['text'])
    for sentence_model in sentence_models
]


In [12]:
# Save the tokenised sentences: one sentence per line, tokens separated by
# single spaces.
with open('bert.txt', 'w', encoding='utf-16') as f:
    for token_list in bert_sentences:
        f.write(" ".join(token_list) + '\n')

In [15]:
# Read back the first line of bert.txt with surrounding whitespace stripped.
# NOTE(review): only ONE line is read here, so `lines` holds a single
# sentence — confirm whether the whole file was meant to be loaded.
with open('bert.txt', 'r', encoding='utf-16') as f:
    first_line = f.readline().strip()

lines = [first_line]

print(lines)

['به گزارش خبرنگار مهر در گرگان ، بر اساس باورهای دینی ترکمن [ZWNJ] ها در این روز برای پیامبر اکرم ( ص ) ناراحتی و بیماری رخ داد که چند روز بعد با رح ##لت نبی مکر ##م اسلام جهان عزادار مات ##مش شد .']


In [16]:
# Split every stored line on whitespace to recover its token list.
b_sentences = [stored_line.split() for stored_line in lines]

b_sentences

[['به',
  'گزارش',
  'خبرنگار',
  'مهر',
  'در',
  'گرگان',
  '،',
  'بر',
  'اساس',
  'باورهای',
  'دینی',
  'ترکمن',
  '[ZWNJ]',
  'ها',
  'در',
  'این',
  'روز',
  'برای',
  'پیامبر',
  'اکرم',
  '(',
  'ص',
  ')',
  'ناراحتی',
  'و',
  'بیماری',
  'رخ',
  'داد',
  'که',
  'چند',
  'روز',
  'بعد',
  'با',
  'رح',
  '##لت',
  'نبی',
  'مکر',
  '##م',
  'اسلام',
  'جهان',
  'عزادار',
  'مات',
  '##مش',
  'شد',
  '.']]

In [99]:
# Represent each sentence as the list of its tokens' lemmas; this is the
# corpus the Word2Vec model is trained on.
sentences = [
    [token['lemma'] for token in sentence_model]
    for sentence_model in sentence_models
]
    

In [108]:
# Train skip-gram (sg=1) Word2Vec embeddings over the lemma sentences.
# Lemmas occurring fewer than `min_count` times get no vector.
# NOTE(review): no seed is passed and several workers are used, so the
# numeric values may differ from run to run.
embedding_dimension = 50
embeddings = w2v(
    sentences=sentences,
    vector_size=embedding_dimension,
    window=3,
    min_count=3,
    sg=1,
    workers=4,
)

# Mapping from each lemma in the vocabulary to its row index in the model.
lemma_to_index = embeddings.wv.key_to_index

# One row per vocabulary lemma, one column per embedding dimension.
vocabulary_vectors = [
    embeddings.wv.get_vector(str(lemma)) for lemma in lemma_to_index
]
emb_df = pd.DataFrame(vocabulary_vectors, index=lemma_to_index)

print(emb_df.shape)
emb_df.head()

(1258, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
.,0.044631,-0.019841,-0.023262,0.074152,-0.149207,-0.13272,0.447928,0.181325,-0.333659,0.033825,...,0.251829,0.019118,-0.053167,-0.054675,0.49739,0.198481,-0.006749,-0.259465,0.092721,0.085204
و,0.01825,-0.028154,-0.03781,0.056468,-0.108691,-0.11798,0.455847,0.160933,-0.362743,0.070451,...,0.315893,0.006853,-0.032276,-0.04386,0.536335,0.220656,0.001298,-0.325333,0.09405,0.08178
کرد#کن,0.007487,-0.031638,-0.018479,0.046961,-0.094246,-0.12306,0.421579,0.18079,-0.363657,0.01694,...,0.3449,-0.016122,-0.05957,-0.031726,0.49403,0.209568,0.051093,-0.30931,0.099283,0.063402
به,0.057874,-0.060395,-0.024552,0.047123,-0.130799,-0.08325,0.474455,0.192794,-0.355014,0.074864,...,0.32542,0.003204,-0.032906,-0.064811,0.500713,0.23354,0.044459,-0.327781,0.064142,0.068599
را,0.02747,-0.02716,-0.038821,0.037965,-0.110586,-0.089733,0.433595,0.172429,-0.372748,0.051758,...,0.308175,0.003813,-0.035309,-0.031173,0.511251,0.219476,0.029181,-0.314938,0.08082,0.045283


In [109]:

# Sentences are truncated to this many tokens when feature vectors are built.
per_dt_average_sentence_length = 17

# Tags shared by both tables, listed in index order (ADJ -> 0 ... X -> 15).
_shared_pos_tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
    'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X',
]

# Tags that appear only in the XPOS table; their indices continue after the
# shared tags (PREP -> 16 ... AUX_PASS -> 47).
_xpos_only_tags = [
    'PREP', 'N_IANM', 'N_ANM', 'PUNC', 'ADJ_AJP', 'PREM_DEMAJ', 'PR_SEPER',
    'V_PASS', 'CONJ', 'PSUS', 'ADV_SADV', 'PRENUM', 'POSTP', 'V_ACT',
    'PR_DEMON', 'SUBR', 'V_MODL', 'IDEN', 'POSNUM', 'PREM_AMBAJ', 'PR_CREFX',
    'PRENUM_IANM', 'PR_JOPER', 'ADJ_AJCM', 'ADJ_AJSUP', 'PR_INTG',
    'PR_UCREFX', 'PREM_QUAJ', 'PREM_EXAJ', 'PR_RECPR', 'ADR_PRADR',
    'AUX_PASS',
]

# XPOS tag -> one-hot position. None and '_' map to the sentinel -1, which
# the encoding loop treats as "set no bit".
xpos_to_index_dictionary = {None: -1, '_': -1}
xpos_to_index_dictionary.update(
    (tag, index) for index, tag in enumerate(_shared_pos_tags + _xpos_only_tags)
)

# UPOS tag -> one-hot position, with '_' as the -1 sentinel.
upos_to_index_dictionary = {'_': -1}
upos_to_index_dictionary.update(
    (tag, index) for index, tag in enumerate(_shared_pos_tags)
)

# Vector sizes are the number of dictionary entries, so the sentinel keys are
# counted too and each vector is longer than its largest index requires.
upos_length = len(upos_to_index_dictionary)
xpos_length = len(xpos_to_index_dictionary)
pos_length = upos_length + xpos_length

def _one_hot(position, size):
    """Return a list of `size` zeros with a 1 at `position`.

    A `position` of -1 is the "no tag" sentinel and leaves the list all-zero.
    """
    encoded = [0] * size
    if position != -1:
        encoded[position] = 1
    return encoded


# For every sentence, build one POS vector per token (XPOS one-hot followed
# by UPOS one-hot), looking at no more than the first
# `per_dt_average_sentence_length` tokens.
lexical_morpho_models = []

for sentence_model in sentence_models:
    sentence_vectors = []

    for position, token in enumerate(sentence_model):
        if position >= per_dt_average_sentence_length:
            break

        # Tokens whose lemma has no embedding get an all-zero POS vector.
        if token['lemma'] not in lemma_to_index:
            sentence_vectors.append([0] * pos_length)
            continue

        upos_part = _one_hot(upos_to_index_dictionary[token['upos']], upos_length)
        xpos_part = _one_hot(xpos_to_index_dictionary[token['xpos']], xpos_length)
        sentence_vectors.append(xpos_part + upos_part)

    lexical_morpho_models.append(sentence_vectors)


In [97]:
# (feature name, value) pairs, listed in the order of their vector positions.
_feature_pairs = [
    ('Number', 'Sing'),
    ('Number', 'Plur'),
    ('Person', '2'),
    ('Person', '3'),
    ('Person', '1'),
    ('Tense', 'Past'),
    ('Tense', 'Pres'),
    ('Tense', 'Fut'),
    ('Voice', 'Pass'),
    ('Voice', 'Act'),
    ('VerbForm', 'Part'),
    ('VerbForm', 'Fin'),
    ('PronType', 'Prs'),
    ('Polarity', 'Neg'),
    ('Mood', 'Imp'),
    ('Mood', 'Sub'),
]

# Maps each (feature, value) pair to its position in a token's feature vector.
feature_to_index_dictionary = {
    pair: index for index, pair in enumerate(_feature_pairs)
}

# Build, for every sentence, one multi-hot morphological-feature vector per
# token, looking at no more than the first `per_dt_average_sentence_length`
# tokens.
feature_length = len(feature_to_index_dictionary)
feature_models = []

for sentence_model in sentence_models:
    feature_model = []

    for i, token in enumerate(sentence_model):
        if i >= per_dt_average_sentence_length:
            break

        vector = [0] * feature_length
        feats = token['feats']

        # A token without features keeps the all-zero vector. It must still be
        # appended so that vector positions stay aligned with token positions.
        if feats is not None:
            for pair in feats.items():
                # Raises KeyError for a (feature, value) pair that is missing
                # from feature_to_index_dictionary.
                vector[feature_to_index_dictionary[pair]] = 1

        # Store the token's vector; without this every feature_model is empty.
        feature_model.append(vector)

    feature_models.append(feature_model)

In [17]:
# Save the trained embeddings as text: one vocabulary entry per line, the word
# followed by its vector components, all separated by single spaces.
# The trained model is named `embeddings` in this notebook; referring to it by
# that name lets the cell run on a fresh kernel.
with open('embeddings.txt', 'w', encoding='utf-16') as f:
    for word in embeddings.wv.key_to_index:
        vector = embeddings.wv.get_vector(str(word))
        vector_text = " ".join(str(component) for component in vector)
        f.write(word)
        f.write(' ')
        f.write(vector_text)
        f.write('\n')