In [None]:
import spacy
# Load the `ro_core_news_sm` spaCy pipeline once; later cells call `nlp(...)`
# to tokenise text and read each token's `pos_` attribute.
nlp = spacy.load('ro_core_news_sm')

In [None]:
# Maps a word from corpus.rup to the corpus.ro word it was aligned with
# (filled by the loop at the end of this cell).
alignment_dict = dict()
# Alignments with eflomal
def read_sentences(file_path):
    """Return every line of a UTF-8 text file, stripped of surrounding whitespace.

    Blank lines are kept (as empty strings) so that line numbers stay aligned
    with the original file.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            sentences.append(raw_line.strip())
    return sentences

def parse_alignments(alignment_file, source_sentences, target_sentences):
    """Yield (source_word, target_word) pairs described by a word-alignment file.

    Each line of `alignment_file` holds whitespace-separated `i-j` pairs; line
    number n refers to `source_sentences[n]` and `target_sentences[n]`, and
    `i` / `j` index the whitespace-split words of those sentences.

    Args:
        alignment_file: path to the UTF-8 alignment file.
        source_sentences: list of source sentences, one per alignment line.
        target_sentences: list of target sentences, one per alignment line.

    Yields:
        Tuples `(src_word, tgt_word)` in file order.

    Raises:
        ValueError: if a pair is not of the form `int-int`.
        IndexError: if a line number or word index is out of range.
    """
    with open(alignment_file, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file):
            alignment_pairs = line.split()
            if not alignment_pairs:
                # Nothing aligned on this line; do not touch the sentence lists.
                continue
            # Split each sentence once per line instead of once per pair.
            src_words = source_sentences[line_num].split()
            tgt_words = target_sentences[line_num].split()
            for pair in alignment_pairs:
                src_index, tgt_index = map(int, pair.split('-'))
                yield src_words[src_index], tgt_words[tgt_index]

# Read sentences from source and target files
source_sentences = read_sentences('corpus.rup')
target_sentences = read_sentences('corpus.ro')

# Extract and print aligned word pairs.
# Every pair is printed, so the output grows with the size of the alignment file.
# A source word that is aligned more than once keeps only the last target word seen,
# because each assignment overwrites the previous dictionary entry.
for src_word, tgt_word in parse_alignments('forward.align', source_sentences, target_sentences):
    print(f"{src_word} - {tgt_word}")
    alignment_dict[src_word] = tgt_word

In [None]:
import json
import pandas as pd
from collections import defaultdict
import tqdm
# Dictionary spreadsheet, read with header=None so the first row is treated as data;
# the six column names are assigned explicitly below.
excel_path = "Papahagi.xls"
df = pd.read_excel(excel_path, header = None)

# The "aro" and "ro" columns are the ones used further down in this cell.
df.columns = ["POS", "aro", "ro", "origine", "IDK", "autor"]
# NOTE(review): this expression is in the middle of the cell, so its value is not
# displayed; move it to the end of a cell (or wrap it in display()) to see the preview.
df.head()


import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' resource; presumably required by word_tokenize below — confirm
# against the installed NLTK version.
nltk.download('punkt')

with open('corpus.rup', 'r', encoding='utf-8') as file:
    aro_text = file.read()

# Tokens are lower-cased before the dictionary lookup.
# NOTE(review): the "aro" column is not lower-cased, so capitalised dictionary
# entries can never match — confirm whether that is intended.
aro_tokens = word_tokenize(aro_text.lower())
vectorizer = CountVectorizer()

# Index the dictionary once: "aro" entry -> list of its "ro" values in row order.
# The previous version rebuilt and scanned the whole column for every token and
# filtered the DataFrame for every hit.
aro_to_ro = defaultdict(list)
for aro_entry, ro_entry in zip(df["aro"].values.tolist(), df["ro"].values.tolist()):
    aro_to_ro[aro_entry].append(ro_entry)

# token -> list of Romanian translations found in the dictionary.
direct_translation = dict()
for word in tqdm.tqdm(aro_tokens):
    if word in aro_to_ro:
        # Copy so each entry owns its list, as before.
        direct_translation[word] = list(aro_to_ro[word])



In [None]:
# Read both sides of the corpus as single strings.
with open("corpus.ro", "r", encoding="utf-8") as romanian_file:
    romanian_text = romanian_file.read()

with open("corpus.rup", "r", encoding="utf-8") as aromanian_file:
    aromanian_text = aromanian_file.read()

# Tag the whole Romanian corpus in one spaCy call.
# NOTE(review): spaCy limits the length of a single text (nlp.max_length) — confirm
# the corpus fits, or process it in chunks.
doc_ro = nlp(romanian_text)

# Parallel lists: romanian_pos_tags[i] is the POS tag of romanian_tokens[i].
romanian_tokens = [token.text for token in doc_ro]
romanian_pos_tags = [token.pos_ for token in doc_ro]

# Whitespace tokenisation: punctuation stays attached to the neighbouring word.
aromanian_tokens = aromanian_text.split()



In [None]:

import tqdm

# One projected tag per entry of aromanian_tokens, in the same order.
aromanian_pos_tags = []
total_words = 0
unknown_words = 0

# Position of the first occurrence of every Romanian token. This gives the same
# answer as romanian_tokens.index(...) without rescanning the list for each word.
romanian_first_index = {}
for position, romanian_token in enumerate(romanian_tokens):
    romanian_first_index.setdefault(romanian_token, position)

for token in tqdm.tqdm(aromanian_tokens):
    total_words += 1
    pos_tag_assigned = False

    # Candidate Romanian equivalents in priority order: the aligned word first,
    # then the dictionary translations.
    romanian_candidates = []
    if token in alignment_dict:
        romanian_candidates.append(alignment_dict[token])
    if token in direct_translation:
        # direct_translation stores a *list* of translations per word. Looking the
        # list itself up in romanian_tokens could never succeed, so try each entry.
        direct_equivalents = direct_translation[token]
        if isinstance(direct_equivalents, str):
            direct_equivalents = [direct_equivalents]
        romanian_candidates.extend(direct_equivalents)

    # Borrow the tag of the first candidate that occurs in the Romanian corpus.
    for romanian_equivalent in romanian_candidates:
        index = romanian_first_index.get(romanian_equivalent)
        if index is not None:
            aromanian_pos_tags.append(romanian_pos_tags[index])
            pos_tag_assigned = True
            break

    # Last resort: run the Romanian tagger on the token itself.
    if not pos_tag_assigned:
        doc_ro_token = nlp(token)
        if len(doc_ro_token) > 0:
            aromanian_pos_tags.append(doc_ro_token[0].pos_)
        else:
            unknown_words += 1
            aromanian_pos_tags.append("UNKNOWN")


print(f"Total words processed: {total_words}")
print(f"Unknown words: {unknown_words}")



for token, pos_tag in zip(aromanian_tokens, aromanian_pos_tags):
    print(f"{token} ({pos_tag})")

In [None]:
# Sanity check: the two counts should be equal, since the tagging loop appends
# exactly one tag per token.
print(len(aromanian_tokens), len(aromanian_pos_tags))

# Hidden Markov Model cu algoritm Viterbi

In [None]:
# Tokens and tags must line up one-to-one before they are paired.
assert len(aromanian_tokens) == len(aromanian_pos_tags), "tokens and POS tags are out of sync"

# Normalise the projected tags: an empty tag becomes 'X' and proper nouns are
# merged into NOUN.
aromanian_pos_tags = [x if x != '' else 'X' for x in aromanian_pos_tags]
aromanian_pos_tags = [x if x != "PROPN" else "NOUN" for x in aromanian_pos_tags]

# (word, tag) pairs used as training data by the HMM cells below.
train_tagged_words = list(zip(aromanian_tokens, aromanian_pos_tags))

print(train_tagged_words[:50])

In [None]:
# Distinct POS tags in the training data; these label the rows and columns of the
# transition matrix built below.
tags = set(aromanian_pos_tags)
print(len(tags))
print(tags)
# Distinct word forms in the training data.
vocab = set(aromanian_tokens)

In [None]:
# Corpus size summary: number of (word, tag) pairs, number of distinct word forms,
# and number of tokens.
print(len(train_tagged_words))
print(len(set(aromanian_tokens)))
print(len(aromanian_tokens))

In [None]:
# Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Return the counts needed for the emission estimate of `word` given `tag`.

    Args:
        word: the word form to look for.
        tag: the POS tag to condition on.
        train_bag: iterable of (word, tag) pairs.

    Returns:
        A tuple `(n_word_with_tag, n_tag)`: how many pairs carry `tag` together
        with `word`, and how many pairs carry `tag` at all. The caller divides
        the first by the second.
    """
    tag_occurrences = 0
    word_with_tag = 0
    for pair in train_bag:
        if pair[1] != tag:
            continue
        tag_occurrences += 1
        if pair[0] == word:
            word_with_tag += 1
    return (word_with_tag, tag_occurrences)

In [None]:
#  Transition Probability
def t2_given_t1(tag2, tag1, train_bag = train_tagged_words):
    """Return the counts needed for the transition estimate of `tag2` after `tag1`.

    Args:
        tag2: the following tag.
        tag1: the preceding tag.
        train_bag: sequence of (word, tag) pairs, read as one continuous tag sequence.

    Returns:
        A tuple `(n_tag1_then_tag2, n_tag1)`: how many times `tag1` is immediately
        followed by `tag2`, and how many times `tag1` occurs in total.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    occurrences_of_t1 = tag_sequence.count(tag1)
    # Walk over adjacent (current, following) tag pairs.
    t1_then_t2 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == tag1 and following == tag2
    )
    return (t1_then_t2, occurrences_of_t1)

In [None]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
# Row i / column j follow the iteration order of `tags`; both loops iterate the same
# unmodified set, so the two orders agree.
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        # aux = (count of t1 immediately followed by t2, count of t1)
        aux = t2_given_t1(t2, t1)
        # The denominator is non-zero as long as every tag in `tags` occurs in the
        # training pairs that t2_given_t1 reads — TODO confirm both come from the same data.
        tags_matrix[i, j] = aux[0]/aux[1]
 
#print(tags_matrix)

In [None]:
# Label the matrix so it can be indexed by tag name: rows are the preceding tag,
# columns the following tag. list(tags) must yield the same order that was used to
# fill tags_matrix, so `tags` must not be modified in between.
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

In [None]:
def Viterbi(words, train_bag = train_tagged_words):
    """Tag `words` left to right with the emission and transition estimates.

    For each word, every candidate tag is scored as
    `emission(word | tag) * transition(previous tag -> tag)` and the best-scoring
    tag is kept. Only one tag per position is retained (no lattice/backtracking),
    so this is a greedy decoder despite the name.

    Args:
        words: list of word forms to tag.
        train_bag: (word, tag) pairs used to enumerate the candidate tags.
            Emission counts come from `word_given_tag` with its own default
            training data and transitions from the global `tags_df`.

    Returns:
        A list of `(word, tag)` tuples, one per input word.

    Notes:
        * The first word is scored as if it followed a 'PUNCT' tag, so 'PUNCT'
          must be a row of `tags_df`.
        * If every score is zero (e.g. the word never occurs in the training
          data), the tag with the highest transition probability from the
          previous tag is chosen instead of an arbitrary first tag.
    """
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        # The sentence start is modelled as following punctuation.
        previous_tag = 'PUNCT' if key == 0 else state[-1]

        # Per-tag scores and the transition probabilities they were built from.
        p = []
        transitions = []
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # One corpus scan per tag (the counts were previously computed twice).
            word_count, tag_count = word_given_tag(word, tag)
            emission_p = word_count / tag_count if tag_count else 0.0

            transitions.append(transition_p)
            p.append(emission_p * transition_p)

        pmax = max(p)
        if pmax == 0:
            # No tag explains this word: back off to the transition model alone.
            state_max = T[transitions.index(max(transitions))]
        else:
            # Tag for which the combined probability is maximal.
            state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))

In [None]:
# Smoke test of the tagger on one hand-written sentence.
test_la_misto = "Sănt îndoauă învețuri, cari si amintă mași cu ghivăsirea, i cu avdzărea."
# The name is rebound from the sentence string to its list of tokens. Whitespace
# splitting keeps punctuation attached to the word (e.g. "învețuri,").
test_la_misto = test_la_misto.split()

testt = Viterbi(test_la_misto)

print(testt)

## Testare pentru Viterbi

In [None]:
def parse_annotations(text):
    """Parse `word tag` lines into a list of (word, tag) tuples.

    Blank lines are ignored and entries tagged "PUNCT" are dropped.

    Args:
        text: annotation text with one whitespace-separated `word tag` pair per line.

    Returns:
        List of `(word, tag)` tuples in input order.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    annotations = []
    for line in text.strip().split('\n'):
        if not line.strip():
            continue
        word, tag = line.split()
        if tag != "PUNCT":
            annotations.append((word, tag))
    return annotations

def parse_annotations_from_file(file_path):
    """Read an annotation file and parse it with `parse_annotations`.

    The file is decoded as UTF-8, like the other corpus files in this notebook,
    so the result does not depend on the platform's default encoding.

    Args:
        file_path: path to a text file with one `word tag` pair per line.

    Returns:
        The list of `(word, tag)` tuples on success. On failure an error message
        *string* is returned instead of raising, so callers must check the type
        of the result (e.g. `isinstance(result, list)`) before using it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        return parse_annotations(text)
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {e}"

def calculate_accuracy(actual_annotations, predicted_annotations):
    """Return the fraction of positions where prediction and reference are equal.

    Args:
        actual_annotations: reference annotations (e.g. `(word, tag)` tuples).
        predicted_annotations: predicted annotations, compared element-wise.

    Returns:
        A float in [0, 1]; 0.0 when both inputs are empty. If the inputs differ in
        length, both lengths are printed and an error message *string* is returned
        instead, so callers must check the type before formatting the result.
    """
    # Check if both lists are of the same length
    if len(actual_annotations) != len(predicted_annotations):
        print(len(actual_annotations), len(predicted_annotations))
        return "The number of actual and predicted annotations must be the same."

    # Nothing to score: avoid dividing by zero.
    if len(actual_annotations) == 0:
        return 0.0

    correct_predictions = sum(1 for actual, predicted in zip(actual_annotations, predicted_annotations) if actual == predicted)

    accuracy = correct_predictions / len(actual_annotations)
    return accuracy
def calculate_f1_score(actual_annotations, predicted_annotations):
    """Return an F1 score over element-wise compared annotations.

    An exact match counts as a true positive. A mismatch counts as a false
    positive unless the predicted tag is "PUNCT", and as a false negative unless
    the reference tag is "PUNCT".

    Annotations may be `(word, tag)` pairs or bare tag strings. The "PUNCT" test
    looks at the tag; the previous version compared the whole tuple with the
    string "PUNCT", which is never equal.

    Args:
        actual_annotations: reference annotations.
        predicted_annotations: predicted annotations, same length as the reference.

    Returns:
        The F1 score as a float; 0.0 when there is no true positive (including
        empty input). If the inputs differ in length an error message *string*
        is returned instead.
    """
    # Check if both lists are of the same length
    if len(actual_annotations) != len(predicted_annotations):
        return "The number of actual and predicted annotations must be the same."

    def _tag(annotation):
        # (word, tag) pair -> tag; a bare tag is returned unchanged.
        return annotation[1] if isinstance(annotation, (tuple, list)) else annotation

    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for actual, predicted in zip(actual_annotations, predicted_annotations):
        if actual == predicted:
            true_positives += 1
            continue
        if _tag(predicted) != "PUNCT":
            false_positives += 1
        if _tag(actual) != "PUNCT":
            false_negatives += 1

    # Without a true positive, precision and recall are zero or undefined.
    if true_positives == 0:
        return 0.0

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

file_path = 'test_ann.txt'

actual_annotations = parse_annotations_from_file(file_path)

# parse_annotations_from_file returns an error message string on failure, so the
# type is checked before the annotations are iterated.
if isinstance(actual_annotations, list):
    words_to_predict = [word for word, tag in actual_annotations if tag != "PUNCT"]
    predicted_tags = Viterbi(words_to_predict)
    print(predicted_tags)

    accuracy = calculate_accuracy(actual_annotations, predicted_tags)
    print(accuracy)
    # calculate_accuracy returns a message string on a length mismatch; only a
    # numeric result can be formatted as a percentage.
    if not isinstance(accuracy, str):
        print(f"Model Accuracy: {accuracy * 100:.2f}%")
else:
    print(actual_annotations)


In [None]:
# Print every position where the reference and predicted (word, tag) pairs differ.
# min(...) keeps the index in range if the two lists have different lengths.
for i in range(min(len(actual_annotations), len(predicted_tags))):
    if actual_annotations[i] != predicted_tags[i]:
        print(actual_annotations[i], predicted_tags[i])

## Testare pentru alinieri cu eflomal, GIZA++ si asocieri directe in dictionar

In [None]:

file_path = 'test_seen_ann.txt'
actual_annotations = parse_annotations_from_file(file_path)
# parse_annotations_from_file returns an error message string on failure. Fall back
# to an empty word list so the cell reaches the report at the end and prints the
# message, instead of failing while unpacking the characters of that string.
annotations_loaded = isinstance(actual_annotations, list)
words_to_predict = [word for word, tag in actual_annotations if tag != "PUNCT"] if annotations_loaded else []

with open("corpus.ro", "r", encoding="utf-8") as romanian_file:
    romanian_text = romanian_file.read()

with open("corpus.rup", "r", encoding="utf-8") as aromanian_file:
    aromanian_text = aromanian_file.read()

doc_ro = nlp(romanian_text)

# Parallel lists: romanian_pos_tags[i] is the POS tag of romanian_tokens[i].
romanian_tokens = [token.text for token in doc_ro]
romanian_pos_tags = [token.pos_ for token in doc_ro]

# Position of the first occurrence of every Romanian token. This gives the same
# answer as romanian_tokens.index(...) without rescanning the list for each word.
romanian_first_index = {}
for position, romanian_token in enumerate(romanian_tokens):
    romanian_first_index.setdefault(romanian_token, position)

# NOTE(review): these two names are rebound to the *test* words and tags. Re-running
# the HMM training cells after this cell would train on the test data.
aromanian_tokens = words_to_predict


aromanian_pos_tags = []
total_words = 0
unknown_words = 0


import tqdm
for token in tqdm.tqdm(aromanian_tokens):
    total_words += 1
    pos_tag_assigned = False

    # Candidate Romanian equivalents in priority order: the aligned word first,
    # then the dictionary translations.
    romanian_candidates = []
    if token in alignment_dict:
        romanian_candidates.append(alignment_dict[token])
    if token in direct_translation:
        # direct_translation stores a *list* of translations per word. Looking the
        # list itself up in romanian_tokens could never succeed, so try each entry.
        direct_equivalents = direct_translation[token]
        if isinstance(direct_equivalents, str):
            direct_equivalents = [direct_equivalents]
        romanian_candidates.extend(direct_equivalents)

    # Borrow the tag of the first candidate that occurs in the Romanian corpus.
    for romanian_equivalent in romanian_candidates:
        index = romanian_first_index.get(romanian_equivalent)
        if index is not None:
            aromanian_pos_tags.append(romanian_pos_tags[index])
            pos_tag_assigned = True
            break

    # Last resort: run the Romanian tagger on the token itself.
    if not pos_tag_assigned:
        doc_ro_token = nlp(token)
        if len(doc_ro_token) > 0:
            aromanian_pos_tags.append(doc_ro_token[0].pos_)
        else:
            unknown_words += 1
            aromanian_pos_tags.append("UNKNOWN")


print(f"Total words processed: {total_words}")
print(f"Unknown words: {unknown_words}")


predicted_tags = []
for token, pos_tag in zip(aromanian_tokens, aromanian_pos_tags):
    predicted_tags.append((token, pos_tag))
print(predicted_tags)
for pred_tag in predicted_tags:
    print(pred_tag)

if annotations_loaded:
    accuracy = calculate_accuracy(actual_annotations, predicted_tags)
    f1 = calculate_f1_score(actual_annotations, predicted_tags)
    print(accuracy)
    # Both metric helpers return a message string on a length mismatch; only
    # numeric results can be formatted as percentages.
    if not isinstance(accuracy, str) and not isinstance(f1, str):
        print(f"Model Accuracy: {accuracy * 100:.2f}%")
        print(f"F1 Score: {f1 * 100:.2f}%")
else:
    print(actual_annotations)

In [None]:
# transform the rup directly in spacy using the romanian POS tagger
# doc_rup = nlp(aromanian_text)
print(words_to_predict)
# Baseline: join the test words into one string and run the Romanian pipeline on it.
# NOTE(review): spaCy re-tokenises the joined string, so the number of tokens may
# differ from len(words_to_predict) — confirm before comparing position by position.
words_to_predict_nlp = nlp(" ".join(words_to_predict))

In [None]:
print(words_to_predict_nlp)
# Collect (token text, POS tag) pairs from the spaCy document.
predicted_tags = []
for token in words_to_predict_nlp:
    predicted_tags.append((str(token), token.pos_))
print(predicted_tags)
# NOTE(review): the last prediction is dropped, presumably to make the lengths
# match the reference annotations — confirm. If the lengths still differ,
# calculate_accuracy returns an error message string instead of a number.
ac = calculate_accuracy(actual_annotations, predicted_tags[:-1])
# print(actual_annotations)
print(ac)

In [None]:
predicted_tags = []
# NOTE(review): "CONJ" and "PRT" may not exist in the tag set of the annotation
# file (spaCy's universal tags use CCONJ/SCONJ and PART) — confirm against the data.
possible_tags = ["NOUN", "VERB", "ADJ", "ADV", "PRON", "ADP", "NUM", "DET", "CONJ", "PRT", "X", "INTJ"]
import random
# Random baseline: every token gets a uniformly drawn tag. A dedicated, seeded
# generator makes the reported accuracy reproducible without touching the global
# random state.
baseline_rng = random.Random(0)
for token in words_to_predict_nlp:
    predicted_tags.append((str(token), baseline_rng.choice(possible_tags)))
# The last prediction is dropped, as in the spaCy baseline cell, before comparing
# with the reference annotations.
ac = calculate_accuracy(actual_annotations, predicted_tags[:-1])
print(actual_annotations)
print(ac)


In [None]:
# Sample Romanian text used to inspect the tagger's output.
prop = """A fost - ce n-a mai fost.
A fost odată un lup, - Cumătrul-Nicola, și-o vulpe, - Cumătra-Mara.
Ăștia doi s-au întovărășit, să vâneze împreună, iar vânatul, și-au zis, l-or împărți frățește.
Ieșind la vânat, dar, dau peste o oală cu miere.
- Aha!, se repezi Cumătru-Nicola, - o pap!
- Mai întâi și mai întâi nu trebuie să spui: „o pap!“, ci „o păpăm“, i-a zis Cumătra-Mara. Apoi, nu te gândești, nici n-am dat bine de oală, și gata: pe ea! Dacă vrei să facem casă laolaltă, află că n-o facem înghițind orice agonisim...
- Fie, Mara, cum știi tu.
- Da, cum știu eu! Să facem și noi ca toată lumea. Acum, bunăoară, ascundem mierea și din trei în trei zile mergem să mâncăm din ea."""
prop_nlp = nlp(prop)
# Collect and print a (token text, POS tag) pair for every token of the sample.
pos_tags = []
for token in prop_nlp:
    pos_tags.append((str(token), token.pos_))
    print((str(token), token.pos_))