**KHELLADI Sid Ali**

**DOUID Mohamed**


# I) Back-end

- **Part-of-speech tagging:** we reuse the approach from “lab 01” to compute the unigram and bigram transition matrices, which contain the probabilities of the different POS tags.

In [255]:
# Standard library
# 1.1
import xml.etree.ElementTree as ET
# 2
from collections import Counter, defaultdict
from pathlib import Path

# Third-party
# 1.2
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# 2
from nltk import pos_tag
import numpy as np
import pandas as pd
# 4
from gensim import corpora, models
# 5
from transformers import MarianMTModel, MarianTokenizer



In [256]:
# Limit how much of a DataFrame is rendered: at most 40 columns and 40 rows
# before pandas switches to a truncated view.
pd.set_option("display.max_columns", 40)
pd.set_option("display.max_rows", 40)

In [257]:
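# One-time setup: uncomment and run once on a fresh environment to download
# the NLTK tokenizer data required by sent_tokenize / word_tokenize.
# nltk.download("punkt")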

**1) Prétraitement des Données**

**1.1) Extraction des \<AbstractText\>**

In [258]:
def extract_abstract_texts(file_path, max_abstracts=25):
    """Collect the text of the first ``<AbstractText>`` elements of an XML file.

    Args:
        file_path: Path of the XML document to parse.
        max_abstracts: Maximum number of ``<AbstractText>`` elements to read,
            in document order. ``None`` reads all of them.

    Returns:
        list[str]: One string per element read. Text nested inside inline
        child elements (e.g. ``<i>`` or ``<sub>``) is included; an element
        without any text yields an empty string.
    """
    root = ET.parse(file_path).getroot()

    abstracts = []
    for position, element in enumerate(root.iter('AbstractText')):
        if max_abstracts is not None and position >= max_abstracts:
            break
        # ``element.text`` only holds the text located before the first child
        # tag, which truncates abstracts containing inline markup;
        # ``itertext()`` walks the whole subtree instead.
        abstracts.append("".join(element.itertext()))

    return abstracts

**1.2) Segmentation en phrases**

In [259]:
def segment_sentences(abstracts):
    """Split every abstract into sentences and return them as one flat list.

    Entries that are not ``str`` (for example ``None``) are skipped. The
    sentences keep the order of the abstracts they come from.
    """
    all_sentences = []
    for abstract in abstracts:
        if not isinstance(abstract, str):
            continue
        all_sentences.extend(sent_tokenize(abstract))
    return all_sentences


**1.3) Création du fichier global**

In [260]:
def create_global_file(sentences, output_path):
    """Write the sentences to ``output_path`` as UTF-8 text, one per line.

    The file is created or overwritten; every sentence is followed by a
    newline character.
    """
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(line + '\n' for line in sentences)


In [261]:
# 1.1 - read the abstracts from the sample XML file
abstracts = extract_abstract_texts(file_path="sample-0001.xml")


In [262]:
# 1.2 - split the extracted abstracts into individual sentences
sentences = segment_sentences(abstracts=abstracts)

In [263]:
# 1.3 - write one sentence per line to the global text file
output_path = "output.txt"
create_global_file(sentences=sentences, output_path=output_path)


**2) Part of speech tagging (POS Tagging)**

In [264]:
def compute_transition_matrices(tags):
    """Compute relative frequencies of single tags and of adjacent tag pairs.

    Args:
        tags: Sequence of tags (it is sliced, so it must support slicing).

    Returns:
        tuple[dict, dict]: ``(unigram_probs, bigram_probs)`` where
        ``unigram_probs[tag]`` is the tag count divided by the number of tags,
        and ``bigram_probs[(tag_a, tag_b)]`` is the count of that adjacent
        pair divided by the total number of adjacent pairs (a joint
        frequency, not a probability conditioned on ``tag_a``).
    """
    single_counts = Counter(tags)
    # Pair each tag with its successor; the last tag has none.
    pair_counts = Counter(zip(tags, tags[1:]))

    n_singles = sum(single_counts.values())
    n_pairs = sum(pair_counts.values())

    unigram_probs = {}
    for tag, occurrences in single_counts.items():
        unigram_probs[tag] = occurrences / n_singles

    bigram_probs = {}
    for pair, occurrences in pair_counts.items():
        bigram_probs[pair] = occurrences / n_pairs

    return unigram_probs, bigram_probs

In [265]:
# Tokenise every sentence and chain the tokens into one corpus-wide list.
words = [token for text in sentences for token in word_tokenize(text)]
# The tagger returns (token, tag) pairs; only the tag sequence is kept.
tags = [pos for _token, pos in pos_tag(words)]

In [299]:
unigram_probs, bigram_probs = compute_transition_matrices(tags)

# Dictionary keys are already unique and keep their insertion order, so the
# list of distinct tags is simply the keys of the unigram table.
list_tags = list(unigram_probs)

{'DT': 0.07128446536650974,
 'JJ': 0.10793544048419637,
 'NN': 0.18897108271687962,
 'NNP': 0.05514458641560188,
 'IN': 0.1207128446536651,
 'NNS': 0.06657700067249496,
 'VBD': 0.03026227303295225,
 '(': 0.01714862138533961,
 ')': 0.01714862138533961,
 'CD': 0.05211835911230666,
 'TO': 0.019166106254203095,
 'VB': 0.016812373907195696,
 'CC': 0.03194351042367182,
 ',': 0.04404841963685272,
 '.': 0.03732347007397444,
 'VBN': 0.026899798251513115,
 'NNPS': 0.0006724949562878278,
 'VBP': 0.008742434431741762,
 'RB': 0.02051109616677875,
 'VBG': 0.011432414256893073,
 'JJR': 0.006724949562878279,
 'VBZ': 0.015131136516476126,
 '``': 0.0003362474781439139,
 "''": 0.0003362474781439139,
 'PRP': 0.009078681909885675,
 'WP': 0.0020174848688634837,
 'RBR': 0.0003362474781439139,
 '$': 0.0006724949562878278,
 'MD': 0.005043712172158709,
 'WDT': 0.003698722259583053,
 'WRB': 0.0010087424344317419,
 'EX': 0.0013449899125756557,
 ':': 0.004034969737726967,
 'PRP$': 0.003026227303295225,
 'JJS': 0.0

**2.1) Displaying Unigram probabilities**

In [300]:
# Two-column table: one row per POS tag with its relative frequency.
df_unigram_probs = pd.DataFrame(
    {
        "pos_tag": list(unigram_probs.keys()),
        "probs": list(unigram_probs.values()),
    }
)
df_unigram_probs

Unnamed: 0,pos_tag,probs
0,DT,0.071284
1,JJ,0.107935
2,NN,0.188971
3,NNP,0.055145
4,IN,0.120713
5,NNS,0.066577
6,VBD,0.030262
7,(,0.017149
8,),0.017149
9,CD,0.052118


**2.1.2) Exporting Unigram probabilities to an Excel file**

In [302]:
excel_file_unigram_probs = 'excel/unigram_probs.xlsx'
# to_excel does not create missing folders, so make sure the target directory
# exists first; this keeps the cell runnable from a fresh checkout.
Path(excel_file_unigram_probs).parent.mkdir(parents=True, exist_ok=True)
df_unigram_probs.to_excel(excel_file_unigram_probs, index=True)


**2.2) Displaying Bigram probabilities**

In [281]:
def getIndexOfTag(tag,list_tags):
    """Return the position of the first entry of ``list_tags`` equal to ``tag``.

    Returns ``None`` when no entry matches.
    """
    matches = (position for position, candidate in enumerate(list_tags) if candidate == tag)
    return next(matches, None)

In [305]:
# def df_bigram_prob_func(bigram_probs):
    
#     list_tags = []
#     for key,value in bigram_probs.items():
#         if key[0] not in list_tags:
#             list_tags.append(key[0])
            
#     premieres_tags = [cle[0] for cle in bigram_probs.keys()]
#     deuxiemes_tags = [cle[1] for cle in bigram_probs.keys()]

#     val_bigram_probs_of_tag = [cle for cle in bigram_probs.values()]
    
#     bigram_probs_mat = np.zeros((len(list_tags),len(list_tags)))
    
#     for i in range(len(premieres_tags)):
#         bigram_probs_mat[
#             getIndexOfTag(
#                 premieres_tags[i],
#                 list_tags
#             ),
#             getIndexOfTag(
#                 deuxiemes_tags[i],
#                 list_tags
#             )
#         ] = val_bigram_probs_of_tag[i]
    
#     return pd.DataFrame(bigram_probs_mat, columns=list_tags, index=list_tags)

# ------------------------------------------------------------------------------
# --------------------- simplified version of the code above -------------------
# ------------------------------------------------------------------------------

def df_bigram_prob_func(bigram_probs):
    """Arrange bigram probabilities as a square tag-by-tag DataFrame.

    Args:
        bigram_probs: Mapping ``(first_tag, second_tag) -> probability``.

    Returns:
        pandas.DataFrame: Rows are indexed by the first tag and columns by the
        second tag, both in order of first appearance among the keys. Pairs
        absent from ``bigram_probs`` are filled with ``0.0``.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered_tags = list(
        dict.fromkeys(tag for pair in bigram_probs for tag in (pair[0], pair[1]))
    )
    position = dict(zip(ordered_tags, range(len(ordered_tags))))

    size = len(ordered_tags)
    matrix = np.zeros((size, size))
    for pair, probability in bigram_probs.items():
        first_tag, second_tag = pair
        matrix[position[first_tag], position[second_tag]] = probability

    return pd.DataFrame(matrix, index=ordered_tags, columns=ordered_tags)


In [306]:
# Square table of bigram probabilities (rows: first tag, columns: second tag).
df_bigram_probs = df_bigram_prob_func(bigram_probs=bigram_probs)
df_bigram_probs

Unnamed: 0,DT,JJ,NN,NNP,IN,NNS,VBD,(,),CD,TO,VB,CC,",",.,VBN,NNPS,VBP,RB,VBG,JJR,VBZ,``,'',PRP,WP,RBR,$,MD,WDT,WRB,EX,:,PRP$,JJS,POS,RBS,PDT
DT,0.0,0.0185,0.029936,0.007064,0.001682,0.006727,0.000673,0.0,0.0,0.002018,0.0,0.0,0.000336,0.0,0.0,0.001009,0.0,0.0,0.000336,0.000336,0.000673,0.000673,0.000336,0.0,0.000336,0.0,0.0,0.0,0.000336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000336,0.0
JJ,0.0,0.012109,0.057181,0.004036,0.0037,0.018163,0.0,0.0,0.001009,0.002018,0.003364,0.0,0.001682,0.001682,0.001682,0.000673,0.0,0.000336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NN,0.000336,0.005382,0.024891,0.005718,0.052136,0.010764,0.008073,0.008073,0.006054,0.001682,0.005045,0.0,0.009418,0.019845,0.015473,0.001345,0.0,0.001009,0.000673,0.001009,0.0,0.006054,0.0,0.0,0.000336,0.0,0.0,0.0,0.001345,0.001682,0.0,0.0,0.002018,0.0,0.0,0.000673,0.0,0.0
NNP,0.0,0.002018,0.006727,0.009418,0.004373,0.003364,0.000673,0.004709,0.005718,0.008409,0.000336,0.000336,0.001682,0.001345,0.002691,0.0,0.0,0.000336,0.000336,0.001345,0.0,0.001009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000336,0.0,0.0
IN,0.029263,0.027918,0.025563,0.006727,0.002691,0.006054,0.000336,0.0,0.0,0.010427,0.0,0.000336,0.000673,0.000673,0.0,0.000673,0.000336,0.0,0.001345,0.004373,0.000336,0.0,0.0,0.0,0.000336,0.000336,0.0,0.0,0.0,0.000673,0.0,0.0,0.0,0.001009,0.000673,0.0,0.0,0.0
NNS,0.0,0.001009,0.0,0.0,0.020182,0.0,0.006727,0.002018,0.001345,0.000336,0.000673,0.0,0.004036,0.008745,0.009082,0.002018,0.0,0.0037,0.000336,0.001009,0.0,0.000673,0.0,0.0,0.000336,0.001345,0.0,0.0,0.001009,0.000673,0.0,0.0,0.001345,0.0,0.0,0.0,0.0,0.0
VBD,0.004709,0.004036,0.001345,0.000336,0.002355,0.001009,0.0,0.000336,0.0,0.001682,0.001345,0.0,0.0,0.0,0.000336,0.007064,0.0,0.0,0.002355,0.000673,0.001682,0.0,0.0,0.0,0.000336,0.0,0.000336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000336
(,0.000336,0.002691,0.000336,0.009418,0.000336,0.0,0.0,0.0,0.0,0.003364,0.0,0.000673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
),0.000336,0.0,0.001009,0.0,0.002691,0.0,0.001682,0.0,0.0,0.0,0.0,0.0,0.001345,0.002355,0.005045,0.0,0.0,0.0,0.0,0.000336,0.0,0.001009,0.0,0.0,0.0,0.0,0.0,0.0,0.001345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD,0.0,0.004036,0.021191,0.0,0.002691,0.011436,0.000336,0.001009,0.003027,0.0,0.003027,0.0,0.0,0.004373,0.0,0.0,0.0,0.0,0.0,0.000336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000673,0.0,0.0,0.0,0.0,0.0


In [304]:
excel_file_bigram_probs = 'excel/bigram_probs.xlsx'
# to_excel does not create missing folders, so make sure the target directory
# exists first; this keeps the cell runnable from a fresh checkout.
Path(excel_file_bigram_probs).parent.mkdir(parents=True, exist_ok=True)
df_bigram_probs.to_excel(excel_file_bigram_probs, index=True)
