**KHELLADI Sid Ali**

**DOUID Mohamed**


# I) Back-end

In [12]:
# 1
# 1.1
import xml.etree.ElementTree as ET
# 1.2
import nltk
from nltk.tokenize import sent_tokenize
# 2
from collections import Counter, defaultdict
# 4
from gensim import corpora, models
# 5
from transformers import MarianMTModel, MarianTokenizer



In [13]:
# Fetch NLTK's "punkt" package into the local nltk_data directory.
# NOTE(review): presumably the tokenizer data that sent_tokenize relies on;
# recent NLTK releases may require "punkt_tab" instead — confirm.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**1) Prétraitement des Données**

**1.1) Extraction des \<AbstractText\>**

In [23]:
def extract_abstract_texts(file_path, max_abstracts=25):
    """Collect the text of the first ``<AbstractText>`` elements of an XML file.

    Parameters
    ----------
    file_path : str, path-like or file object
        XML document, passed as-is to ``xml.etree.ElementTree.parse``.
    max_abstracts : int or None, default 25
        Maximum number of ``<AbstractText>`` elements to read, in document
        order. ``None`` removes the limit; zero or a negative value yields an
        empty list.

    Returns
    -------
    list of str
        One string per element read. The string is the full text content of
        the element, including text nested inside child elements; an element
        without any text yields ``""``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    OSError
        If ``file_path`` cannot be opened.
    """
    root = ET.parse(file_path).getroot()
    abstracts = []

    for abstract in root.iter('AbstractText'):
        if max_abstracts is not None and len(abstracts) >= max_abstracts:
            break
        # ``abstract.text`` only holds the text located before the first child
        # element (and is None for an empty element), which silently truncates
        # abstracts containing inline markup. itertext() walks the whole
        # subtree so nothing is lost.
        abstracts.append(''.join(abstract.itertext()))

    return abstracts

**1.2) Segmentation en phrases**

In [24]:
def segment_sentences(abstracts):
    """Split each abstract into sentences and return them as one flat list.

    Entries of ``abstracts`` that are not ``str`` instances are skipped.
    Sentences keep the order of the abstracts they come from.
    """
    flat_sentences = []
    for abstract in abstracts:
        if not isinstance(abstract, str):
            continue
        flat_sentences.extend(sent_tokenize(abstract))
    return flat_sentences


**1.3) Création du fichier global**

In [32]:
def create_global_file(sentences, output_path):
    """Write ``sentences`` to ``output_path``, one sentence per line.

    The file is opened in write mode (existing content is replaced) and
    encoded as UTF-8. Every sentence is followed by a newline character.
    """
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.writelines(sentence + '\n' for sentence in sentences)


In [26]:
# 1.1 - Extract the abstracts from the sample XML file
# (the default limit of extract_abstract_texts applies: 25 elements).
abstracts = extract_abstract_texts("sample-0001.xml")


In [29]:
# 1.2 - Split the extracted abstracts into a flat list of sentences.
sentences = segment_sentences(abstracts)

In [31]:
# 1.3 - Write all sentences, one per line, to a single text file
# (path is relative to the notebook's working directory).
output_path = "output.txt"

create_global_file(sentences, output_path)


**2) Étiquetage Morphosyntaxique (POS Tagging)**

In [33]:
def compute_transition_matrices(tags):
    """Estimate relative frequencies of tags and of adjacent tag pairs.

    Parameters
    ----------
    tags : iterable of hashable
        Tag sequence in order of appearance. Any iterable is accepted,
        including one-shot iterators and generators.

    Returns
    -------
    unigram_probs : dict
        ``{tag: count(tag) / number of tags}``.
    bigram_probs : dict
        ``{(tag_i, tag_i+1): count(pair) / number of adjacent pairs}``.
        Note that each value is normalised by the total number of pairs, so
        these are joint relative frequencies of the pair, not conditional
        probabilities of the second tag given the first.

    Both dictionaries are empty for an empty sequence; ``bigram_probs`` is
    empty when fewer than two tags are given.
    """
    # Materialise once: a generator would be exhausted by the first Counter
    # and cannot be sliced to build the shifted sequence below.
    tags = list(tags)

    unigram_counts = Counter(tags)
    # Pair every tag with its successor.
    bigram_counts = Counter(zip(tags, tags[1:]))

    total_unigrams = sum(unigram_counts.values())
    total_bigrams = sum(bigram_counts.values())

    unigram_probs = {tag: count / total_unigrams for tag, count in unigram_counts.items()}
    bigram_probs = {bigram: count / total_bigrams for bigram, count in bigram_counts.items()}

    return unigram_probs, bigram_probs

In [47]:
# Small hand-made tag sequence used to sanity-check compute_transition_matrices.
myList = ["NOUN","VERB","NOUN","ADJ"]
unigram_probs, bigram_probs = compute_transition_matrices(myList)
print("unigram_probs : ",unigram_probs)
print("bigram_probs : ",bigram_probs)

unigram_probs :  {'NOUN': 0.5, 'VERB': 0.25, 'ADJ': 0.25}
bigram_probs :  {('NOUN', 'VERB'): 0.3333333333333333, ('VERB', 'NOUN'): 0.3333333333333333, ('NOUN', 'ADJ'): 0.3333333333333333}


**3) Reconnaissance d'Entités Nommées (NER)**

In [42]:
def compute_ner_transition_matrices(tags):
    """Estimate relative frequencies of NER tags and of adjacent NER tag pairs.

    Parameters
    ----------
    tags : sequence of hashable
        NER tag sequence in order of appearance.

    Returns
    -------
    (unigram_probs, bigram_probs) : tuple of dict
        ``unigram_probs`` maps each tag to ``count / number of tags``;
        ``bigram_probs`` maps each adjacent pair ``(tag_i, tag_i+1)`` to
        ``count / number of adjacent pairs``.
    """
    # The statistics do not depend on the tag inventory, so reuse the
    # implementation defined for POS tags instead of keeping a second copy.
    return compute_transition_matrices(tags)


**4) Modélisation des Sujets (Topic Modeling)**

In [45]:
def topic_modeling(texts, num_topics=3, num_words=4, passes=15, random_state=None):
    """Fit an LDA model on tokenised documents and return its topics.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents (one list of tokens per document).
    num_topics : int, default 3
        Number of topics to fit.
    num_words : int, default 4
        Number of words reported per topic.
    passes : int, default 15
        Number of training passes over the corpus.
    random_state : int or None, default None
        Seed forwarded to ``LdaModel``. Pass an integer to make the fitted
        topics reproducible across runs; ``None`` keeps gensim's default
        unseeded behaviour.

    Returns
    -------
    list
        Whatever ``LdaModel.print_topics`` returns for the fitted model
        (one entry per topic).
    """
    # ``texts`` is traversed twice (dictionary, then bag-of-words corpus);
    # materialise it so a one-shot iterable is not exhausted after the first.
    texts = list(texts)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    # Ask explicitly for every fitted topic: print_topics has its own default
    # cap on the number of topics, which would drop some when num_topics
    # exceeds it.
    return ldamodel.print_topics(num_topics=num_topics, num_words=num_words)


**5) Traduction**

In [48]:
# Loaded (model, tokenizer) pairs keyed by checkpoint name, so that repeated
# calls do not reload the pretrained weights every time.
_MARIAN_CACHE = {}


def translate_text(text, src_lang='en', tgt_lang='fr'):
    """Translate text with a Helsinki-NLP MarianMT checkpoint.

    Parameters
    ----------
    text : str or list of str
        Text(s) handed to the tokenizer as a single padded batch.
    src_lang : str, default 'en'
        Source language code used to build the checkpoint name.
    tgt_lang : str, default 'fr'
        Target language code used to build the checkpoint name.

    Returns
    -------
    list of str
        One decoded translation per generated sequence.

    Notes
    -----
    The model and tokenizer for ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``
    are loaded on first use and then kept in memory for later calls with the
    same language pair.
    """
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'

    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianMTModel.from_pretrained(model_name),
            MarianTokenizer.from_pretrained(model_name),
        )
    model, tokenizer = _MARIAN_CACHE[model_name]

    batch = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**batch)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]


# II) Front-end 

In [49]:
from flask import Flask, request, jsonify

**1) Interface Utilisateur**

In [None]:
from flask import Flask, request, jsonify

# Flask application object; the route decorators below register on it.
app = Flask(__name__)

@app.route('/')
def index():
    """Handle requests to the root URL by returning a fixed greeting string."""
    greeting = 'Hello, World!'
    return greeting

@app.route('/api/data', methods=['POST'])
def get_data():
    """Echo the JSON body of a POST request back under the ``received`` key."""
    return jsonify({'received': request.get_json()})

if __name__ == '__main__':
    try:
        # Start the Flask server with debug mode on and the auto-reloader off.
        # NOTE(review): debug mode should stay limited to local development —
        # confirm before exposing this server to anyone else.
        app.run(debug=True, use_reloader=False)
    except Exception as e:
        # Any error raised while starting or running the server is only
        # printed, not re-raised, so the cell itself never fails.
        print(f"An error occurred: {e}")
