In [60]:
import xml.etree.ElementTree as ET
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.linear_model import Perceptron, SGDClassifier

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/flore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
party_mapping = {'ELDR': 0, 'GUE-NGL': 1, 'PPE-DE': 2, 'PSE': 3, 'Verts-ALE': 4}

In [62]:
def pretrait_texte(texte):
    texte = texte.lower()
    texte = ''.join([char for char in texte if char not in string.punctuation])
    return texte

In [63]:
def parse_xml(file_path, is_train=True):
    tree = ET.parse(file_path)
    root = tree.getroot()

    textes = []
    labels = []
    text_id = {}

    for doc in root.findall('.//doc'):
        doc_id = int(doc.attrib['id'].split(":")[-1]) - 1
        texte = " ".join([p.text.replace('\xa0', '') for p in doc.findall('.//texte/p') if p.text is not None])
        texte = pretrait_texte(texte)
        textes.append(texte)
        text_id[texte] = doc_id

        if is_train:
            label = doc.find('.//PARTI').attrib['valeur']
            labels.append(party_mapping[label])

    if is_train:
        return textes, labels, text_id

    return textes, text_id

In [64]:
def parse_reference(file_path, test_dict, X_test):
    references = {}
    valid_textes = []
    valid_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            partis = line.strip().split('\t')
            if len(partis) < 2 or partis[1] not in party_mapping:
                print(f"Skipping line: {line.strip()}")
                continue

            ref_id = int(partis[0]) - 1
            if ref_id in test_dict.values():
                references[ref_id] = party_mapping[partis[1]]

    for texte in X_test:
        if test_dict[texte] in references:
            valid_textes.append(texte)
            valid_labels.append(references[test_dict[texte]])

    return valid_textes, valid_labels

## Corpus anglais

In [65]:
train_file = './Corpus d_apprentissage/deft09_parlement_appr_en.xml'
X_train, y_train, train_dict = parse_xml(train_file, is_train=True)

test_file = './Corpus de test/deft09_parlement_test_en.xml'
X_test, test_dict = parse_xml(test_file, is_train=False)

ref_file = './Données de référence/deft09_parlement_ref_en.txt'
X_test, y_test = parse_reference(ref_file, test_dict, X_test)

Skipping line: 2602
Skipping line: 12172


In [66]:
print(len(X_test), len(y_test))

12913 12913


In [67]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [68]:
# liste de modèles

models = {
    'LinearSVC': LinearSVC(),
    'Perceptron': Perceptron(),
    'SGDClassifier': SGDClassifier(),
}

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")
    clf = model
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    print(classification_report(y_test, y_pred))


Evaluating model: LinearSVC
              precision    recall  f1-score   support

           0       0.83      0.66      0.74      1339
           1       0.84      0.78      0.81      1792
           2       0.74      0.83      0.79      4570
           3       0.72      0.74      0.73      3627
           4       0.79      0.67      0.73      1585

    accuracy                           0.76     12913
   macro avg       0.79      0.74      0.76     12913
weighted avg       0.77      0.76      0.76     12913

Evaluating model: Perceptron
              precision    recall  f1-score   support

           0       0.75      0.72      0.74      1339
           1       0.84      0.78      0.81      1792
           2       0.79      0.81      0.80      4570
           3       0.75      0.78      0.76      3627
           4       0.75      0.72      0.74      1585

    accuracy                           0.78     12913
   macro avg       0.78      0.76      0.77     12913
weighted avg       0

## Corpus français

In [69]:
train_file = './Corpus d_apprentissage/deft09_parlement_appr_fr.xml'
X_train, y_train, train_dict = parse_xml(train_file, is_train=True)

test_file = './Corpus de test/deft09_parlement_test_fr.xml'
X_test, test_dict = parse_xml(test_file, is_train=False)

ref_file = './Données de référence/deft09_parlement_ref_fr.txt'
X_test, y_test = parse_reference(ref_file, test_dict, X_test)

Skipping line: 1175
Skipping line: 4574


In [70]:
print(len(X_test), len(y_test))

12915 12915


In [71]:
french_stopwords = stopwords.words('french')

vectorizer = TfidfVectorizer(stop_words=french_stopwords)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [72]:
# liste de modèles

models = {
    'LinearSVC': LinearSVC(),
    'Perceptron': Perceptron(),
    'SGDClassifier': SGDClassifier(),
}

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")
    clf = model
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    print(classification_report(y_test, y_pred))


Evaluating model: LinearSVC
              precision    recall  f1-score   support

           0       0.82      0.65      0.73      1339
           1       0.85      0.82      0.83      1793
           2       0.76      0.84      0.80      4571
           3       0.74      0.76      0.75      3627
           4       0.82      0.70      0.76      1585

    accuracy                           0.78     12915
   macro avg       0.80      0.76      0.77     12915
weighted avg       0.78      0.78      0.78     12915

Evaluating model: Perceptron
              precision    recall  f1-score   support

           0       0.73      0.70      0.71      1339
           1       0.85      0.79      0.82      1793
           2       0.79      0.82      0.81      4571
           3       0.75      0.77      0.76      3627
           4       0.77      0.71      0.74      1585

    accuracy                           0.78     12915
   macro avg       0.78      0.76      0.77     12915
weighted avg       0

## Corpus italien

In [73]:
train_file = './Corpus d_apprentissage/deft09_parlement_appr_it.xml'
X_train, y_train, train_dict = parse_xml(train_file, is_train=True)

test_file = './Corpus de test/deft09_parlement_test_it.xml'
X_test, test_dict = parse_xml(test_file, is_train=False)

ref_file = './Données de référence/deft09_parlement_ref_it.txt'
X_test, y_test = parse_reference(ref_file, test_dict, X_test)

Skipping line: 1239
Skipping line: 8634


In [74]:
print(len(X_test), len(y_test))

12915 12915


In [77]:
italian_stopwords = stopwords.words('italian')

vectorizer = TfidfVectorizer(stop_words=italian_stopwords)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [78]:
# liste de modèles

models = {
    'LinearSVC': LinearSVC(),
    'Perceptron': Perceptron(),
    'SGDClassifier': SGDClassifier(),
}

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")
    clf = model
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)

    print(classification_report(y_test, y_pred))


Evaluating model: LinearSVC
              precision    recall  f1-score   support

           0       0.82      0.65      0.73      1339
           1       0.86      0.79      0.82      1793
           2       0.76      0.84      0.80      4571
           3       0.75      0.78      0.76      3627
           4       0.80      0.69      0.74      1585

    accuracy                           0.78     12915
   macro avg       0.80      0.75      0.77     12915
weighted avg       0.78      0.78      0.78     12915

Evaluating model: Perceptron
              precision    recall  f1-score   support

           0       0.72      0.69      0.71      1339
           1       0.82      0.79      0.80      1793
           2       0.80      0.81      0.80      4571
           3       0.76      0.78      0.77      3627
           4       0.73      0.70      0.72      1585

    accuracy                           0.77     12915
   macro avg       0.76      0.75      0.76     12915
weighted avg       0