In [46]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [47]:
party_mapping = {'ELDR': 0, 'GUE-NGL': 1, 'PPE-DE': 2, 'PSE': 3, 'Verts-ALE': 4}

In [48]:
def parse_xml(file_path, is_train=True):
    tree = ET.parse(file_path)
    root = tree.getroot()

    textes = []
    labels = []
    text_id = {}

    for doc in root.findall('.//doc'):
        doc_id = int(doc.attrib['id'].split(":")[-1]) - 1
        texte = " ".join([p.text.replace('\xa0', '') for p in doc.findall('.//texte/p') if p.text is not None])
        textes.append(texte)
        text_id[texte] = doc_id

        if is_train:
            label = doc.find('.//PARTI').attrib['valeur']
            labels.append(party_mapping[label])

    if is_train:
        return textes, labels, text_id

    return textes, text_id

In [49]:
def parse_reference(file_path, test_dict, X_test):
    references = {}
    valid_textes = []
    valid_labels = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            partis = line.strip().split('\t')
            if len(partis) < 2 or partis[1] not in party_mapping:
                print(f"Skipping line: {line.strip()}")
                continue

            ref_id = int(partis[0]) - 1
            if ref_id in test_dict.values():
                references[ref_id] = party_mapping[partis[1]]

    for texte in X_test:
        if test_dict[texte] in references:
            valid_textes.append(texte)
            valid_labels.append(references[test_dict[texte]])

    return valid_textes, valid_labels

In [50]:
train_file = './Corpus d_apprentissage/deft09_parlement_appr_en.xml'
X_train, y_train, train_dict = parse_xml(train_file, is_train=True)

In [51]:
test_file = './Corpus de test/deft09_parlement_test_en.xml'
X_test, test_dict = parse_xml(test_file, is_train=False)

In [52]:
ref_file = './Données de référence/deft09_parlement_ref_it.txt'
X_test, y_test = parse_reference(ref_file, test_dict, X_test)

Skipping line: 1239
Skipping line: 8634


In [53]:
print(len(X_test), len(y_test))

12915 12915


In [54]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [55]:
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.10      0.08      0.09      1338
           1       0.13      0.12      0.13      1793
           2       0.36      0.42      0.39      4573
           3       0.29      0.29      0.29      3627
           4       0.13      0.10      0.11      1584

    accuracy                           0.27     12915
   macro avg       0.20      0.20      0.20     12915
weighted avg       0.25      0.27      0.26     12915

