# Importing Necessary Libraries
This section imports the libraries required for processing the data and training models.

In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron, SGDClassifier

# Helper Functions
This section defines functions for data extraction, processing, and cleaning.

In [2]:
def extract_text_party_pairs(file_path: str) -> np.ndarray:
    """Read a labelled corpus XML file and return its (text, party) rows.

    Every ``<doc>`` element that has both a ``<PARTI valeur="...">`` descendant
    and a ``<texte>`` descendant contributes one row. The text is the
    space-joined content of the ``<p>`` children of ``<texte>``; paragraphs
    with no text are left out.

    Args:
        file_path: Path to a UTF-8 encoded XML corpus file.

    Returns:
        A string array of shape ``(n_docs, 2)``: column 0 is the document
        text, column 1 the party label. The shape is ``(0, 2)`` when no
        document qualifies, so column slicing stays valid.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    with open(file_path, encoding="utf-8") as file:
        xml_string = file.read()
    root = ET.fromstring(xml_string)

    pairs = []
    for doc in root.findall('.//doc'):
        parti_elem = doc.find('.//PARTI')
        text_elem = doc.find('.//texte')
        if parti_elem is None or text_elem is None:
            continue
        party = parti_elem.get('valeur')
        if party is None:
            # Without this guard np.array(..., dtype=str) would turn the
            # missing attribute into the literal label 'None'.
            continue
        full_text = ' '.join(p.text for p in text_elem.findall('p') if p.text)
        pairs.append([full_text, party])

    if not pairs:
        # np.array([]) would be 1-D; keep the two-column layout explicit.
        return np.empty((0, 2), dtype=str)
    return np.array(pairs, dtype=str)

def extract_text_pairs_without_party(file_path: str) -> np.ndarray:
    """Read an unlabelled corpus XML file and return one text per ``<doc>``.

    The text of a document is the space-joined content of the ``<p>`` children
    of its ``<texte>`` descendant; paragraphs with no text are left out.

    The result has exactly one entry per ``<doc>`` element, in document order.
    A document without ``<texte>`` yields an empty string rather than being
    dropped, so that entry ``i`` still corresponds to the ``i``-th document
    when the texts are paired positionally with an external label list.

    Args:
        file_path: Path to a UTF-8 encoded XML corpus file.

    Returns:
        A 1-D string array with one text per ``<doc>`` element.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    with open(file_path, encoding="utf-8") as file:
        xml_string = file.read()
    root = ET.fromstring(xml_string)

    texts = []
    for doc in root.findall('.//doc'):
        text_elem = doc.find('.//texte')
        if text_elem is None:
            # Placeholder keeps positions stable (see docstring).
            texts.append('')
            continue
        texts.append(' '.join(p.text for p in text_elem.findall('p') if p.text))
    return np.array(texts, dtype=str)

def load_reference_labels(file_path: str) -> np.ndarray:
    """Return the lines of a UTF-8 text file as a 1-D string array.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return np.array(stripped_lines, dtype=str)

def remove_punctuation(text):
    """Drop every ASCII punctuation character and lowercase what remains.

    Characters are lowercased one at a time, and removed punctuation is not
    replaced by a space, so "l'Union" becomes "lunion".
    """
    kept = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept.append(symbol.lower())
    return ''.join(kept)

# Defining Dataset Paths and Models
This section defines the party-to-integer label mapping, specifies the paths to the English, French, and Italian datasets, and defines the models to evaluate.

In [3]:
# Party label -> integer class id, numbered in the order listed here.
party_mapping = {
    party: class_id
    for class_id, party in enumerate(
        ("ELDR", "GUE-NGL", "PPE-DE", "PSE", "Verts-ALE")
    )
}

# Per-language corpus files: training XML, test XML, and the reference label
# file for the test set. The three languages differ only in the file suffix.
datasets = {
    code: {
        "train": f"./Corpus d_apprentissage/deft09_parlement_appr_{code}.xml",
        "test": f"./Corpus de test/deft09_parlement_test_{code}.xml",
        "reference": f"./Données de référence/deft09_parlement_ref_{code}.txt",
    }
    for code in ("en", "fr", "it")
}

# Shared seed so the stochastic solvers produce the same scores on every
# Restart & Run All. 0 is already Perceptron's default random_state, so its
# behaviour is unchanged; SGDClassifier and LinearSVC were previously unseeded.
RANDOM_STATE = 0

# Display name -> estimator. Each estimator is re-fitted for every language.
models = {
    "LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "Perceptron": Perceptron(random_state=RANDOM_STATE),
    "SGDClassifier": SGDClassifier(random_state=RANDOM_STATE)
}

# Training and Evaluating Models
This section loops through each language and evaluates multiple models for classification.

In [4]:
# For every language: load the labelled training corpus and the unlabelled
# test corpus, pair the test texts positionally with the reference label file,
# build TF-IDF features, then fit and score every model in `models`.
for lang, paths in datasets.items():
    print(f"Processing language: {lang.upper()}")

    # Training data: column 0 is the text, column 1 the party label.
    train_data = extract_text_party_pairs(paths["train"])
    if len(train_data) == 0:
        raise ValueError(f"Training data is empty for {lang}. Please check the training file format.")
    X_train = train_data[:, 0]
    y_train = train_data[:, 1]

    # Fail with a readable message instead of a bare KeyError during encoding.
    unknown_train_labels = sorted(set(y_train.tolist()) - set(party_mapping))
    if unknown_train_labels:
        raise ValueError(f"Unknown party labels in training data for {lang}: {unknown_train_labels}")

    test_data = extract_text_pairs_without_party(paths["test"])
    if len(test_data) == 0:
        raise ValueError(f"Testing data is empty for {lang}. Please check the test file format.")
    X_test = test_data

    # Reference labels are matched to test documents by position only, so a
    # count mismatch means zip() below would silently drop the surplus.
    y_reference = load_reference_labels(paths["reference"])
    if len(X_test) != len(y_reference):
        print(
            f"Warning: {len(X_test)} test documents but {len(y_reference)} reference lines "
            f"for {lang}; extra entries are dropped and the pairing may be misaligned."
        )

    # Keep only the test documents whose reference line carries a known party.
    # The party label is the second tab-separated field of the line.
    y_reference_cleaned = []
    X_test_aligned = []
    for text, line in zip(X_test, y_reference):
        if not (line.strip() and '\t' in line):
            print(f"Ignored line: {line}")
            continue
        party_label = line.split('\t')[1].strip()
        if party_label not in party_mapping:
            print(f"Unknown party label: {party_label}")
            continue
        y_reference_cleaned.append(party_label)
        X_test_aligned.append(text)

    if not X_test_aligned:
        raise ValueError(f"No test document with a usable reference label for {lang}.")

    y_train_encoded = np.array([party_mapping[label] for label in y_train])
    y_reference_encoded = np.array([party_mapping[label] for label in y_reference_cleaned])

    # Vectorize the text data. scikit-learn ships only an English stop-word
    # list, so it is applied to the English corpus alone; for the other
    # languages very frequent terms are still pruned by max_df.
    stop_words = 'english' if lang == 'en' else None
    vectorizer = TfidfVectorizer(preprocessor=remove_punctuation, stop_words=stop_words, max_df=0.9, ngram_range=(1, 2))
    X_train_transformed = vectorizer.fit_transform(X_train)
    X_test_transformed = vectorizer.transform(X_test_aligned)

    # Passing `labels` pins one report row per party, so target_names always
    # matches even when a party is absent from the references and predictions.
    class_names = list(party_mapping.keys())
    class_ids = list(party_mapping.values())

    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"Evaluating model: {model_name} for language: {lang.upper()}")
        model.fit(X_train_transformed, y_train_encoded)
        y_pred = model.predict(X_test_transformed)
        print(classification_report(y_reference_encoded, y_pred, labels=class_ids, target_names=class_names))
        print("-" * 80)


Processing language: EN
Ignored line: 2602
Ignored line: 12172
Evaluating model: LinearSVC for language: EN
              precision    recall  f1-score   support

        ELDR       0.93      0.69      0.80      1339
     GUE-NGL       0.90      0.81      0.85      1793
      PPE-DE       0.76      0.90      0.82      4571
         PSE       0.79      0.79      0.79      3627
   Verts-ALE       0.89      0.70      0.78      1585

    accuracy                           0.81     12915
   macro avg       0.85      0.78      0.81     12915
weighted avg       0.82      0.81      0.81     12915

--------------------------------------------------------------------------------
Evaluating model: Perceptron for language: EN
              precision    recall  f1-score   support

        ELDR       0.81      0.74      0.77      1339
     GUE-NGL       0.85      0.81      0.83      1793
      PPE-DE       0.80      0.85      0.82      4571
         PSE       0.80      0.79      0.79      3627
   Ve