In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import string 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

In [2]:
def extract_train_party_text_pairs(xml_string: str) -> np.ndarray:
    """Parse a corpus XML string into an array of (text, party) rows.

    Each ``doc`` element found below the root contributes one row when it has
    both a ``PARTI`` descendant carrying a ``valeur`` attribute and a
    ``texte`` descendant. The text of a row is the space-joined content of the
    direct ``p`` children of ``texte``.

    Args:
        xml_string: The complete XML document as a string.

    Returns:
        A 2-D ``numpy`` string array of shape ``(n_docs, 2)``; column 0 holds
        the document text and column 1 the party label. The shape is
        ``(0, 2)`` when no document qualifies, so column slicing such as
        ``result[:, 0]`` is always valid.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_string`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_string)

    pairs = []
    for doc in root.findall('.//doc'):
        parti_elem = doc.find('.//PARTI')
        text_elem = doc.find('.//texte')
        if parti_elem is None or text_elem is None:
            continue

        party = parti_elem.get('valeur')
        if party is None:
            # Without this guard the missing label would be stringified to
            # the literal 'None' by the dtype=str conversion below and end up
            # as a bogus class.
            continue

        paragraphs = []
        for p in text_elem.findall('p'):
            # itertext() also yields text located after inline child elements;
            # p.text alone stops at the first child and truncates the paragraph.
            paragraph = ''.join(p.itertext())
            if paragraph:
                paragraphs.append(paragraph)

        pairs.append([' '.join(paragraphs), party])

    # reshape keeps the result two-dimensional even when `pairs` is empty.
    return np.array(pairs, dtype=str).reshape(-1, 2)

In [3]:
# Read the whole training corpus into memory as UTF-8 text.
with open(
    "./Corpus d_apprentissage/deft09_parlement_appr_en.xml",
    encoding="utf-8",
) as corpus_file:
    train_text = corpus_file.read()

# Turn the raw XML into a 2-column array of (text, party) rows.
data = extract_train_party_text_pairs(train_text)




In [33]:
# Column 0 holds the document text, column 1 the party label.
X_data, y_data = data[:, 0], data[:, 1]

# Hold out 10% of the documents for evaluation; stratifying on the label keeps
# the party proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.1,
    stratify=y_data,
    random_state=42,
)

def remove_punctuation(text):
    """Return ``text`` lower-cased with ASCII punctuation characters removed.

    Characters listed in ``string.punctuation`` are dropped (not replaced by a
    space); every remaining character is lower-cased individually.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch.lower())
    return ''.join(kept)



# TF-IDF over unigrams and bigrams. The custom preprocessor takes the place of
# the vectorizer's built-in preprocessing step, so lower-casing is done by
# remove_punctuation itself.
vectorizer = TfidfVectorizer(
    preprocessor=remove_punctuation,
    stop_words='english',
    max_df=0.9,
    ngram_range=(1, 2),
)

# Fit the vocabulary and IDF weights on the training part only, then reuse
# them unchanged for the held-out part.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [34]:
# Sanity check: feature matrices and label vectors must agree on row counts.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(17433, 1372670)
(17433,)
(1937, 1372670)
(1937,)


In [45]:
# Peek at the training labels (numpy abbreviates long arrays when printed).
print(y_train)

['ELDR' 'PSE' 'PPE-DE' ... 'PSE' 'PPE-DE' 'PPE-DE']


In [43]:
# Baseline classifier: a linear SVM trained on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) because the
# solver may shuffle the data using a random number generator; without a seed
# the metrics below can vary from one run to the next.
clf = LinearSVC(random_state=42)

clf.fit(X_train, y_train)

# Evaluate on the held-out documents.
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ELDR       0.69      0.22      0.33       201
     GUE-NGL       0.66      0.49      0.56       269
      PPE-DE       0.51      0.77      0.62       686
         PSE       0.46      0.45      0.46       544
   Verts-ALE       0.49      0.24      0.32       237

    accuracy                           0.52      1937
   macro avg       0.56      0.43      0.46      1937
weighted avg       0.53      0.52      0.50      1937
