In [12]:
import xml.etree.ElementTree as ET
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
# Fetch the NLTK data used by preprocess(): the stop-word corpus (read via
# stopwords.words) and the Punkt tokenizer models (used by word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load the Colab Drive helper; the corpus files are read from Google Drive.
from google.colab import drive

# Mount Drive at /content/drive. This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Sanity check: list the DEFT09 folder on the mounted Drive.
!ls "/content/drive/My Drive/deft09"

'Corpus d_apprentissage'  'Corpus de test'  'Données de référence'


In [5]:
# Lazily-built resources shared by every preprocess() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, stemmer)``, creating both on first use only."""
    if not _PREPROCESS_RESOURCES:
        # Built on first call rather than at definition time, so the NLTK data
        # only has to be available once preprocess() is actually used.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        # Store both at once so a failure above never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess(text):
    """Normalize a raw document into a space-separated string of stems.

    Steps, in order: tokenize with ``word_tokenize``, lowercase every token,
    drop tokens found in NLTK's English stop-word list, and stem what is left
    with the Porter stemmer.

    The stop-word set and the stemmer are created once and reused across
    calls instead of being rebuilt for every document; the returned string is
    the same as before.

    Args:
        text: The document text to normalize.

    Returns:
        The processed tokens joined by single spaces.
    """
    stop_words, stemmer = _get_preprocess_resources()
    lowered = (token.lower() for token in word_tokenize(text))
    return ' '.join(stemmer.stem(token) for token in lowered if token not in stop_words)

In [6]:
def parse_xml_train(file_path):
    """Read the training corpus XML into parallel text and label lists.

    For every ``doc`` element, the label is the ``valeur`` attribute of its
    first ``PARTI`` descendant, and the text is built from the stripped
    ``.text`` of each ``texte/p`` element (empty string when a paragraph has
    no text), joined with spaces and passed through ``preprocess``.

    Args:
        file_path: Path of the training XML file.

    Returns:
        A ``(data, labels)`` tuple of equal-length lists, in document order.
    """
    corpus_root = ET.parse(file_path).getroot()

    data, labels = [], []
    for document in corpus_root.findall('.//doc'):
        labels.append(document.find('.//PARTI').attrib['valeur'])

        pieces = []
        for paragraph in document.findall('.//texte/p'):
            raw = paragraph.text
            pieces.append('' if raw is None else raw.strip())
        data.append(preprocess(' '.join(pieces)))

    return data, labels

In [7]:
def load_text_file(text_file_path):
    """Load the reference file that maps document IDs to party labels.

    Each useful line has exactly two tab-separated fields: a document ID and
    a label. Lines that do not split into exactly two fields (blank lines,
    extra columns, ...) are ignored.

    Args:
        text_file_path: Path of the tab-separated reference file, read as UTF-8.

    Returns:
        A dict mapping each document ID (as ``int``) to its label (kept as the
        string found in the file).

    Raises:
        ValueError: If the first field of a two-field line is not an integer.
    """
    party_id_mapping = {}
    # Explicit encoding so the result does not depend on the platform default;
    # iterate over the file directly instead of loading every line up front.
    with open(text_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                doc_id, label = parts
                party_id_mapping[int(doc_id)] = label

    return party_id_mapping

In [8]:
def parse_xml_test(file_path_xml, file_path_txt):
    """Read the test corpus XML and pair each document with its reference label.

    The reference labels come from ``load_text_file(file_path_txt)``. Each
    ``doc`` element contributes the space-joined ``.text`` of its ``texte/p``
    elements (empty string when a paragraph has no text), keyed by the integer
    value of its ``id`` attribute. Only IDs present in both the XML and the
    reference file are kept; their text is passed through ``preprocess``.

    Args:
        file_path_xml: Path of the test XML file.
        file_path_txt: Path of the tab-separated reference label file.

    Returns:
        A ``(texts, party_labels)`` tuple of aligned lists. The order follows
        iteration over the set of shared IDs, not the document order.
    """
    reference_labels = load_text_file(file_path_txt)
    corpus_root = ET.parse(file_path_xml).getroot()

    raw_text_by_id = {}
    for document in corpus_root.findall('.//doc'):
        fragments = []
        for paragraph in document.findall('.//texte/p'):
            fragments.append('' if paragraph.text is None else paragraph.text)
        raw_text_by_id[int(document.get('id'))] = ' '.join(fragments)

    # Keep only the documents that have both a text and a reference label.
    shared_ids = set(reference_labels.keys()) & set(raw_text_by_id.keys())

    # Both lists walk the same set, so texts and labels stay aligned.
    texts = [preprocess(raw_text_by_id[shared_id]) for shared_id in shared_ids]
    party_labels = [reference_labels[shared_id] for shared_id in shared_ids]
    return texts, party_labels

In [9]:
# Locations of the English DEFT09 files on the mounted Drive.
TRAIN_XML_PATH = '/content/drive/My Drive/deft09/Corpus d_apprentissage/deft09_parlement_appr_en.xml'
TEST_XML_PATH = '/content/drive/My Drive/deft09/Corpus de test/deft09_parlement_test_en.xml'
TEST_REF_PATH = '/content/drive/My Drive/deft09/Données de référence/deft09_parlement_ref_en.txt'

# Training documents carry their label in the XML itself.
train_texts, train_labels = parse_xml_train(TRAIN_XML_PATH)

# Test documents get their label from the separate reference file.
test_texts, test_labels = parse_xml_test(TEST_XML_PATH, TEST_REF_PATH)

In [39]:
# Pool the provided train and test corpora so they can be re-split with a
# different train/test proportion.
full_texts = [*train_texts, *test_texts]
full_labels = [*train_labels, *test_labels]

# Hold out 20% of the pooled documents; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    full_texts,
    full_labels,
    test_size=0.2,
    random_state=42,
)


In [41]:
# Sizes of the four partitions: train texts, train labels, test texts, test labels.
print(*map(len, (X_train, y_train, X_test, y_test)))

25828 25828 6457 6457


In [42]:
# Turn the preprocessed texts into TF-IDF features. The vectorizer is fitted
# on the training texts only and then applied unchanged to the test texts.
# NOTE: X_train / X_test are rebound from lists of strings to feature
# matrices here, so this cell cannot be re-run without re-running the split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [43]:
# Map the party label strings to integer codes. The encoder is fitted on the
# training labels; transform() on the test labels reuses that same mapping.
# NOTE: y_train / y_test are rebound from string labels to integer codes here;
# label_encoder.inverse_transform converts codes back to strings.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [44]:
# Sanity check: shape of the TF-IDF training matrix and of the encoded label vector.
print(X_train.shape, y_train.shape)

(25828, 25788) (25828,)


In [45]:
# Train a support vector classifier on the TF-IDF features; SVC() is created
# with no arguments, i.e. every hyperparameter is left at the library default.
clf = SVC()
clf.fit(X_train, y_train)

In [46]:
# Predict the encoded (integer) party label of every held-out document.
y_test_pred = clf.predict(X_test)

In [27]:
# Decoding the party labels back to strings
# predicted_party_labels = label_encoder.inverse_transform(y_test_pred)

In [47]:
# Score the held-out predictions. Precision, recall and F1 use the
# 'weighted' average across classes.
accuracy = accuracy_score(y_test, y_test_pred)
precision, recall, f1 = (
    metric(y_test, y_test_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Test Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Test Accuracy: 0.6636208765680657
Precision: 0.7010126459873917
Recall: 0.6636208765680657
F1 Score: 0.6547958237047097


In [48]:
# Confusion matrix of the held-out predictions. Per scikit-learn's convention,
# rows are the true classes and columns the predicted classes; the indices are
# the integer codes produced by label_encoder.
conf_matrix = confusion_matrix(y_test, y_test_pred)
conf_matrix

array([[ 259,   12,  255,  145,   14],
       [   5,  584,  145,  162,   17],
       [   6,   39, 1932,  272,   17],
       [  10,   28,  578, 1180,   34],
       [   2,   30,  255,  146,  330]])