In [2]:
%pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [9]:
from bs4 import BeautifulSoup
import pandas as pd

# Issue names, matched by position to the "an-issue" <div> sections of the page.
topics = ["Climate change, energy", "Cost of living", "Defence", "Foreign policy", "Government spending", "Housing", "Immigration", "Infrastructure", "U.S. relations, tariffs"]
# Party names, matched by position to the <p> elements inside each issue section.
parties = ["Liberal", "Conservative", "New Democrat", "Bloc Québécois", "Green", "People's Party"]
# Integer class label stored alongside each party name.
party_ids = {"Liberal": 0, "Conservative": 1, "New Democrat": 2, "Bloc Québécois": 3, "Green": 4, "People's Party": 5}

# Parallel columns of the DataFrame built below: one entry per (topic, party) pair.
scraped_parties = []
scraped_labels = []
scraped_topics = []
scraped_texts = []

# Decode explicitly instead of relying on the platform-default codec; the page
# contains non-ASCII text (e.g. "Bloc Québécois").
# NOTE(review): assumes the saved page is UTF-8 encoded -- confirm.
with open("../data/2025/platform-comparison.html", encoding="utf-8") as fp:
    # Name the parser so the parse tree does not depend on which optional
    # parsers (lxml, html5lib) happen to be installed in the environment.
    soup = BeautifulSoup(fp, "html.parser")

fetched_topics = soup.find_all("div", class_="an-issue")

# Sections are labelled purely by position, so fail with a clear message if
# the page has more sections than there are names for them.
if len(fetched_topics) > len(topics):
    raise ValueError(
        f"Found {len(fetched_topics)} 'an-issue' sections but only "
        f"{len(topics)} topic names are defined."
    )

for topic_index, topic in enumerate(fetched_topics):
    paragraphs = topic.find_all("p")

    # Each party's text is taken from the paragraph at the party's index, so a
    # section needs at least one paragraph per party.
    if len(paragraphs) < len(parties):
        raise ValueError(
            f"Section {topic_index} ({topics[topic_index]!r}) has "
            f"{len(paragraphs)} paragraphs; expected at least {len(parties)}."
        )

    for party_index, party in enumerate(parties):
        scraped_parties.append(party)
        scraped_labels.append(party_ids[party])
        scraped_topics.append(topics[topic_index])
        scraped_texts.append(paragraphs[party_index].text)

# One row per (topic, party) pair.
df = pd.DataFrame({"party": scraped_parties, "topic": scraped_topics, "text": scraped_texts, "label": scraped_labels})
            

In [None]:
# Bag of Words
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn.pipeline import Pipeline

# Define the pipeline: raw text -> lower-cased token counts -> linear SVM.
# Keeping the vectorizer inside the pipeline means the saved artifact accepts
# raw strings at prediction time.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True)),  # Vectorizer step
    ('svm', SVC(kernel='linear', decision_function_shape='ovo'))  # SVM classifier step
])

# Split the data.
# random_state fixes the shuffle so the split (and the accuracy printed below)
# is the same on every run; stratify keeps every party represented in both the
# training and the test portion of this small dataset.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['party'],
    test_size=0.2,
    random_state=42,
    stratify=df['party'],
)

# Train the model with the pipeline
print("Training model...")
pipeline.fit(x_train, y_train)
print("Finished training.")

# Test the model on the held-out rows
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}.")

# Save the entire pipeline (vectorizer vocabulary + fitted classifier).
# Left as the last expression so the cell displays the written path(s).
joblib.dump(pipeline, "../model-weights/svc_model_pipeline.joblib")

Training model...
Finished training.
Accuracy: 0.45454545454545453.


['../model-weights/svc_model_pipeline.joblib']

In [25]:
# Bag of Words - Predictions
# Classify a single free-form sentence with the `pipeline` object fitted in the
# Bag of Words training cell; that cell must have been run in this kernel first.

# predict() takes a collection of documents, so the one sentence is wrapped in
# a list. The pipeline's CountVectorizer was built with lowercase=True, so the
# capitalisation of the input (e.g. "STop") is normalised before counting.
input_text = ["Axe the tax. STop the crime. Build the homes and fix the budget."]

# `pred` holds one predicted party name per input document.
pred = pipeline.predict(input_text)

# Prints the whole prediction array, not just its first element.
print(f"Predicted party: {pred}")

Predicted party: ['Green']
