In [2]:
%pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [4]:
from bs4 import BeautifulSoup
import pandas as pd

# Issue names and party names. Rows are labelled purely by position, so both lists
# must be in the same order as the sections/paragraphs of the saved page.
# NOTE(review): that ordering is assumed here — confirm against the HTML file.
topics = ["Climate change, energy", "Cost of living", "Defence", "Foreign policy", "Government spending", "Housing", "Immigration", "Infrastructure", "U.S. relations, tariffs"]
parties = ["Liberal", "Conservative", "New Democrat", "Bloc Québécois", "Green", "People's Party"]
# Integer class label for each party: its position in `parties`.
party_ids = {party: label for label, party in enumerate(parties)}

scraped_parties = []
scraped_labels = []
scraped_topics = []
scraped_texts = []

# Open in binary mode so BeautifulSoup works out the document's encoding itself,
# instead of decoding with whatever the platform's default text encoding is.
with open("../data/2025/platform-comparison.html", "rb") as fp:
    # Name the parser explicitly so the parse does not depend on which optional
    # parsers happen to be installed (and bs4 does not warn about guessing one).
    soup = BeautifulSoup(fp, "html.parser")

fetched_topics = soup.find_all("div", class_="an-issue")
if len(fetched_topics) > len(topics):
    raise ValueError(
        f"Found {len(fetched_topics)} 'an-issue' sections in the page but only "
        f"{len(topics)} topic names are defined."
    )

for topic_index, topic in enumerate(fetched_topics):
    paragraphs = topic.find_all("p")
    if len(paragraphs) < len(parties):
        raise ValueError(
            f"Topic '{topics[topic_index]}' has {len(paragraphs)} paragraphs, "
            f"expected at least {len(parties)} (one per party)."
        )

    for party_index, party in enumerate(parties):
        # The n-th paragraph of a section is taken as the n-th party's position.
        topic_position = paragraphs[party_index].text
        # Naive sentence split: the ". " separator is consumed, so every sentence
        # except the last one loses its trailing period.
        position_sentences = topic_position.split(". ")

        for sentence in position_sentences:
            # Skip fragments of five characters or fewer.
            if len(sentence) > 5:
                scraped_parties.append(party)
                scraped_labels.append(party_ids[party])
                scraped_topics.append(topics[topic_index])
                scraped_texts.append(sentence)

# One row per sentence: party name, topic, sentence text and integer party label.
df = pd.DataFrame({"party": scraped_parties, "topic": scraped_topics, "text": scraped_texts, "label": scraped_labels})
            

In [None]:
# Creating synthetic data to increase dataset size
import requests
import os

# Prefer the GOOGLE_TRANSLATE_API_KEY environment variable so the key never has to
# be typed into (and accidentally saved with) the notebook. A key pasted into
# API_KEY below still takes precedence if present (don't save it).
API_KEY = ""
if not API_KEY:
    API_KEY = os.environ.get("GOOGLE_TRANSLATE_API_KEY", "")
if not API_KEY:
    # Fail before the loop starts rather than with a KeyError on the first response.
    raise RuntimeError("No API key found: set GOOGLE_TRANSLATE_API_KEY or fill in API_KEY.")

url = f'https://translation.googleapis.com/language/translate/v2?key={API_KEY}'

# Upper bound on how long a single HTTP request may hang.
REQUEST_TIMEOUT_SECONDS = 30


def translate_text(text, source, target):
    """Translate `text` from language code `source` to `target` and return the result.

    Posts to the translation endpoint in `url`. Raises `requests.HTTPError` when the
    service answers with an error status, instead of failing later on a missing
    "data" key in the JSON body.
    """
    params = {
        'q': text,
        'source': source,
        'target': target,
        'format': 'text'
    }
    response = requests.post(url, data=params, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()["data"]["translations"][0]["translatedText"]


scraped_parties = []
scraped_labels = []
scraped_topics = []
scraped_texts = []

# Pivot languages for the round trip English -> language -> English.
translation_languages = ["fr", "sw", "sv", "su", "ta"]

for index, row in df.iterrows():
    for language in translation_languages:
        translated_text = translate_text(row["text"], 'en', language)
        reverted_text = translate_text(translated_text, language, 'en')

        # Use reverted text to increase dataset; it keeps the source row's labels.
        scraped_parties.append(row["party"])
        scraped_labels.append(row["label"])
        scraped_texts.append(reverted_text)
        scraped_topics.append(row["topic"])

        print(f"Translated text #{index + 1} (language: {language}), added '{reverted_text}'.")

translated_df = pd.DataFrame({"party": scraped_parties, "topic": scraped_topics, "text": scraped_texts, "label": scraped_labels})
# NOTE: this rebinds `df`, so running the cell a second time would back-translate
# the already augmented rows as well.
df = pd.concat([df, translated_df], ignore_index=True)

Translated text #1 (language: fr), added 'Liberals cut carbon price on consumers before campaign'.
Translated text #1 (language: sw), added 'The Liberals lowered the consumer carbon price before the campaign'.
Translated text #1 (language: sv), added 'The Liberals lowered consumer prices for carbon dioxide before the election campaign.'.
Translated text #1 (language: su), added 'The Liberals cut consumer carbon prices ahead of the campaign'.
Translated text #1 (language: ta), added 'Liberals cut consumer carbon price before campaign'.
Translated text #2 (language: fr), added 'They plan to introduce incentives to help families invest in clean energy, including reinstating the zero-emission vehicle grant program.'.
Translated text #2 (language: sw), added 'They plan to introduce incentives to help families invest in clean energy, including reinstating the zero-emission vehicle subsidy program.'.
Translated text #2 (language: sv), added 'They plan to introduce incentives to help families 

In [12]:
# Write the current contents of `df` to a CSV file, leaving out the index column
import pandas as pd

df.to_csv(path_or_buf='2025-data.csv', index=False)

In [13]:
# Rebuild `df` from the CSV file written by the save cell
import pandas as pd

df = pd.read_csv(filepath_or_buffer='2025-data.csv')

In [32]:
# Bag of Words
from pathlib import Path

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn.pipeline import Pipeline

# Fixed seed so the train/test split, and therefore the reported accuracy, is
# the same on every run.
RANDOM_STATE = 42
MODEL_PATH = "../model-weights/svc_model_pipeline.joblib"

# Define the pipeline: lower-cased word counts fed into a linear-kernel SVM
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True)),  # Vectorizer step
    ('svm', SVC(kernel='linear', decision_function_shape='ovo'))  # SVM classifier step
])

# Split the data 80/20, keeping each party's share equal in both parts.
# NOTE(review): if `df` contains the back-translated paraphrases from the synthetic
# data cell, paraphrases of one source sentence can fall on both sides of this
# random split, so the accuracy printed below is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['party'],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df['party'],
)

# Train the model with the pipeline
print("Training model...")
pipeline.fit(x_train, y_train)
print("Finished training.")

# Test the model
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}.")

# Save the entire pipeline (vectorizer + classifier), creating the target
# directory first so the dump does not fail on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)

Training model...
Finished training.
Accuracy: 0.9829351535836177.


['../model-weights/svc_model_pipeline.joblib']

In [27]:
# Bag of Words - Predictions

# Text to classify; leave the string empty to be prompted for one instead.
input_text = ["I want to have a middle class tax cut. I also think we need to spend less, and invest more. Additionally, I think that Universal Child Care should be kept."]

# `or` short-circuits, so the prompt only appears when no text was provided above.
input_text[0] = input_text[0] or input("Please input text to classify here: ")

# Relies on `pipeline` having been fitted by the training cell.
pred = pipeline.predict(input_text)

print(f"Predicted party: {pred}")

Predicted party: ['Conservative']


In [None]:
#