In [2]:
%pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bs4 import BeautifulSoup
import pandas as pd

topics = ["Climate change, energy", "Cost of living", "Defence", "Foreign policy", "Government spending", "Housing", "Immigration", "Infrastructure", "U.S. relations, tariffs"]
parties = ["Liberal", "Conservative", "New Democrat", "Bloc Québécois", "Green", "People's Party"]
party_ids = {"Liberal": 0, "Conservative": 1, "New Democrat": 2, "Bloc Québécois": 3, "Green": 4, "People's Party": 5}

scraped_parties = []
scraped_labels = []
scraped_topics = []
scraped_texts = []

with open("../data/2025/platform-comparison.html") as fp:
    soup = BeautifulSoup(fp)
    fetched_topics = soup.find_all("div", class_="an-issue")
    
    for topic_index, topic in enumerate(fetched_topics):
        paragraphs = topic.find_all("p")
        
        for party_index, party in enumerate(parties):
            scraped_parties.append(party)
            scraped_labels.append(party_ids[party])
            scraped_topics.append(topics[topic_index])
            scraped_texts.append(paragraphs[party_index].text)
            
df = pd.DataFrame({"party": scraped_parties, "topic": scraped_topics, "text": scraped_texts, "label": scraped_labels})
            

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# Split dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize_function, batched=True)
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels}).map(tokenize_function, batched=True)

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=6)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    auto_find_batch_size=True,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# Train the model
trainer.train()

# Save model
model.save_pretrained("./canadian_political_bert")
tokenizer.save_pretrained("./canadian_political_bert")

print("Model training complete and saved.")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 43/43 [00:00<00:00, 952.35 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 816.24 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.805711
2,No log,1.775119
3,No log,1.784791
4,No log,1.786881
5,No log,1.786179


Model training complete and saved.


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def predict_political_affiliation(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move to GPU if available

    with torch.no_grad():
        print(inputs)
        
        outputs = model(**inputs)
    
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    print(predicted_class)

    # Map class index to party
    return parties[predicted_class]

print(predict_political_affiliation("Conservatives would repeal all carbon pricing, including on industrial emitters. They would boost incentives (e.g., expand tax credits) for businesses that reduce emissions and to promote clean energy technologies. The party hasn't announced its carbon emissions reduction target, but says it would use the Paris Agreement to export Canadian resources and technology to lower global emissions. The party would also offer a tax incentive for manufacturers who lower emissions. It would support an east-west pipeline and would approve projects such as LNG Quebec. The party would be open to exporting oil from the Port of Churchill and has pledged to speed up development of the port. Conservatives would ban the dumping of raw sewage into waterways and repeal the federal ban on single-use plastics."))


{'input_ids': tensor([[  101, 11992,  2052, 21825,  2035,  6351, 20874,  1010,  2164,  2006,
          3919, 12495, 24168,  1012,  2027,  2052, 12992, 21134,  1006,  1041,
          1012,  1043,  1012,  1010,  7818,  4171,  6495,  1007,  2005,  5661,
          2008,  5547, 11768,  1998,  2000,  5326,  4550,  2943,  6786,  1012,
          1996,  2283,  8440,  1005,  1056,  2623,  2049,  6351, 11768,  7312,
          4539,  1010,  2021,  2758,  2009,  2052,  2224,  1996,  3000,  3820,
          2000,  9167,  3010,  4219,  1998,  2974,  2000,  2896,  3795, 11768,
          1012,  1996,  2283,  2052,  2036,  3749,  1037,  4171, 20438,  2005,
          8712,  2040,  2896, 11768,  1012,  2009,  2052,  2490,  2019,  2264,
          1011,  2225, 13117,  1998,  2052, 14300,  3934,  2107,  2004,  1048,
          3070,  5447,  1012,  1996,  2283,  2052,  2022,  2330,  2000,  9167,
          2075,  3514,  2013,  1996,  3417,  1997, 10888,  1998,  2038, 16970,
          2000,  3177,  2039,  2458,  