In [1]:
%pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.1.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pypdf import PdfReader
import re
import pandas as pd

# Data extraction
# One platform PDF per party is read by the loop that follows.
parties = ['liberal', 'conservative', 'ndp', 'green']

# Regex patterns used while cleaning the extracted page text.
not_alpha_regex = r"[^a-zA-Z]"          # any character that is not an ASCII letter
not_main_chars_regex = r"[^a-zA-Z\s.]"  # anything other than ASCII letters, whitespace or "."
excess_spaces_regex = r"\s+"            # a run of one or more whitespace characters

# Parallel accumulators: texts[i] was extracted from the platform of party_labels[i].
texts = []
party_labels = []


def hasAlphabeticalChars(string):
    """Return True when `string` contains at least one ASCII letter (a-z or A-Z)."""
    letters_only = re.sub(not_alpha_regex, "", string)
    return letters_only != ""

# Walk every page of every party's platform and collect its sentence fragments.
for party in parties:
    platform_pdf = PdfReader(f"../data/platforms/{party}.pdf")

    for pdf_page in platform_pdf.pages:
        # Keep only letters, whitespace and full stops, then collapse each
        # whitespace run (including line breaks) into a single space.
        letters_and_stops = re.sub(not_main_chars_regex, "", pdf_page.extract_text())
        normalised = re.sub(excess_spaces_regex, " ", letters_and_stops)

        # Split on full stops and drop fragments that contain no letters at all.
        kept_fragments = [
            fragment
            for fragment in normalised.split('.')
            if hasAlphabeticalChars(fragment)
        ]

        # One label per fragment keeps the two lists aligned index-for-index.
        texts.extend(kept_fragments)
        party_labels.extend([party] * len(kept_fragments))

# One row per fragment: the party it came from and the cleaned text.
sentences_data = {'party': party_labels, 'text': texts}
sentences = pd.DataFrame(sentences_data)

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizing input
# Bag-of-words counts over the cleaned fragments; lowercase=True folds case
# before tokenizing.
sentence_vectorizor = CountVectorizer(lowercase=True)
sentence_vectors = sentence_vectorizor.fit_transform(sentences['text'])

# Training model
# Fixed seed so the 80/20 train/test partition is the same on every
# Restart & Run All; without it the reported accuracy changes from run to run.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    sentence_vectors,
    sentences['party'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

sentences_svm = SVC(verbose=True, kernel='linear', decision_function_shape='ovo')

print("Training model...")
sentences_svm.fit(x_train, y_train)
print("Finished training.")

# Testing model
# Accuracy is measured on the held-out 20% only.
y_pred = sentences_svm.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}.")

Training model...
[LibSVM]Finished training.
Accuracy: 0.6496402877697842.


In [10]:
# Make predictions with model
input_text = ["The Conservative Party is focused on restoring Canada's economy and ensuring that all Canadians have the opportunity to succeed. We believe in lower taxes to put more money back in the pockets of hard-working families and businesses, which will help fuel growth and job creation. Our plan for climate action emphasizes innovation and technology rather than imposing costly carbon taxes. We're committed to maintaining a strong national defense, securing our borders, and standing up for Canadian values. The Conservatives will invest in building infrastructure that creates jobs while ensuring that we control government spending and return to fiscal responsibility."]

# The training text had everything except letters, whitespace and full stops
# removed and whitespace runs collapsed before it was vectorized. Apply the
# same cleaning here so the input matches what the vectorizer was fitted on
# (e.g. training text contains "Were" / "hardworking", not "We're" /
# "hard-working").
cleaned_input = [
    re.sub(excess_spaces_regex, " ", re.sub(not_main_chars_regex, "", text))
    for text in input_text
]

input_vector = sentence_vectorizor.transform(cleaned_input)
pred = sentences_svm.predict(input_vector)

print(f"Predicted party: {pred}")

Predicted party: ['conservative']
