In [None]:
import os
import re
from collections import Counter
from io import StringIO

import numpy as np
import pandas as pd

import nltk
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find
from nltk.stem import *
from nltk.tokenize import word_tokenize
# Download the NLTK 'punkt' and 'stopwords' resources.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

In [None]:
# Year of the dataset to process; interpolated into the input folder path and the output file names.
year = '2013'

In [None]:
# Load every CSV found one level below the year folder into a list of DataFrames.
df_lst = []
file_path = f"/content/drive/MyDrive/Uni stuff/Classes/Fourth Year MI/Thesis/Colab/{year}/"

# sorted() makes the load order (and therefore the row order after
# concatenation) deterministic; os.listdir returns entries in arbitrary order.
for folder in sorted(os.listdir(file_path)):
    folder_path = os.path.join(file_path, folder)
    # Skip stray files and hidden directories such as ".ipynb_checkpoints",
    # which would otherwise break the inner listdir / read_csv calls.
    if folder.startswith(".") or not os.path.isdir(folder_path):
        continue
    for f in sorted(os.listdir(folder_path)):
        csv_path = os.path.join(folder_path, f)
        if f.startswith(".") or not os.path.isfile(csv_path):
            continue
        df_lst.append(pd.read_csv(csv_path))

In [None]:
# Stack the per-file frames and expand abbreviated party labels to their full names.
party_full_names = {
    "NDP": "New Democratic Party",
    "Bloc": "Bloc Québécois",
    "Green": "Green Party",
}
full_df = pd.concat(df_lst)
full_df["speakerparty"] = full_df["speakerparty"].replace(party_full_names)

In [None]:
# Preview the first two rows of the combined frame.
full_df.head(2)

In [None]:
# Keep only the columns used downstream and drop rows missing any of them.
# dropna() returns a new frame (instead of inplace=True on a column selection,
# which triggers SettingWithCopyWarning), and .copy() makes reduced_df an
# independent frame so later cells can add columns to it safely.
reduced_df = full_df[["speakername", "speakerparty", "speechtext"]].dropna().copy()

## Web Scraping

### Cities

In [None]:
import requests # library to handle requests
from bs4 import BeautifulSoup

In [None]:
# Fetch the Wikipedia page as HTML.
wikiurl = "https://en.wikipedia.org/wiki/List_of_the_largest_population_centres_in_Canada"
# NOTE(review): table_class is not read by any cell visible here - confirm before removing.
table_class = "wikitable sortable jquery-tablesorter"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(wikiurl, timeout=30)
print(response.status_code)
# Fail here on an HTTP error rather than parsing an error page further down.
response.raise_for_status()

In [None]:
# Parse the downloaded page and pick the first <table> carrying the "wikitable" class.
soup = BeautifulSoup(markup=response.text, features='html.parser')
indiatable = soup.find('table', class_="wikitable")

In [None]:
# read_html returns a list of DataFrames (one per parsed <table>); take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
city_df = pd.read_html(StringIO(str(indiatable)))[0]

In [None]:
# Column holding the city names. The "[5]" suffix is a footnote marker scraped
# from the live page, so fall back to matching on the stable prefix when the
# exact header is no longer present.
city_col = 'Population centre[5]'
if city_col not in city_df.columns:
    city_col_candidates = [col for col in city_df.columns if str(col).startswith("Population centre")]
    if not city_col_candidates:
        raise KeyError("No 'Population centre' column found in the scraped table")
    city_col = city_col_candidates[0]

# dropna/astype(str) guard against blank cells, which would break .lower() below.
cities = city_df[city_col].dropna().astype(str).tolist()
# Cities added by hand on top of the scraped list.
cities += ["Ottawa", "Gatineau", "Niagara Falls", "St. Catharines", "Sydney", "Whitehorse", "Yellowknife", "Iqaluit"]
cities = [city.lower() for city in cities]

### Names

In [None]:
# Download a whitespace-separated name list and keep its first column as a Python list.
names_url = 'https://www.usna.edu/Users/cs/roche/courses/s15si335/proj1/files.php%3Ff=names.txt&downloadcode=yes'
# Raw string for the separator: '\s' inside a normal string literal is an
# invalid escape sequence and raises a warning on current Python versions.
names = pd.read_csv(names_url, skiprows=0, header=None, sep=r'\s+')[0].to_list()

In [None]:
# Add every space-separated part of each speaker name, then lower-case the whole list.
speaker_name_parts = [
    part
    for name_parts in reduced_df["speakername"].str.split(" ")
    for part in name_parts
]
names = [name.lower() for name in names + speaker_name_parts]

## Tokenization, Stemming, Stop Words and Lemmatization

In [None]:
# Tokenizer that extracts runs of word characters (\w+) from the text.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [None]:
# Lower-case each speech, then split it into word tokens.
lowercased_speeches = reduced_df["speechtext"].str.lower()
reduced_df["tokenized"] = lowercased_speeches.apply(tokenizer.tokenize)

In [None]:
# Stop-word set: NLTK's English list, "mr"/"speaker", and the name and city lists.
# NOTE(review): tokens are produced by the \w+ tokenizer, so multi-word entries
# in `cities` (e.g. "niagara falls") can never equal a single token - confirm
# whether that is intended.
stop = set(stopwords.words('english')) | {"mr", "speaker"} | set(names) | set(cities)

stemmer = PorterStemmer()

In [None]:
def _stem_without_stopwords(tokens):
    """Return the stems of the tokens that are not in the stop-word set."""
    return [stemmer.stem(token) for token in tokens if token not in stop]


reduced_df["stem_no_stopwords"] = reduced_df["tokenized"].apply(_stem_without_stopwords)

In [None]:
# Flatten the per-speech stem lists into one corpus-wide list.
full_stem_list = []
for speech_stems in reduced_df["stem_no_stopwords"].tolist():
    full_stem_list.extend(speech_stems)

In [None]:
# Corpus statistics: total number of stems and size of the vocabulary.
unique_tokens = set(full_stem_list)
num_tokens, num_unique_tokens = len(full_stem_list), len(unique_tokens)

In [None]:
# Drop rare stems: keep only the stems that occur at least MIN_TOKEN_COUNT
# times in the whole corpus.
#
# Frequencies are counted once with Counter and the list is filtered in a
# single pass. Calling list.count() and rebuilding the list for every unique
# token is quadratic, which is what made resume-from-checkpoint code necessary.
# That resume logic was also unsafe: it skipped tokens by their position in a
# set (iteration order of a set of strings is not stable across kernels), read
# the checkpoint from a different folder than the one checkpoints were written
# to, and the final file was written from a `save_df` that only reflected the
# last multiple-of-5000 checkpoint (or did not exist for small vocabularies).
MIN_TOKEN_COUNT = 50

token_counts = Counter(full_stem_list)
rare_tokens = {token for token, count in token_counts.items() if count < MIN_TOKEN_COUNT}
full_stem_list_reduced = [token for token in full_stem_list if token not in rare_tokens]
rm = len(rare_tokens)  # number of distinct stems removed

print("Total removed: " + str(rm))
# Always save the final, fully filtered list.
save_df = pd.DataFrame(full_stem_list_reduced, columns=["words"])
save_df.to_csv(f'/content/drive/MyDrive/Uni stuff/Classes/Fourth Year MI/Thesis/Checkpoints/reduced_full_stem_done_{year}.csv', index=False)

In [None]:
# Second vocabulary pass: drop stems that occur in more than half of the
# speeches and stems that look numeric (leading digit, or "-" followed by digits).
reduced_unique_tokens = set(full_stem_list_reduced)
reduced_unique_tokens = {x for x in reduced_unique_tokens if x==x} # removes nan

half_records = len(reduced_df["stem_no_stopwords"])//2

# Document frequency: in how many speeches each stem occurs at least once.
# Computed in one pass instead of re-scanning every speech for every token.
document_frequency = Counter()
for speech_stems in reduced_df["stem_no_stopwords"]:
    document_frequency.update(set(speech_stems))

# token[:1] (rather than token[0]) keeps an empty string from raising IndexError.
tokens_to_remove = {
    token
    for token in reduced_unique_tokens
    if document_frequency[token] > half_records
    or token[:1].isdigit()
    or (token[:1] == '-' and token[1:].isdigit())
}
full_stem_list_reduced_common = [
    token for token in full_stem_list_reduced if token not in tokens_to_remove
]
# Each removed stem is counted once, even when it is both too common and numeric.
rm = len(tokens_to_remove)

print("Total removed: " + str(rm))
# Save the list produced by THIS pass (full_stem_list_reduced_common), not the
# input list and not a stale checkpoint frame.
save_df = pd.DataFrame(full_stem_list_reduced_common, columns=["words"])
save_df.to_csv(f'/content/drive/MyDrive/Uni stuff/Classes/Fourth Year MI/Thesis/Checkpoints/common_reduced_full_stem_done_{year}.csv', index=False)

In [None]:
# Vocabulary that survived the filtering passes.
reduced_unique_tokens_common = set(full_stem_list_reduced_common)

# For every speech keep only the stems that belong to that vocabulary.
# NOTE(review): the membership test is `in`, not `not in`. Since
# reduced_unique_tokens_common holds the stems that were KEPT, `not in` would
# retain exactly the stems the filtering removed and discard the rest.
# The column is built in a single assignment; writing element by element
# through `df[col].iloc[i] = ...` is chained assignment, which pandas warns
# about and which is not guaranteed to modify the original frame.
reduced_df["processed"] = reduced_df["stem_no_stopwords"].apply(
    lambda row: [token for token in row if token in reduced_unique_tokens_common]
)

In [None]:
# Keep only the speeches that still have more than four processed tokens.
has_enough_tokens = reduced_df["processed"].apply(lambda tokens: len(tokens) > 4)
reduced_df = reduced_df[has_enough_tokens]

In [None]:
# Write the processed speeches for this year to Drive.
processed_csv_path = f'/content/drive/MyDrive/Uni stuff/Classes/Fourth Year MI/Thesis/Processed Data/processed_{year}.csv'
reduced_df.to_csv(processed_csv_path)