In [1]:
import pandas as pd
import json
import pathlib

# Set variables
start_year = 1877  # First year of data sought (1877 +)
end_year = 2024  # Last year of data sought (2024 -)


# Set path to data
data_path = pathlib.Path('scc_bulk_data/DATA/YEARLY/')

# load data (all years, json files)
results = []
for year in range(start_year, end_year+1):
    with open(data_path / f'{year}.json') as f:
        results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)
df.head()

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,(1877) 1 SCR 110,,SCC,1877,Boak et al. v. The Merchant's Marine Insurance...,en,1877-01-23,https://decisions.scc-csc.ca/scc-csc/scc-csc/e...,2022-08-31,Boak et al. v. The Merchant's Marine Insurance...,
1,(1877) 1 SCR 114,,SCC,1877,Smyth v. McDougall,en,1877-02-01,https://decisions.scc-csc.ca/scc-csc/scc-csc/e...,2022-08-31,Smyth v. McDougall\nCollection\nSupreme Court ...,
2,(1877) 1 SCR 117,,SCC,1877,The Queen v. Laliberté,en,1877-02-03,https://decisions.scc-csc.ca/scc-csc/scc-csc/e...,2022-08-31,The Queen v. Laliberté\nCollection\nSupreme Co...,
3,(1877) 1 SCR 145,,SCC,1877,Brassard et al. v. Langevin,en,1877-02-28,https://decisions.scc-csc.ca/scc-csc/scc-csc/e...,2022-08-31,Brassard et al. v. Langevin\nCollection\nSupre...,
4,(1877) 1 SCR 235,,SCC,1877,Johnstone v. The Minister & Trustees of St. An...,en,1877-06-28,https://decisions.scc-csc.ca/scc-csc/scc-csc/e...,2022-08-31,Johnstone v. The Minister & Trustees of St. An...,


In [2]:
df['length'] = df['unofficial_text'].apply(
    lambda row: min(len(row.split(" ")), len(row)) if isinstance(row, str) else None
)

In [3]:
df['length'].mean()

np.float64(7364.978799261476)

In [3]:
df_english = df[df['language'] == 'en']

In [None]:
import os

def generate_full_text_doc(df: pd.DataFrame, output_path: str):
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for text in df['clean_text'].astype(str):
            f.write(text)
            f.write("\n")

In [6]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

LEMMA = WordNetLemmatizer()

def cleanup_text(s: str, lemmatize = True):
    if not isinstance(s, str):
        return ""
    else:
        s = s.lower()
        s = s.replace('\r\n', ' ').replace('\n', ' ') # remove linebreaks
        s = re.sub(r'\s+', ' ', s).strip() # remove white space
        s = s.translate(str.maketrans('', '', string.punctuation)) #remove punctuation
        
        tokens = []
        for w in s.split():
            if len(w) <= 1 or w in set(stopwords.words('english')):
                continue
            tok = LEMMA.lemmatize(w) if lemmatize else w # lemmatize each word, if lemmatization is enabled
            if not re.fullmatch(r'[a-z]+', tok):
                continue
            tokens.append(tok)
    return " ".join(tokens)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Irene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Irene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Irene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
df_english['clean_text'] = df_english['unofficial_text'].apply(cleanup_text)

In [None]:
generate_full_text_doc(df_english, 'outputs/full_text_english_cleaned.txt')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


text = open('outputs/full_text_english.txt', encoding='utf-8').read()

wc = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=200
).generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout()
plt.show()

KeyboardInterrupt: 