In [197]:
import os
import re

import scattertext as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

from tqdm.auto import tqdm

from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject CSS that stretches the notebook's `.container` element to 98% of the
# page width, leaving more horizontal room for wide outputs.
display(
    HTML(
        "<style>.container { width:98% !important; }</style>"
    )
)

In [62]:
# Start from a blank English spaCy pipeline (no pretrained model is loaded here)
# and add the 'sentencizer' component to it. `nlp` is applied to each document's
# text further down to produce the parsed column.
nlp = spacy.blank('en')
# Left as the cell's last expression, so the notebook displays the component
# object returned by add_pipe.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7ff4d43f3820>

In [56]:
# Download the Roe opinion page and keep only the text of its
# <div class="bodytext"> element.
ROE_OPINION_URL = 'https://www.law.cornell.edu/supremecourt/text/410/113'

# A timeout keeps Run All from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
roe_response = requests.get(ROE_OPINION_URL, timeout=30)
roe_response.raise_for_status()

roe_body = BeautifulSoup(roe_response.content, "html.parser").find(
    'div', attrs={'class': 'bodytext'}
)
if roe_body is None:
    # find() returns None when the page layout changes; fail with a clear
    # message instead of an AttributeError on `.text`.
    raise ValueError(
        "No <div class='bodytext'> element found at " + ROE_OPINION_URL
    )
roe_opinion = roe_body.text

# Note: this is from an OCR performed via OSX preview
# Read through a context manager so the handle is closed, and pin the encoding
# so the text decodes the same way on every platform.
with open('draft_opinion.txt', encoding='utf-8') as draft_file:
    dobbs_draft_opinion = draft_file.read()

In [192]:
# Split the Dobbs draft into lines once; every filter below works on this list.
dobbs_lines = dobbs_draft_opinion.split('\n')

# Lines mentioning both 'miscarriage' and 'medicine': every match except the
# first is marked for removal.
repeated_miscarriage_lines = [
    line for line in dobbs_lines
    if 'miscarriage' in line and 'medicine' in line
][1:]

# Lines containing the party name, with either a curly or a straight apostrophe.
party_name_lines = [
    line for line in dobbs_lines
    if 'JACKSON WOMEN’S HEALTH ORGANIZATION' in line
    or "JACKSON WOMEN'S HEALTH ORGANIZATION" in line
]

# Only the first line containing this phrase is marked for removal.
first_administer_line = [
    line for line in dobbs_lines
    if 'administer to any woman pregnant' in line
][:1]

paragraphs_to_remove = (
    repeated_miscarriage_lines + party_name_lines + first_administer_line
)

# Removal is by exact string equality, so any line whose text equals a marked
# line is dropped as well. A set makes each membership check O(1).
removal_lookup = set(paragraphs_to_remove)
non_redundant_dobbs_paragraphs = [
    line for line in dobbs_lines if line not in removal_lookup
]

In [193]:
# One row per opinion: a category label, the raw text, and the spaCy Doc
# obtained by running `nlp` over that text.
case_records = [
    {'Title': 'Roe', 'Text': roe_opinion},
    {'Title': 'Dobbs', 'Text': '\n'.join(non_redundant_dobbs_paragraphs)},
]
case_texts = pd.DataFrame(case_records)
case_df = case_texts.assign(Parse=case_texts['Text'].apply(nlp))

In [194]:
# Terms made up solely of lowercase ASCII letters and spaces.
LOWERCASE_TERM_RE = re.compile(r'^([a-z]| )+$')


def _should_drop_term(term):
    """Return True for terms to filter out of the corpus.

    A term is dropped when it does not match LOWERCASE_TERM_RE or when it is
    listed in scattertext's MY_ENGLISH_STOP_WORDS.
    """
    if LOWERCASE_TERM_RE.match(term) is None:
        return True
    return term in st.Common.MY_ENGLISH_STOP_WORDS


# Build the corpus from the parsed documents, categorised by the Title column.
parsed_corpus = st.CorpusFromParsedDocuments(
    case_df,
    category_col='Title',
    parsed_col='Parse',
).build()

# Apply the infrequent-word cutoff (3), then the term filter above.
frequent_corpus = parsed_corpus.remove_infrequent_words(3)
corpus = frequent_corpus.filter_out(_should_drop_term)

print('Num terms', corpus.get_num_terms())

Num terms 2650


In [199]:

# Render the Scattertext frequency explorer comparing the 'Roe' category with
# everything else (labelled 'Dobbs'), scoring terms with RankDifference.
html = st.produce_frequency_explorer(
    corpus,
    category='Roe',
    not_category_name='Dobbs',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=2,
    width_in_pixels=1000,
    term_scorer=st.RankDifference()
)

fn = './roe_vs_dobbs.html'
# Write through a context manager so the file is flushed and closed before it
# is embedded below. The encoding is pinned to UTF-8 because the page may
# contain non-ASCII text that a platform-default codec could fail to encode.
with open(fn, 'w', encoding='utf-8') as html_file:
    html_file.write(html)

1073904

In [200]:
# Embed the saved explorer page inline. The original call was spelled
# `dIFrame`, which is undefined and raises NameError; the imported name is IFrame.
IFrame(src=fn, width=1300, height=800)