# Compute the vocabulary overlap of different copora to investigate their similarity

## Interesting questions

1. Is the vocabulary overlap higher between similar texts than between more different texts?
    - different legal areas
    - within court vs between courts
    - legal texts vs other texts (Wikipedia, News, Scientific Articles, etc.)
2. Is the vocabulary overlap higher when we compare larger corpora than smaller ones?
3. Is the vocabulary overlap higher in italian and in french than in German because of less compound words?
4. Does the vocabulary overlap get higher in German texts when we split compound words?

TODO: Make medium post out of this

In [1]:
# download spacy models
#!python -m spacy download de_core_news_md 
#!python -m spacy download fr_core_news_md 
#!python -m spacy download it_core_news_md 

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
import random
from tqdm import tqdm # for progress bars

import spacy
from spacy.tokens import DocBin

import de_core_news_md, fr_core_news_md, it_core_news_md

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [32]:
seed = 42

In [3]:
def load_spacy_model(lang='de', type='core', genre='news', size='md'):
    # disable pipelines for faster processing since we only need the vectors
    nlp = spacy.load(f"{lang}_{type}_{genre}_{size}", disable=['tagger', 'parser', 'attribute_ruler', 'ner', 'textcat'])
    nlp.max_length = 2000000 # increase max length for long texts
    print(f"Loaded {lang} model with enabled pipes: {nlp.pipe_names}")
    return nlp

In [4]:
data_dir = Path('/home/fdn-admin/SwissCourtRulingCorpus/data')
csv_dir = data_dir / 'csv'
raw_csv_dir = csv_dir / 'raw'
clean_csv_dir = csv_dir / 'clean'

In [5]:
def chunker(iterable, chunk_size):
    return (iterable[pos: pos + chunk_size] for pos in range(0, len(iterable), chunk_size))

In [6]:
def process_and_save(texts, path, nlp):
    doc_bin = DocBin(attrs=["LEMMA", "POS"], store_user_data=True)
    for doc in tqdm(nlp.pipe(texts, n_process=-1, batch_size=1), total=len(texts)):
        doc_bin.add(doc)
    doc_bin.to_disk(path)

In [7]:
def get_chunk_path(court, chunk_num, extension="spacy"):
    return Path(f"{court}-{chunk_num}.{extension}")

In [8]:
def read_docs(court: str, nlp):
    """ yields the docs saved by chunk"""
    chunk_num = 0
    while True: # while there are still more chunks to read
        chunk_path = get_chunk_path(court, chunk_num)
        if chunk_path.exists():
            doc_bin = DocBin().from_disk(chunk_path)
            yield list(doc_bin.get_docs(nlp.vocab))
            chunk_num += 1 # go to the next chunk
        else: # if there are no more chunks to read
            break # abort

In [9]:
def create_vocab_for_doc(doc: spacy.tokens.Doc) -> set:
    """
    take lemma without underscore for faster computation (int instead of str) 
    take casefold of lemma to remove capital letters and ß
    """  
    return set([token.lemma_.casefold()
                for token in doc 
                if (not token.is_stop
                    and not token.is_punct
                    and not token.pos_ in ['NUM', 'SYM', 'X'])])

In [10]:
def create_vocab(docs_gen) -> set:
    """ Creates the vocab (set of words appearing in the corpus/doc) from the document generator"""
    print("Creating the vocabulary")
    vocabs = []
    for docs in docs_gen:
        vocab = [create_vocab_for_doc(doc) for doc in tqdm(docs)]
        vocabs.extend(vocab)
             
    total_vocab = set()
    for vocab in vocabs:
        total_vocab |= vocab # make the union of all individual vocabs
        
    print(f"Here is a sample of 20 random words from the vocabulary: {list(total_vocab)[:20]}")
    print(f"The vocabulary contains {len(total_vocab)} words")
    return total_vocab

In [53]:
def create_docs(df: pd.DataFrame, court: str, nlp, chunk_size = 10000, override=False):
    """ 
    Creates and saves the docs generated by the spacy pipeline. 
    """
    first_chunk_path = get_chunk_path(court, 0)
    if override or not first_chunk_path.exists() : # if the first chunked docbin does not exist or we want to override it
        print(f"Running spacy pipeline to create docs for court {court}")
        df = df[df['text'].notna()] # Make sure we don't have any NaN values in the text
        texts = df.text.tolist()
        chunks = chunker(texts, chunk_size) # make chunks because otherwise the doc_bins get too large
        for chunk_num, chunk in enumerate(chunks):
            print(f"Processing chunk {chunk_num}")
            process_and_save(chunk, get_chunk_path(court, chunk_num), nlp)
    else:
        print(f"Preprocessed docs already exist at {first_chunk_path}. To calculate again set 'override' to True.")

In [51]:
def run_for_court(court_name, court_df, nlp, override=False):
    """ Run the necessary steps to create the vocab for a court """
    assert len(court_df.index) > 0
    create_docs(court_df, court_name, nlp, override=override)
    docs_gen = read_docs(court_name, nlp)
    return create_vocab(docs_gen)

In [52]:
# TODO for German texts use compound splitter from dtuggener
def compute_vocabulary_overlap(court_name_1, court_df_1, court_name_2, court_df_2, nlp):
    result = {"vocab (num lemmas)": {}} # prepare result dict
        
    vocab_1 = run_for_court(court_name_1, court_df_1, nlp)
    result["vocab (num lemmas)"][court_name_1] = len(vocab_1)
    
    if court_name_1 == court_name_2: # if both have the same name we want to test within court
        override = True # so we need to override the previously created doc_bins
    vocab_2 = run_for_court(court_name_2, court_df_2, nlp, override=override)
    result["vocab (num lemmas)"][court_name_2] = len(vocab_2)

    union = vocab_1 | vocab_2 # compute union
    result['union (num lemmas)'] = len(union)

    intersection = vocab_1.intersection(vocab_2) # compute intersection
    result['intersection (num lemmas)'] = len(intersection)
    
    # the overlap is the intersection divided by the mean of the vocab lengths
    result['overlap (%)'] = len(intersection) / np.mean([len(vocab_1), len(vocab_2)]) 
    
    return result

In [22]:
german = load_spacy_model('de') # load language model  

Loaded de model with enabled pipes: ['tok2vec', 'morphologizer', 'lemmatizer']


In [42]:
df = pd.read_csv(clean_csv_dir / f"CH_BGer.csv") # read df
df = df[df.language.str.contains('de')] # select only german documents
df = df[df.court.notna()] # remove courts which are NA
df.head()

Unnamed: 0,spider,language,canton,court,chamber,date,file_name,file_number,file_number_additional,html_url,pdf_url,text
0,CH_BGer,de,CH,CH_BGer,CH_BGer_008,2015-12-17,CH_BGer_008_8C-873-2015_2015-12-17,8C 873/2015,,https://www.bger.ch/ext/eurospider/live/de/php...,,Bundesgericht Tribunal fédéral Tribunale fed...
1,CH_BGer,de,CH,CH_BGer,CH_BGer_008,2008-02-04,CH_BGer_008_8C-71-2007_2008-02-04,8C 71/2007,,https://www.bger.ch/ext/eurospider/live/de/php...,,Tribunale federale Tribunal federal {T 0/2} 8C...
3,CH_BGer,de,CH,CH_BGer,CH_BGer_008,2008-08-07,CH_BGer_008_8C-237-2008_2008-08-07,8C 237/2008,,https://www.bger.ch/ext/eurospider/live/de/php...,,Tribunale federale Tribunal federal {T 0/2} 8C...
11,CH_BGer,de,CH,CH_BGer,CH_BGer_004,2007-05-16,CH_BGer_004_4A-32-2007_2007-05-16,4A 32/2007,,https://www.bger.ch/ext/eurospider/live/de/php...,,Tribunale federale Tribunal federal {T 0/2} 4A...
12,CH_BGer,de,CH,CH_BGer,CH_BGer_011,2019-04-12,CH_BGer_011_6B-224-2019_2019-04-12,6B 224/2019,,https://www.bger.ch/ext/eurospider/live/de/php...,,Bundesgericht Tribunal fédéral Tribunale fed...


In [56]:
%%time
court_name_1 = 'CH_BGer_004'
court_df_1 = df[df.chamber.str.contains(court_name_1)] # select court
court_df_1 = court_df_1.sample(1000, random_state=seed, axis=0)

court_name_2 = 'CH_BGer_004'
court_df_2 = df[df.chamber.str.contains(court_name_2)] # select court
court_df_2 = court_df_2.sample(1000, random_state=seed+1, axis=0)

result = compute_vocabulary_overlap(court_name_1, court_df_1, court_name_2, court_df_2, german)
result

Preprocessed docs already exist at CH_BGer_004-0.spacy. To calculate again set 'override' to True.
Creating the vocabulary


100%|██████████| 1000/1000 [00:01<00:00, 684.01it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Here is a sample of 20 random words from the vocabulary: ['law', 'auslegungsregeln', 'luterbacher', 'gegensätzlichen', 'wirteverband', 'ruch', 'anpreisen', 'verkehrsschutzes', 'ursprungsbezeichnung', 'wettbewerbsrechtlich', 'zvd-werte', 'sinken', 'konzernverhalten', 'vorleistungspflicht', 'definierbarer', 'wortfolge', 'mängelfreie', 'verfügen', 'reaktionszeit', 'routinemassnahmen']
The vocabulary contains 46716 words
Running spacy pipeline to create docs for court CH_BGer_004
Processing chunk 0


100%|██████████| 1000/1000 [01:03<00:00, 15.80it/s]


Creating the vocabulary


100%|██████████| 1000/1000 [00:01<00:00, 624.17it/s]


Here is a sample of 20 random words from the vocabulary: ['law', 'auslegungsregeln', 'gegensätzlichen', 'feststellungsentscheid', 'monatelang', "18'794'115.78", 'anpreisen', 'verkehrsschutzes', 'ursprungsbezeichnung', 'wettbewerbsrechtlich', 'eintragungsprinzip', 'immobilienangebot', 'berufungsduplik', 'zinsfreien', 'sinken', 'ionisierend', 'nichteinreichen', 'vorleistungspflicht', 'mängelfreie', 'wortfolge']
The vocabulary contains 48232 words
CPU times: user 41.3 s, sys: 4.56 s, total: 45.9 s
Wall time: 1min 12s


{'vocab (num lemmas)': {'CH_BGer_004': 48232},
 'union (num lemmas)': 68057,
 'intersection (num lemmas)': 26891,
 'overlap (%)': 0.5664363651683026}

In [16]:
#%%time
#result = compute_vocabulary_overlap('TI_PP', 'TI_TCA', lang='it')
#result