# Compute the vocabulary overlap of different corpora to investigate their similarity

## Interesting questions

1. Is the vocabulary overlap higher between similar texts than between dissimilar texts?
    - different legal areas
    - within court vs between courts
    - legal texts vs other texts (Wikipedia, News, Scientific Articles, etc.)
2. Is the vocabulary overlap higher when we compare larger corpora than smaller ones?
3. Is the vocabulary overlap higher in Italian and French than in German because they have fewer compound words?
4. Does the vocabulary overlap get higher in German texts when we split compound words?
5. Is there a temporal difference?

TODO: Make a Medium post out of this

TODO: Make venn diagrams

In [126]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import random
from collections import Counter
from tqdm import tqdm # for progress bars

In [7]:
from sqlalchemy import create_engine

from functools import lru_cache


@lru_cache(maxsize=None)
def get_engine():
    """
    Return the SQLAlchemy engine for the local ``scrc`` PostgreSQL database.

    The engine is created on the first call and reused on every later call,
    so all queries share one connection pool instead of building (and never
    disposing) a fresh engine per query.

    NOTE(review): the connection URL, including credentials, is hard-coded.
    """
    return create_engine(
        "postgresql+psycopg2://postgres:postgres@localhost:5432/scrc",
        # echo=True # good for debugging
    )

def query(query_str):
    """
    Run ``query_str`` against the database and return the rows as a DataFrame.

    A connection is obtained from ``get_engine()`` for the duration of the
    call and released again when the ``with`` block exits.
    """
    engine = get_engine()
    with engine.connect() as connection:
        frame = pd.read_sql(query_str, connection)
        return frame

# Spearman's rank correlation coefficient

In [209]:
def get_counter(lang, level, level_instance):
    """
    Load the stored word counter of a single aggregation unit.

    Reads the ``counter`` column from table ``{lang}_{level}s`` for the row
    whose ``{level}`` column equals ``level_instance`` and wraps the value in
    a ``collections.Counter``.

    NOTE(review): the SQL text is assembled with string formatting, so only
    pass trusted, hard-coded identifiers and values.

    Raises:
        ValueError: if the query does not return exactly one row.
    """
    df = query(f"""
        SELECT counter
        FROM {lang}_{level}s
        WHERE {level} = '{level_instance}'
    """)
    num_rows = len(df.index)
    if num_rows != 1:
        # Explicit check instead of a bare assert: it survives `python -O`
        # and tells the user which lookup failed.
        raise ValueError(
            f"expected exactly one row in {lang}_{level}s for "
            f"{level} = {level_instance!r}, got {num_rows}"
        )
    return Counter(df["counter"].iloc[0])

In [215]:
def compile_dfs(counter1, counter2, top_n_most_common_words=500):
    """
    Build a frequency/rank table for the words shared by two counters.

    The ``top_n_most_common_words`` most frequent words of the summed
    counters are selected. A word is kept only if it is in that selection
    AND has a count in both ``counter1`` and ``counter2``; every other row
    contains a NaN after the column-wise concat and is dropped.

    Returns a DataFrame indexed by word with the columns ``frequencies1``,
    ``frequencies2``, ``frequencies_both``, ``ranks1`` and ``ranks2``,
    sorted by descending frequency. Ranks come from ``stats.rankdata`` on
    the respective frequency column.
    """
    def as_frame(counts, column_name):
        # One single-column frame per source, indexed by word.
        return pd.DataFrame.from_dict(dict(counts), orient='index', columns=[column_name])

    combined_top = (counter1 + counter2).most_common(top_n_most_common_words)

    frames = [
        as_frame(counter1, 'frequencies1'),
        as_frame(counter2, 'frequencies2'),
        as_frame(combined_top, 'frequencies_both'),
    ]

    # Outer-align on the word index, then keep only fully populated rows.
    table = pd.concat(frames, axis=1).dropna()
    table = table.sort_values(
        by=['frequencies_both', 'frequencies1', 'frequencies2'],
        ascending=False,
    )

    for freq_column, rank_column in (('frequencies1', 'ranks1'), ('frequencies2', 'ranks2')):
        table[rank_column] = stats.rankdata(table[freq_column])
    return table

In [219]:
# Number of most frequent words (of the combined counters) passed to compile_dfs by the cells that follow.
top_n_most_common_words = 500

In [243]:
# Spearman rank correlation of word frequencies: canton 'CH' vs canton 'BE', counters read from table fr_cantons.
counter1 = get_counter('fr', 'canton', 'CH')
counter2 = get_counter('fr', 'canton', 'BE')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.6700121397419581, pvalue=4.771150405525206e-66)


In [242]:
# Spearman rank correlation of word frequencies: canton 'CH' vs canton 'BE', counters read from table de_cantons.
counter1 = get_counter('de', 'canton', 'CH')
counter2 = get_counter('de', 'canton', 'BE')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.5193173980215102, pvalue=8.265192497302621e-36)


In [244]:
# Spearman rank correlation of word frequencies: canton 'CH' vs canton 'GR', counters read from table it_cantons.
counter1 = get_counter('it', 'canton', 'CH')
counter2 = get_counter('it', 'canton', 'GR')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.554169369250421, pvalue=3.423099903142155e-41)


In [245]:
# Spearman rank correlation of word frequencies: canton 'CH' vs canton 'GR', counters read from table de_cantons.
counter1 = get_counter('de', 'canton', 'CH')
counter2 = get_counter('de', 'canton', 'GR')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.5261228417110858, pvalue=8.432380668280986e-37)


In [238]:
# Spearman rank correlation of word frequencies: canton 'CH' vs canton 'ZH', counters read from table de_cantons.
counter1 = get_counter('de', 'canton', 'CH')
counter2 = get_counter('de', 'canton', 'ZH')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.394005352021408, pvalue=5.123592331588386e-20)


In [240]:
# Spearman rank correlation of word frequencies: canton 'ZH' vs canton 'BE', counters read from table de_cantons.
counter1 = get_counter('de', 'canton', 'ZH')
counter2 = get_counter('de', 'canton', 'BE')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.6504787420298781, pvalue=1.772135396477654e-61)


In [239]:
# Spearman rank correlation of word frequencies: canton 'GR' vs canton 'BE', counters read from table de_cantons.
counter1 = get_counter('de', 'canton', 'GR')
counter2 = get_counter('de', 'canton', 'BE')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.45538216915196184, pvalue=7.268102849689201e-27)


In [221]:
# Spearman rank correlation of word frequencies: court 'CH_BVGE' vs court 'CH_PATG', counters read from table de_courts.
counter1 = get_counter('de', 'court', 'CH_BVGE')
counter2 = get_counter('de', 'court', 'CH_PATG')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.3791486544492725, pvalue=2.3567366726097117e-16)


In [223]:
# Spearman rank correlation of word frequencies: court 'CH_BVGE' vs court 'CH_WBK', counters read from table de_courts.
counter1 = get_counter('de', 'court', 'CH_BVGE')
counter2 = get_counter('de', 'court', 'CH_WBK')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.29096099665927583, pvalue=2.868504464863533e-10)


In [224]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'CH_BGer_011', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'CH_BGer_011') # another criminal law division

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.999967503841173, pvalue=0.0)


In [225]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'CH_BGer_015', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'CH_BGer_015') # administrative commission

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.510280529686221, pvalue=8.824122455441942e-27)


In [226]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'CH_BGer_004', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'CH_BGer_004') # first civil law division

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.3617031155371713, pvalue=7.198141675918217e-17)


In [232]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'CH_BSTG_001', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'CH_BSTG_001')

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)
#df.head(20)

SpearmanrResult(correlation=0.18419150348018015, pvalue=3.477925363804312e-05)


In [233]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'AG_OG_003', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'AG_OG_003') # AG criminal court

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.40448226112899044, pvalue=2.7448579216088433e-15)


In [236]:
# Spearman rank correlation of word frequencies: chamber 'CH_BGer_006' vs chamber 'BE_OG_005', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BGer_006') # criminal law division
counter2 = get_counter('de', 'chamber', 'BE_OG_005') # BE criminal chamber

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.26364110678801866, pvalue=2.7745336467725673e-09)


In [237]:
# Spearman rank correlation of word frequencies: chamber 'CH_BSTG_001' vs chamber 'BE_OG_005', counters read from table de_chambers.
counter1 = get_counter('de', 'chamber', 'CH_BSTG_001') # criminal law division -- NOTE(review): same label as used for CH_BGer_006 elsewhere; confirm it applies to CH_BSTG_001
counter2 = get_counter('de', 'chamber', 'BE_OG_005') # BE criminal chamber

# Align both counters on their shared most frequent words, then correlate the rank columns.
df = compile_dfs(counter1, counter2, top_n_most_common_words)
spearman = stats.spearmanr(df.ranks1, df.ranks2)
print(spearman)

SpearmanrResult(correlation=0.14540144702993016, pvalue=0.0011921142834947194)


# Vocabulary Overlap

In [100]:
def _percentage(part, whole):
    """Return ``100 * part / whole`` rounded to 2 decimals, or NaN if ``whole`` is 0."""
    if whole == 0:
        return float('nan')
    return round(100 * part / whole, 2)


def compute_vocabulary_overlap(name_1, vocab_1, name_2, vocab_2):
    """
    Computes the vocabulary overlap between two vocabularies.

    Args:
        name_1: label used in the result keys for the first vocabulary.
        vocab_1: first vocabulary as a set.
        name_2: label used in the result keys for the second vocabulary.
        vocab_2: second vocabulary as a set.

    Returns:
        A single-column DataFrame (column ``0``) indexed by metric name: the
        size of each vocabulary, of their intersection, of the smaller
        vocabulary, the mean size, the union size, and three overlap
        percentages (intersection relative to the smaller vocabulary, to the
        mean size, and to the union). A percentage whose denominator is 0
        (empty vocabulary) is reported as NaN instead of raising.

    NOTE: if ``name_1 == name_2`` the two size entries share one key, so
    only the second vocabulary's size is kept.
    """
    size_1, size_2 = len(vocab_1), len(vocab_2)
    result = {name_1 + ' (num lemmas)': size_1, name_2 + ' (num lemmas)': size_2}  # prepare result dict

    num_shared = len(vocab_1.intersection(vocab_2))
    result['intersection (num lemmas)'] = num_shared

    smaller_size = min(size_1, size_2)
    result['smaller vocab (num lemmas)'] = smaller_size

    mean_size = (size_1 + size_2) / 2
    result['mean (num lemmas)'] = mean_size

    union_size = len(vocab_1 | vocab_2)
    result['union (num lemmas)'] = union_size

    # Three normalisations of the intersection size; they diverge strongly
    # when the two vocabularies differ a lot in size.
    result['overlap (intersection / smaller vocab) (%)'] = _percentage(num_shared, smaller_size)
    result['overlap (intersection / mean) (%)'] = _percentage(num_shared, mean_size)
    result['overlap (intersection / union) (%)'] = _percentage(num_shared, union_size)

    return pd.DataFrame.from_dict(result, orient='index')

In [34]:
def _iter_leaves(value):
    """Yield the non-container items of an arbitrarily nested list/tuple/ndarray."""
    if isinstance(value, (list, tuple, np.ndarray)):
        for item in value:
            yield from _iter_leaves(item)
    else:
        yield value


def get_vocab(lang, chamber):
    """
    Return the vocabulary stored for ``chamber`` as a set.

    Reads the ``vocabulary`` column from table ``{lang}_chambers`` for all
    rows whose ``chamber`` column equals ``chamber`` and flattens the values
    into one set. Flattening is done in plain Python so that rows holding
    lists of different lengths work too (``np.array`` cannot build a regular
    array from ragged lists) and no large fixed-width string array is built.

    NOTE(review): the SQL text is assembled with string formatting, so only
    pass trusted, hard-coded identifiers and values.
    """
    df = query(f"""
        SELECT vocabulary
        FROM {lang}_chambers
        WHERE chamber = '{chamber}'
    """)
    return set(_iter_leaves(df.vocabulary.to_list()))  # flatten possibly nested list

In [18]:
def run_for_chambers(lang, chamber_1, chamber_2):
    """
    Compute the vocabulary overlap table for two chambers.

    Each chamber's vocabulary is loaded with ``get_vocab`` for the given
    ``lang``; the chamber identifiers double as labels in the result
    returned by ``compute_vocabulary_overlap``.
    """
    first_vocab, second_vocab = (
        get_vocab(lang, chamber_id) for chamber_id in (chamber_1, chamber_2)
    )
    return compute_vocabulary_overlap(chamber_1, first_vocab, chamber_2, second_vocab)

## Different chambers (within a court and across courts)

In [101]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'CH_BGer_002' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BGer_002')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BGer_002 (num lemmas),205305.0
intersection (num lemmas),76772.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),204354.5
union (num lemmas),331937.0
overlap (intersection / smaller vocab) (%),37.74
overlap (intersection / mean) (%),37.57
overlap (intersection / union) (%),23.13


In [102]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'CH_BGer_016' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BGer_016')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BGer_016 (num lemmas),144190.0
intersection (num lemmas),49221.0
smaller vocab (num lemmas),144190.0
mean (num lemmas),173797.0
union (num lemmas),298373.0
overlap (intersection / smaller vocab) (%),34.14
overlap (intersection / mean) (%),28.32
overlap (intersection / union) (%),16.5


In [103]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'CH_BVGE_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BVGE_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BVGE_001 (num lemmas),629294.0
intersection (num lemmas),105505.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),416349.0
union (num lemmas),727193.0
overlap (intersection / smaller vocab) (%),51.87
overlap (intersection / mean) (%),25.34
overlap (intersection / union) (%),14.51


In [108]:
# Vocabulary overlap of chambers 'AG_OG_003' and 'CH_BVGE_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'AG_OG_003', 'CH_BVGE_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
AG_OG_003 (num lemmas),2315.0
CH_BVGE_001 (num lemmas),629294.0
intersection (num lemmas),2207.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),315804.5
union (num lemmas),629402.0
overlap (intersection / smaller vocab) (%),95.33
overlap (intersection / mean) (%),0.7
overlap (intersection / union) (%),0.35


In [109]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'CH_PATG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'CH_PATG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_PATG_001 (num lemmas),28330.0
intersection (num lemmas),13884.0
smaller vocab (num lemmas),28330.0
mean (num lemmas),115867.0
union (num lemmas),217850.0
overlap (intersection / smaller vocab) (%),49.01
overlap (intersection / mean) (%),11.98
overlap (intersection / union) (%),6.37


In [106]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'AG_OG_003' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'AG_OG_003')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
AG_OG_003 (num lemmas),2315.0
intersection (num lemmas),2155.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),102859.5
union (num lemmas),203564.0
overlap (intersection / smaller vocab) (%),93.09
overlap (intersection / mean) (%),2.1
overlap (intersection / union) (%),1.06


In [107]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'AG_OG_004' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'AG_OG_004')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
AG_OG_004 (num lemmas),5689.0
intersection (num lemmas),4495.0
smaller vocab (num lemmas),5689.0
mean (num lemmas),104546.5
union (num lemmas),204598.0
overlap (intersection / smaller vocab) (%),79.01
overlap (intersection / mean) (%),4.3
overlap (intersection / union) (%),2.2


In [110]:
# Vocabulary overlap of chambers 'AG_OG_003' and 'AG_OG_004' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'AG_OG_003', 'AG_OG_004')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
AG_OG_003 (num lemmas),2315.0
AG_OG_004 (num lemmas),5689.0
intersection (num lemmas),1210.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),4002.0
union (num lemmas),6794.0
overlap (intersection / smaller vocab) (%),52.27
overlap (intersection / mean) (%),30.23
overlap (intersection / union) (%),17.81


In [111]:
# Vocabulary overlap of chambers 'CH_BGer_001' and 'ZH_SVG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BGer_001', 'ZH_SVG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),70829.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),400718.5
union (num lemmas),730608.0
overlap (intersection / smaller vocab) (%),34.82
overlap (intersection / mean) (%),17.68
overlap (intersection / union) (%),9.69


In [112]:
# Vocabulary overlap of chambers 'CH_BVGE_001' and 'ZH_SVG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'CH_BVGE_001', 'ZH_SVG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
CH_BVGE_001 (num lemmas),629294.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),148373.0
smaller vocab (num lemmas),598033.0
mean (num lemmas),613663.5
union (num lemmas),1078954.0
overlap (intersection / smaller vocab) (%),24.81
overlap (intersection / mean) (%),24.18
overlap (intersection / union) (%),13.75


In [113]:
# Vocabulary overlap of chambers 'ZH_VG_001' and 'ZH_SVG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'ZH_VG_001', 'ZH_SVG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
ZH_VG_001 (num lemmas),221311.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),79881.0
smaller vocab (num lemmas),221311.0
mean (num lemmas),409672.0
union (num lemmas),739463.0
overlap (intersection / smaller vocab) (%),36.09
overlap (intersection / mean) (%),19.5
overlap (intersection / union) (%),10.8


In [114]:
# Vocabulary overlap of chambers 'ZH_OG_001' and 'ZH_SVG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'ZH_OG_001', 'ZH_SVG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
ZH_OG_001 (num lemmas),279126.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),88597.0
smaller vocab (num lemmas),279126.0
mean (num lemmas),438579.5
union (num lemmas),788562.0
overlap (intersection / smaller vocab) (%),31.74
overlap (intersection / mean) (%),20.2
overlap (intersection / union) (%),11.24


In [115]:
# Vocabulary overlap of chambers 'ZH_OG_001' and 'ZH_VG_001' (vocabularies read from table de_chambers).
result = run_for_chambers('de', 'ZH_OG_001', 'ZH_VG_001')
result  # bare expression so the notebook renders the table

Unnamed: 0,0
ZH_OG_001 (num lemmas),279126.0
ZH_VG_001 (num lemmas),221311.0
intersection (num lemmas),75771.0
smaller vocab (num lemmas),221311.0
mean (num lemmas),250218.5
union (num lemmas),424666.0
overlap (intersection / smaller vocab) (%),34.24
overlap (intersection / mean) (%),30.28
overlap (intersection / union) (%),17.84


## Within court, same chamber, different decisions

In [None]:
# TODO for German texts use compound splitter from dtuggener

In [None]:
# TODO alternatively rank words by number of occurrences and compute overlap of 50000 most frequent words