# Compute the vocabulary overlap of different corpora to investigate their similarity

## Interesting questions

1. Is the vocabulary overlap higher between similar texts than between more different texts?
    - different legal areas
    - within court vs between courts
    - legal texts vs other texts (Wikipedia, News, Scientific Articles, etc.)
2. Is the vocabulary overlap higher when we compare larger corpora than smaller ones?
3. Is the vocabulary overlap higher in Italian and French than in German because they have fewer compound words?
4. Does the vocabulary overlap get higher in German texts when we split compound words?
5. Is there a temporal difference?

TODO: Make a Medium post out of this

TODO: Make Venn diagrams (this may not make sense, though, since we can only compare two or at most three corpora)

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import random
from collections import Counter
from tqdm import tqdm # for progress bars

In [2]:
from sqlalchemy import create_engine

def get_engine(db):
    """Create a SQLAlchemy engine for database ``db`` on the local PostgreSQL server.

    The connection URL is fixed to user ``postgres`` / password ``postgres``
    on ``localhost:5432`` via the psycopg2 driver; only the database name
    varies. A new engine is created on every call.
    """
    connection_url = f"postgresql+psycopg2://postgres:postgres@localhost:5432/{db}"
    # Pass echo=True to create_engine to log the emitted SQL while debugging.
    engine = create_engine(connection_url)
    return engine

def query(db, query_str):
    """Run ``query_str`` against database ``db`` and return the rows as a DataFrame.

    Args:
        db: name of the database, forwarded to ``get_engine``.
        query_str: SQL statement handed to ``pd.read_sql``.

    Returns:
        pandas.DataFrame holding the query result.
    """
    engine = get_engine(db)
    try:
        with engine.connect() as conn:
            return pd.read_sql(query_str, conn)
    finally:
        # get_engine() builds a fresh engine (with its own connection pool) on
        # every call; dispose it so pooled connections are closed instead of
        # lingering until the engine happens to be garbage collected.
        engine.dispose()

In [31]:
def compile_dfs(counter1, counter2, top_n_most_common_words=500):
    """Align two frequency counters on their jointly most frequent entries.

    The counters are summed and the ``top_n_most_common_words`` entries with
    the highest combined count are selected. The returned frame is indexed by
    entry and has the columns ``frequencies1``, ``frequencies2``,
    ``frequencies_both``, ``ranks1`` and ``ranks2``.

    Rows with a missing value in any frequency column are dropped, so an entry
    survives only if it is in the combined top-n list AND occurs in both
    counters.
    """
    combined_top = dict((counter1 + counter2).most_common(top_n_most_common_words))

    # One single-column frame per frequency source, indexed by entry.
    sources = {
        'frequencies1': dict(counter1),
        'frequencies2': dict(counter2),
        'frequencies_both': combined_top,
    }
    frames = [
        pd.DataFrame.from_dict(values, orient='index', columns=[column])
        for column, values in sources.items()
    ]

    # Outer-join on the index, then keep only fully populated rows.
    merged = pd.concat(frames, axis=1).dropna()

    # Most frequent (combined) entries first; per-corpus counts break ties.
    merged = merged.sort_values(
        by=['frequencies_both', 'frequencies1', 'frequencies2'],
        ascending=False,
    )

    # Rank the surviving entries within each corpus.
    for rank_column, freq_column in (('ranks1', 'frequencies1'), ('ranks2', 'frequencies2')):
        merged[rank_column] = stats.rankdata(merged[freq_column])
    return merged

In [44]:
def get_counter(db, table, where, counter_type):
    """Load one stored counter column from the database as a ``Counter``.

    Selects column ``counter_type`` from ``table`` in database ``db``,
    restricted by the raw SQL condition ``where``. The condition must match
    exactly one row (checked with an assertion); that row's value is wrapped
    in a ``collections.Counter``.
    """
    sql = f"""
        SELECT {counter_type}
        FROM {table}
        WHERE {where}
    """
    rows = query(db, sql)
    assert len(rows.index) == 1
    stored_counts = rows.iloc[0][counter_type]
    return Counter(stored_counts)

In [35]:
def get_agg_counter(db, where, counter_type):
    """Shortcut for ``get_counter`` that always reads from the ``agg`` table."""
    return get_counter(db=db, table="agg", where=where, counter_type=counter_type)

In [58]:
# Counter columns that are compared for every pair of corpora.
counter_types = ['counter_lemma', 'counter_pos', 'counter_tag']
def similarity(counter1_args, counter2_args, top_n_most_common_words):
    """Compute Spearman rank correlations between two stored corpora.

    For every entry of ``counter_types`` the matching counter of each corpus
    is loaded, both are aligned with ``compile_dfs`` and the Spearman
    correlation of the resulting rank columns is computed.

    Args:
        counter1_args: dict with keys ``'db'``, ``'table'`` and ``'where'``
            locating the first corpus (see ``get_counter``).
        counter2_args: same structure, locating the second corpus.
        top_n_most_common_words: number of jointly most frequent entries
            forwarded to ``compile_dfs``.

    Returns:
        dict with two sub-dicts keyed by counter type: ``'spearman'`` (the
        ``stats.spearmanr`` result) and ``'df'`` (the aligned DataFrame).
    """
    result = {'spearman': {}, 'df': {}}
    for counter_type in counter_types:
        counter1 = get_counter(counter1_args['db'], counter1_args['table'], counter1_args['where'], counter_type)
        # Read the second corpus from its OWN table; previously the first
        # corpus' table was used here, so counter2_args['table'] was ignored.
        counter2 = get_counter(counter2_args['db'], counter2_args['table'], counter2_args['where'], counter_type)

        df = compile_dfs(counter1, counter2, top_n_most_common_words)
        result['spearman'][counter_type] = stats.spearmanr(df.ranks1, df.ranks2)
        result['df'][counter_type] = df
    return result

# Spearman's rank correlation coefficient

In [37]:
# Number of jointly most frequent entries passed to every similarity() call below.
top_n_most_common_words = 500

## Jureko

In [59]:
result = similarity({"db": 'scrc', "table": 'agg', "where": "lang = 'de_cantons'"}, 
                    {"db": 'jureko', "table": 'agg', "where": "type = 'statute'"},
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.3742911364801356, pvalue=4.132353195456298e-17),
 'counter_pos': SpearmanrResult(correlation=0.8852941176470588, pvalue=5.147147576842463e-06),
 'counter_tag': SpearmanrResult(correlation=0.949761768629693, pvalue=6.757404432189277e-28)}

In [60]:
result = similarity({"db": 'scrc', "table": 'agg', "where": "lang = 'de_cantons'"}, 
                    {"db": 'jureko', "table": 'agg', "where": "type = 'decision'"},
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.4094061055500691, pvalue=3.8734293132136994e-20),
 'counter_pos': SpearmanrResult(correlation=0.9529411764705882, pvalue=1.2096579348829016e-08),
 'counter_tag': SpearmanrResult(correlation=0.9756813417190774, pvalue=5.959458231214732e-36)}

In [61]:
result = similarity({"db": 'jureko', "table": 'agg', "where": "type = 'statute'"}, 
                    {"db": 'jureko', "table": 'agg', "where": "type = 'decision'"},
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.4193305750632952, pvalue=2.2546914216482267e-22),
 'counter_pos': SpearmanrResult(correlation=0.911764705882353, pvalue=8.818985741384042e-07),
 'counter_tag': SpearmanrResult(correlation=0.9663807890222984, pvalue=2.4141728467124868e-32)}

## SCRC

In [62]:
result = similarity({"db": 'scrc', "table": 'de_cantons', "where": "canton = 'BE'"}, 
                    {"db": 'scrc', "table": 'de_cantons', "where": "canton = 'CH'"},
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.6444280824173008, pvalue=8.816303010879707e-60),
 'counter_pos': SpearmanrResult(correlation=0.9558823529411766, pvalue=7.760274461610307e-09),
 'counter_tag': SpearmanrResult(correlation=0.9749190013340955, pvalue=1.3173792307984008e-35)}

In [64]:
result = similarity({"db": 'scrc', "table": 'de_courts', "where": "court = 'BL_KG'"}, 
                    {"db": 'scrc', "table": 'de_courts', "where": "court = 'BS_APG'"},
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.5687185100508927, pvalue=4.2446846291865734e-44),
 'counter_pos': SpearmanrResult(correlation=0.9794117647058824, pvalue=3.982009789755264e-11),
 'counter_tag': SpearmanrResult(correlation=0.9929672760956454, pvalue=7.194796285030253e-50)}

In [65]:
# NOTE(review): the recorded correlations for this pair are ~1.0 with pvalue 0.0;
# verify that the stored counters of the two chambers are really distinct.
result = similarity({"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_006'"}, # criminal law division
                    {"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_011'"}, # other criminal law division
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.9999755678882, pvalue=0.0),
 'counter_pos': SpearmanrResult(correlation=1.0, pvalue=0.0),
 'counter_tag': SpearmanrResult(correlation=0.9999999999999999, pvalue=0.0)}

In [69]:
# NOTE(review): all recorded correlations for this pair are nan; presumably the
# aligned frame from compile_dfs is empty or constant (e.g. empty stored
# counters for CH_BSTG_001) -- TODO confirm against the database.
result = similarity({"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_006'"}, # criminal law division
                    {"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BSTG_001'"}, # Federal Criminal Court
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=nan, pvalue=nan),
 'counter_pos': SpearmanrResult(correlation=nan, pvalue=nan),
 'counter_tag': SpearmanrResult(correlation=nan, pvalue=nan)}

In [66]:
result = similarity({"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_006'"}, # criminal law division
                    {"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_015'"}, # administrative commission
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.6770836938504984, pvalue=3.599289480560448e-58),
 'counter_pos': SpearmanrResult(correlation=0.9892857142857142, pvalue=2.9989342571550623e-12),
 'counter_tag': SpearmanrResult(correlation=0.9799440935027947, pvalue=7.099315953432226e-34)}

In [67]:
result = similarity({"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_006'"}, # criminal law division
                    {"db": 'scrc', "table": 'de_chambers', "where": "chamber = 'CH_BGer_004'"}, # first civil law division
                    top_n_most_common_words)
result['spearman']

{'counter_lemma': SpearmanrResult(correlation=0.5964925573391929, pvalue=1.6654713966254975e-49),
 'counter_pos': SpearmanrResult(correlation=0.9911764705882352, pvalue=1.090829151044044e-13),
 'counter_tag': SpearmanrResult(correlation=0.9915380217267008, pvalue=8.681137791267902e-48)}

# Vocabulary Overlap

In [100]:
def compute_vocabulary_overlap(name_1, vocab_1, name_2, vocab_2):
    """
    Computes the vocabulary overlap between two vocabularies.

    Three overlap measures are reported, each as a percentage rounded to two
    decimals: intersection size divided by (a) the size of the smaller
    vocabulary, (b) the mean of both vocabulary sizes and (c) the size of the
    union. A measure whose denominator is zero (an empty vocabulary) is
    undefined and reported as NaN instead of raising ZeroDivisionError.

    Args:
        name_1: label of the first vocabulary, used in the result row names.
        vocab_1: first vocabulary as a set.
        name_2: label of the second vocabulary; if it equals ``name_1`` the
            two size rows share one key and only one of them is kept.
        vocab_2: second vocabulary as a set.

    Returns:
        A single-column DataFrame (column ``0``) indexed by measure name.
    """
    def percentage(part, whole):
        # An empty denominator makes the ratio undefined.
        return round(100 * part / whole, 2) if whole else float('nan')

    size_1, size_2 = len(vocab_1), len(vocab_2)
    num_shared = len(vocab_1.intersection(vocab_2))
    num_smaller = min(size_1, size_2)
    mean = np.mean([size_1, size_2])
    num_union = len(vocab_1 | vocab_2)

    result = {name_1 + ' (num lemmas)': size_1, name_2 + ' (num lemmas)': size_2}
    result['intersection (num lemmas)'] = num_shared
    result['smaller vocab (num lemmas)'] = num_smaller
    result['mean (num lemmas)'] = mean
    result['union (num lemmas)'] = num_union

    result['overlap (intersection / smaller vocab) (%)'] = percentage(num_shared, num_smaller)
    result['overlap (intersection / mean) (%)'] = percentage(num_shared, mean)
    result['overlap (intersection / union) (%)'] = percentage(num_shared, num_union)

    return pd.DataFrame.from_dict(result, orient='index')

In [34]:
def get_vocab(lang, chamber):
    """Return the stored vocabulary of one chamber as a set.

    Reads the ``vocabulary`` column of table ``{lang}_chambers`` in the
    ``scrc`` database for the given ``chamber`` and flattens every returned
    value into a single set. No matching row yields an empty set.

    Flattening is done in plain Python rather than through ``np.array`` so
    that rows of different lengths work and no fixed-width string array has
    to be materialised for large vocabularies. Elements are therefore the
    values as returned by the database driver (not numpy scalars).

    Args:
        lang: language prefix of the chambers table (e.g. ``'de'``).
        chamber: chamber identifier compared against the ``chamber`` column.
    """
    df = query("scrc", f"""
        SELECT vocabulary
        FROM {lang}_chambers
        WHERE chamber = '{chamber}'
    """)

    def flatten(values):
        # Walk arbitrarily nested containers and yield the leaf items.
        for value in values:
            if isinstance(value, (list, tuple, set, np.ndarray)):
                yield from flatten(value)
            else:
                yield value

    return set(flatten(df.vocabulary.to_list()))

In [18]:
def run_for_chambers(lang, chamber_1, chamber_2):
    """Load the vocabularies of two chambers and compute their overlap.

    Both vocabularies are fetched with ``get_vocab`` for language ``lang``;
    the chamber identifiers double as labels in the table returned by
    ``compute_vocabulary_overlap``.
    """
    first_vocab, second_vocab = [
        get_vocab(lang, chamber) for chamber in (chamber_1, chamber_2)
    ]
    return compute_vocabulary_overlap(
        name_1=chamber_1,
        vocab_1=first_vocab,
        name_2=chamber_2,
        vocab_2=second_vocab,
    )

## Within Court, different chambers

In [101]:
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BGer_002')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BGer_002 (num lemmas),205305.0
intersection (num lemmas),76772.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),204354.5
union (num lemmas),331937.0
overlap (intersection / smaller vocab) (%),37.74
overlap (intersection / mean) (%),37.57
overlap (intersection / union) (%),23.13


In [102]:
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BGer_016')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BGer_016 (num lemmas),144190.0
intersection (num lemmas),49221.0
smaller vocab (num lemmas),144190.0
mean (num lemmas),173797.0
union (num lemmas),298373.0
overlap (intersection / smaller vocab) (%),34.14
overlap (intersection / mean) (%),28.32
overlap (intersection / union) (%),16.5


In [103]:
result = run_for_chambers('de', 'CH_BGer_001', 'CH_BVGE_001')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_BVGE_001 (num lemmas),629294.0
intersection (num lemmas),105505.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),416349.0
union (num lemmas),727193.0
overlap (intersection / smaller vocab) (%),51.87
overlap (intersection / mean) (%),25.34
overlap (intersection / union) (%),14.51


In [108]:
result = run_for_chambers('de', 'AG_OG_003', 'CH_BVGE_001')
result

Unnamed: 0,0
AG_OG_003 (num lemmas),2315.0
CH_BVGE_001 (num lemmas),629294.0
intersection (num lemmas),2207.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),315804.5
union (num lemmas),629402.0
overlap (intersection / smaller vocab) (%),95.33
overlap (intersection / mean) (%),0.7
overlap (intersection / union) (%),0.35


In [109]:
result = run_for_chambers('de', 'CH_BGer_001', 'CH_PATG_001')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
CH_PATG_001 (num lemmas),28330.0
intersection (num lemmas),13884.0
smaller vocab (num lemmas),28330.0
mean (num lemmas),115867.0
union (num lemmas),217850.0
overlap (intersection / smaller vocab) (%),49.01
overlap (intersection / mean) (%),11.98
overlap (intersection / union) (%),6.37


In [106]:
result = run_for_chambers('de', 'CH_BGer_001', 'AG_OG_003')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
AG_OG_003 (num lemmas),2315.0
intersection (num lemmas),2155.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),102859.5
union (num lemmas),203564.0
overlap (intersection / smaller vocab) (%),93.09
overlap (intersection / mean) (%),2.1
overlap (intersection / union) (%),1.06


In [107]:
result = run_for_chambers('de', 'CH_BGer_001', 'AG_OG_004')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
AG_OG_004 (num lemmas),5689.0
intersection (num lemmas),4495.0
smaller vocab (num lemmas),5689.0
mean (num lemmas),104546.5
union (num lemmas),204598.0
overlap (intersection / smaller vocab) (%),79.01
overlap (intersection / mean) (%),4.3
overlap (intersection / union) (%),2.2


In [110]:
result = run_for_chambers('de', 'AG_OG_003', 'AG_OG_004')
result

Unnamed: 0,0
AG_OG_003 (num lemmas),2315.0
AG_OG_004 (num lemmas),5689.0
intersection (num lemmas),1210.0
smaller vocab (num lemmas),2315.0
mean (num lemmas),4002.0
union (num lemmas),6794.0
overlap (intersection / smaller vocab) (%),52.27
overlap (intersection / mean) (%),30.23
overlap (intersection / union) (%),17.81


In [111]:
result = run_for_chambers('de', 'CH_BGer_001', 'ZH_SVG_001')
result

Unnamed: 0,0
CH_BGer_001 (num lemmas),203404.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),70829.0
smaller vocab (num lemmas),203404.0
mean (num lemmas),400718.5
union (num lemmas),730608.0
overlap (intersection / smaller vocab) (%),34.82
overlap (intersection / mean) (%),17.68
overlap (intersection / union) (%),9.69


In [112]:
result = run_for_chambers('de', 'CH_BVGE_001', 'ZH_SVG_001')
result

Unnamed: 0,0
CH_BVGE_001 (num lemmas),629294.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),148373.0
smaller vocab (num lemmas),598033.0
mean (num lemmas),613663.5
union (num lemmas),1078954.0
overlap (intersection / smaller vocab) (%),24.81
overlap (intersection / mean) (%),24.18
overlap (intersection / union) (%),13.75


In [113]:
result = run_for_chambers('de', 'ZH_VG_001', 'ZH_SVG_001')
result

Unnamed: 0,0
ZH_VG_001 (num lemmas),221311.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),79881.0
smaller vocab (num lemmas),221311.0
mean (num lemmas),409672.0
union (num lemmas),739463.0
overlap (intersection / smaller vocab) (%),36.09
overlap (intersection / mean) (%),19.5
overlap (intersection / union) (%),10.8


In [114]:
result = run_for_chambers('de', 'ZH_OG_001', 'ZH_SVG_001')
result

Unnamed: 0,0
ZH_OG_001 (num lemmas),279126.0
ZH_SVG_001 (num lemmas),598033.0
intersection (num lemmas),88597.0
smaller vocab (num lemmas),279126.0
mean (num lemmas),438579.5
union (num lemmas),788562.0
overlap (intersection / smaller vocab) (%),31.74
overlap (intersection / mean) (%),20.2
overlap (intersection / union) (%),11.24


In [115]:
result = run_for_chambers('de', 'ZH_OG_001', 'ZH_VG_001')
result

Unnamed: 0,0
ZH_OG_001 (num lemmas),279126.0
ZH_VG_001 (num lemmas),221311.0
intersection (num lemmas),75771.0
smaller vocab (num lemmas),221311.0
mean (num lemmas),250218.5
union (num lemmas),424666.0
overlap (intersection / smaller vocab) (%),34.24
overlap (intersection / mean) (%),30.28
overlap (intersection / union) (%),17.84


## Within court, same chamber, different decisions

In [None]:
# TODO for German texts use compound splitter from dtuggener

In [None]:
# TODO alternatively rank words by number of occurrences and compute overlap of 50000 most frequent words