## Prerequisites

 * Load corpus data

In [None]:
import sys
sys.path.append("..")

from collections import defaultdict
from chaininglib.search.metadata import get_available_metadata
from chaininglib.ui.dfui import display_df, save_dataframe
from chaininglib.search.CorpusQuery import *


# Corpus to search and the query pattern (tokens whose pos is "NOU").
corpus_name = "zeebrieven"
query = r'[pos="NOU"]'

# Metadata fields the corpus offers; fetched here but not used further in
# this cell.
fields = get_available_metadata(corpus_name)

# Document-level metadata to request with every hit: the dating range of the
# letter plus the two sender ("afz_") fields the later analysis filters on.
chosen_fields = [
    "witnessYear_from",
    "witnessYear_to",
    "afz_klasse",
    "afz_geslacht",
]

# Build the query step by step, run it, and take the keyword-in-context
# result as a DataFrame.
c = (
    create_corpus(corpus_name)
    .pattern(query)
    .extra_fields_doc(chosen_fields)
    .search()
)
df_corpus = c.kwic()

# Persist the result so the analysis cells can start from the CSV.
save_dataframe(df_corpus, "zeebrieven_case_study.csv")






 * Compute frequencies

In [None]:
from chaininglib.utils.dfops import df_filter
from chaininglib.process.corpus import get_frequency_list
from chaininglib.ui.dfui import load_dataframe
import itertools

# Start from the CSV written by save_dataframe in the corpus-loading step.
df_corpus = load_dataframe("zeebrieven_case_study.csv")

# Split the letters by century on the start year of their dating.
# The year 1700 itself goes to the "18th" subset: a strict "< 1700" /
# "> 1700" pair would leave rows dated exactly 1700 in neither subset.
year_from = df_corpus["witnessYear_from"]
df = {}
df["17th"] = df_corpus[year_from < 1700]
df["18th"] = df_corpus[year_from >= 1700]

eras = ["17th", "18th"]
soc_classes = ["low", "high"]
genders = ["male", "female"]

# Filter each era per sociolinguistic variable (social class, gender).
# Only the extremes "low" and "high" are intended; "middle-low" and
# "middle-high" should be left out.
# NOTE(review): df_filter is defined outside this file. If it matches by
# substring/regex rather than by equality, pattern="low" would also select
# "middle-low" (and "high" "middle-high"), and pattern="male" would also
# select "female" -- confirm its matching mode against the helper.
#
# freq_list_aggr[era][soc_class]      -> frequency list, all genders together
# freq_list[era][soc_class][gender]   -> frequency list per gender
freq_list_aggr = defaultdict(dict)
freq_list = defaultdict(lambda: defaultdict(dict))
for era, df_era in df.items():
    # The gender masks depend only on the era, so build them once here
    # instead of once per social class.
    gender_conds = {
        gender: df_filter(df_era["afz_geslacht"], pattern=gender)
        for gender in genders
    }
    for soc_class in soc_classes:
        cond_era_class = df_filter(df_era["afz_klasse"], pattern=soc_class)
        df_era_class = df_era[cond_era_class]
        freq_list_aggr[era][soc_class] = get_frequency_list(df_era_class)
        for gender in genders:
            cond_era_gender = gender_conds[gender]
            df_era_class_gender = df_era[cond_era_class & cond_era_gender]
            freq_list[era][soc_class][gender] = get_frequency_list(df_era_class_gender)

from chaininglib.utils.dfops import get_relfreq_diff
# Number of rows shown at each end of every comparison table below.
SHOW = 20


## Comparisons

### Class difference, era fixed

In [None]:
# Placeholder; rebound to the comparison DataFrame on every iteration below.
df_relfreq_diff = {}

# Within each era, compare every pair of social classes (genders pooled).
for era in eras:
    for soc_class1, soc_class2 in itertools.combinations(soc_classes, 2):
        df_relfreq_diff = get_relfreq_diff(
            freq_list_aggr[era][soc_class1],
            freq_list_aggr[era][soc_class2],
            index="lemmas",
            label1=soc_class1,
            label2=soc_class2,
        )

        # Build each table once and reuse it for display: the SHOW rows with
        # the largest relfreq_diff (d1) and the SHOW rows with the smallest
        # relfreq_diff (d2).
        d1 = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=False).head(SHOW)
        d2 = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=True).head(SHOW)

        label_df1 = f"Comparison {soc_class1} and {soc_class2} class in {era} century. Words specific for {soc_class1}."
        display_df(d1, labels=label_df1)
        label_df2 = f"Comparison {soc_class1} and {soc_class2} class in {era} century. Words specific for {soc_class2}."
        display_df(d2, labels=label_df2)


### Class difference, era and gender fixed

In [None]:
# Placeholder; rebound to the comparison DataFrame on every iteration below.
df_relfreq_diff = {}

# For every era and gender, compare every pair of social classes.
# itertools.product walks the combinations in the same order as three nested
# loops (era outermost, class pair innermost).
for era, gender, (soc_class1, soc_class2) in itertools.product(
    eras, genders, itertools.combinations(soc_classes, 2)
):
    freqs_class1 = freq_list[era][soc_class1][gender]
    freqs_class2 = freq_list[era][soc_class2][gender]
    df_relfreq_diff = get_relfreq_diff(
        freqs_class1,
        freqs_class2,
        index="lemmas",
        label1=soc_class1,
        label2=soc_class2,
    )

    # Rows with the largest relfreq_diff first.
    label_df1 = f"Comparison {soc_class1} and {soc_class2} class in {era} century with gender {gender}. Words specific for {soc_class1}."
    top_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=False).head(SHOW)
    display_df(top_rows, labels=label_df1)

    # Rows with the smallest relfreq_diff first.
    label_df2 = f"Comparison {soc_class1} and {soc_class2} class in {era} century with gender {gender}. Words specific for {soc_class2}."
    bottom_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=True).head(SHOW)
    display_df(bottom_rows, labels=label_df2)


### Evolution, social class fixed

In [None]:
# Placeholder; rebound to the comparison DataFrame on every iteration below.
df_relfreq_diff = {}

# For every social class (genders pooled), compare every pair of eras.
# itertools.product keeps the nested-loop order: class outermost, era pair
# innermost.
for soc_class, (era1, era2) in itertools.product(
    soc_classes, itertools.combinations(eras, 2)
):
    freqs_era1 = freq_list_aggr[era1][soc_class]
    freqs_era2 = freq_list_aggr[era2][soc_class]
    df_relfreq_diff = get_relfreq_diff(
        freqs_era1,
        freqs_era2,
        index="lemmas",
        label1=era1,
        label2=era2,
    )

    # Rows with the largest relfreq_diff first.
    label_df1 = f"Comparison {era1} and {era2} century for class {soc_class}. Words specific for {era1} century."
    top_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=False).head(SHOW)
    display_df(top_rows, labels=label_df1)

    # Rows with the smallest relfreq_diff first.
    label_df2 = f"Comparison {era1} and {era2} century for class {soc_class}. Words specific for {era2} century."
    bottom_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=True).head(SHOW)
    display_df(bottom_rows, labels=label_df2)


### Evolution, social class and gender fixed

In [None]:
# Placeholder; rebound to the comparison DataFrame on every iteration below.
df_relfreq_diff = {}

# For every gender and social class, compare every pair of eras.
# itertools.product keeps the nested-loop order: gender outermost, then
# class, era pair innermost.
for gender, soc_class, (era1, era2) in itertools.product(
    genders, soc_classes, itertools.combinations(eras, 2)
):
    freqs_era1 = freq_list[era1][soc_class][gender]
    freqs_era2 = freq_list[era2][soc_class][gender]
    df_relfreq_diff = get_relfreq_diff(
        freqs_era1,
        freqs_era2,
        index="lemmas",
        label1=era1,
        label2=era2,
    )

    # Rows with the largest relfreq_diff first.
    label_df1 = f"Comparison {era1} and {era2} century for class {soc_class} and gender {gender}. Words specific for {era1}."
    top_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=False).head(SHOW)
    display_df(top_rows, labels=label_df1)

    # Rows with the smallest relfreq_diff first.
    label_df2 = f"Comparison {era1} and {era2} century for class {soc_class} and gender {gender}. Words specific for {era2}."
    bottom_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=True).head(SHOW)
    display_df(bottom_rows, labels=label_df2)

            

### Gender analysis, era and social class fixed

In [None]:
# Placeholder; rebound to the comparison DataFrame on every iteration below.
df_relfreq_diff = {}

# For every era and social class, compare every pair of genders.
# itertools.product keeps the nested-loop order: era outermost, then class,
# gender pair innermost.
for era, soc_class, (gender1, gender2) in itertools.product(
    eras, soc_classes, itertools.combinations(genders, 2)
):
    freqs_gender1 = freq_list[era][soc_class][gender1]
    freqs_gender2 = freq_list[era][soc_class][gender2]
    df_relfreq_diff = get_relfreq_diff(
        freqs_gender1,
        freqs_gender2,
        index="lemmas",
        label1=gender1,
        label2=gender2,
    )

    # Rows with the largest relfreq_diff first.
    label_df1 = f"Comparison {gender1} and {gender2} for {era} century and class {soc_class}. Words specific for {gender1}."
    top_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=False).head(SHOW)
    display_df(top_rows, labels=label_df1)

    # Rows with the smallest relfreq_diff first.
    label_df2 = f"Comparison {gender1} and {gender2} for {era} century and class {soc_class}. Words specific for {gender2}."
    bottom_rows = df_relfreq_diff.sort_values(by="relfreq_diff", ascending=True).head(SHOW)
    display_df(bottom_rows, labels=label_df2)
