## Jupyter Notebook zur Berechnung der Levenshtein-Distanzen zwischen Maurer- und DWA-Graphemen und dem Abfragewort

Ein Ansatz, um herauszufinden, ob eine der beiden Befragungen signifikant mehr oder weniger Variabilität enthält, besteht darin, die Levenshtein-Distanz zum Ursprungswort zu berechnen. Dadurch lässt sich in absoluten Zahlen erkennen, ob eine Befragung einen signifikant höheren Wert hat als die andere.
Hierzu werden in den drei Kategorien "item", "phontype" und "lextype" jeweils alle Maurer- und DWA-Werte summiert. Daraus wird wiederum eine Summe pro Wort gebildet. Somit ist ersichtlich, wie viel Varianz pro Wort in jeder Befragung vorkommt.
Daraus abgeleitet werden der Mittelwert und die Standardabweichung berechnet. Schließlich gibt das Notebook noch Auskunft darüber, in welchem Bereich (Mittelwert ± 1, 2 oder 3 Standardabweichungen, d. h. ca. 68 %, 95 % bzw. 99 %) sich die tatsächliche Abweichung befindet.

In [1]:
import pandas as pd
from Levenshtein import distance
import math

def make_dataframe(file):
    """
    Read a tab-separated file into a DataFrame without its
    "korrektur" and "org" columns.

    Parameters:
    file: path (or file-like object) of the tab-separated CSV file to load

    Returns:
    pd.DataFrame: the loaded table with "korrektur" and "org" removed
    """
    unwanted_columns = ["korrektur", "org"]
    raw_table = pd.read_csv(file, sep="\t", index_col=False)
    return raw_table.drop(columns=unwanted_columns)

In [2]:
def levenshtein_counter(index, word, df, spalte):
    """
    Return the Levenshtein distance between one survey answer and the
    word that was asked for in the survey.

    The answer is taken from column ``spalte`` at row ``index`` and stripped
    of surrounding whitespace. For every column except "lextype" the asked
    word is lowercased before the comparison. The placeholder answer "na"
    counts as distance 0.

    Parameters:
    index(int): row of the cell to read within the column
    word(str): original word asked in the survey
    df(DataFrame): table holding the survey answers
    spalte(str): name of the column to read

    Returns:
    int: Levenshtein distance, or 0 for an "na" answer
    """
    answer = df[spalte][index].strip()
    reference = word if spalte == "lextype" else word.lower()

    # "na" marks a missing answer and must not add to the sum.
    if answer == "na":
        return 0
    return int(distance(answer, reference))

In [3]:
# Survey words; each one has its own DWA_Maurer_<word>.csv input file.
words = [
    "Ameise",
    "Begräbnis",
    "Deichsel",
    "Elster",
    "Fledermaus",
    "Gurke",
    "Hagebutte",
    "Hebamme",
    "Kartoffel",
    "Maulwurf",
    "Pflaume",
    "Stecknadel",
    "Ziege",
    "Zimmerfliege",
]

In [4]:
# Sum up the Levenshtein distances per survey ("maurer" / "DWA") and category.
# Each result list receives one entry per survey word, in the order of `words`.
categories = ("item", "phontype", "lextype")

list_maurer_item, list_dwa_item = [], []
list_maurer_phontype, list_dwa_phontype = [], []
list_maurer_lextype, list_dwa_lextype = [], []
list_maurer_gesamt, list_dwa_gesamt = [], []

# Maps the value found in the "erhebung" column to that survey's result lists.
result_lists = {
    "maurer": {
        "item": list_maurer_item,
        "phontype": list_maurer_phontype,
        "lextype": list_maurer_lextype,
        "gesamt": list_maurer_gesamt,
    },
    "DWA": {
        "item": list_dwa_item,
        "phontype": list_dwa_phontype,
        "lextype": list_dwa_lextype,
        "gesamt": list_dwa_gesamt,
    },
}

for word in words:
    df = make_dataframe(f"/home/kopatsch/Masterarbeit/MA/masterarbeit/data/csv/DWA_Maurer_{word}.csv")
    # Missing cells become the placeholder "na", which levenshtein_counter scores as 0.
    df = df.fillna("na")

    totals = {survey: dict.fromkeys(categories, 0) for survey in result_lists}

    for index, survey in enumerate(df["erhebung"]):
        # Rows that belong to neither survey are not counted.
        if survey not in totals:
            continue
        for category in categories:
            totals[survey][category] += levenshtein_counter(index, word, df, category)

    for survey, per_category in totals.items():
        for category in categories:
            result_lists[survey][category].append(per_category[category])
        # Cumulated value: sum over all three categories for this word.
        result_lists[survey]["gesamt"].append(sum(per_category.values()))

In [5]:
# Build the result table: one row per survey word, one column per
# survey/category combination plus the cumulated sums.
df_neu = pd.DataFrame(
    {
        "Maurer Item": list_maurer_item,
        "DWA Item": list_dwa_item,
        "Maurer Phontype": list_maurer_phontype,
        "DWA Phontype": list_dwa_phontype,
        "Maurer Lextype": list_maurer_lextype,
        "DWA Lextype": list_dwa_lextype,
        "Maurer kumuliert": list_maurer_gesamt,
        "DWA kumuliert": list_dwa_gesamt,
    }
)

# Label the rows with the survey words. The position -> word mapping is
# derived from `words` itself so that it stays correct when the word list
# grows or shrinks (the previous mapping was hard-coded for 14 entries).
df_neu = df_neu.rename(index=dict(enumerate(words)))

In [6]:
# Append a "Total" row that holds the sum of every column.
column_totals = [sum(df_neu[column_name]) for column_name in df_neu]

# The totals are attached as a column of the transposed table, which turns
# them into a row once the table is transposed back.
transposed = df_neu.T
transposed["Total"] = column_totals
df_neu = transposed.T


In [7]:
# Relative difference between the cumulated Maurer and DWA distances in
# percent: (1 - smaller / larger) * 100, rounded to two decimals.
difference = []

# zip() pairs the two columns row by row; this avoids integer-key lookups on
# the word-labelled index, which pandas only supports as a deprecated
# positional fallback.
for wert_maurer, wert_dwa in zip(df_neu["Maurer kumuliert"], df_neu["DWA kumuliert"]):
    maximum = max(wert_maurer, wert_dwa)
    minimum = min(wert_maurer, wert_dwa)

    if maximum == 0:
        # Both sums are zero: the surveys do not differ, and dividing by
        # zero must be avoided.
        prozent = 0.0
    else:
        prozent = (1.0 - minimum / maximum) * 100

    difference.append(str(round(prozent, 2)) + " %")

df_neu["Differenz"] = difference


In [8]:
# Mean of the cumulated Maurer and DWA distances.
# The last row of df_neu is the "Total" row; .iloc[-1] reads it by position
# (a plain [-1] on the word-labelled index relies on a deprecated fallback).
maurer_total = df_neu["Maurer kumuliert"].iloc[-1]
dwa_total = df_neu["DWA kumuliert"].iloc[-1]
word_count = len(df_neu["Maurer kumuliert"]) - 1  # all rows except "Total"

# Mean over all per-word values of both surveys (2 values per word).
mean = (maurer_total + dwa_total) / (word_count * 2)

# Mean of the two survey totals, used for the "Total" row.
mean_total = (maurer_total + dwa_total) / 2

# Every word row gets the same overall mean; the "Total" row gets its own.
mittel = [mean] * word_count + [mean_total]

df_neu["Mean"] = mittel

In [9]:
# Standard deviation (population form: divided by the number of values).
# Positional access goes through .iloc because the index is labelled with
# words; integer keys on such an index are a deprecated fallback in pandas.
maurer_values = df_neu["Maurer kumuliert"]
dwa_values = df_neu["DWA kumuliert"]
word_count = len(maurer_values) - 1  # all rows except the trailing "Total"

# Squared deviations of all per-word values of both surveys from `mean`
# (Maurer values first, then DWA values).
per_word_values = [*maurer_values.iloc[:-1], *dwa_values.iloc[:-1]]
wert = sum((value - mean) ** 2 for value in per_word_values)

std = math.sqrt(wert / (word_count * 2))

# For the "Total" row the deviation is computed from the two survey totals
# around the mean stored in that row.
mean_of_total = df_neu["Mean"].iloc[-1]
maurer_total = maurer_values.iloc[-1]
dwa_total = dwa_values.iloc[-1]
std_total = math.sqrt(
    ((maurer_total - mean_of_total) ** 2 + (dwa_total - mean_of_total) ** 2) / 2
)

# Every word row gets the same overall deviation; "Total" gets its own.
standard = [std] * word_count + [std_total]

df_neu["StDev"] = standard

In [10]:
# Flag whether the actual values lie within Mean +/- 1 StDev, +/- 2 StDev or
# +/- 3 StDev. Exactly one band is marked with "x" (the narrowest one that
# contains the value); the others are marked with ".".


def _std_band_marks(value, mean_value, std_value):
    """Return the marks for the 1-, 2- and 3-StDev bands of one value.

    The comparison is strict on both sides, so a value lying exactly on a
    band border belongs to the next wider band. A value outside all three
    bands yields three "." marks, so that every row always gets an entry
    (otherwise the columns would end up shorter than the table and the
    assignment to df_neu would fail).
    """
    bands = (1, 2, 3)
    for band in bands:
        lower = mean_value - band * std_value
        upper = mean_value + band * std_value
        if lower < value < upper:
            return tuple("x" if band == candidate else "." for candidate in bands)
    return (".", ".", ".")


# zip() walks the columns row by row; this avoids integer-key lookups on the
# word-labelled index, which pandas only supports as a deprecated fallback.
marks_maurer = [
    _std_band_marks(maurer, mittelwert, s)
    for maurer, mittelwert, s in zip(
        df_neu["Maurer kumuliert"], df_neu["Mean"], df_neu["StDev"]
    )
]
marks_dwa = [
    _std_band_marks(dwa, mittelwert, s)
    for dwa, mittelwert, s in zip(
        df_neu["DWA kumuliert"], df_neu["Mean"], df_neu["StDev"]
    )
]

df_neu["Maurer Mean + 1 Std (68%)"] = [marks[0] for marks in marks_maurer]
df_neu["Maurer Mean + 2 Std (95%)"] = [marks[1] for marks in marks_maurer]
df_neu["Maurer Mean + 3 Std (99%)"] = [marks[2] for marks in marks_maurer]

df_neu["DWA Mean + 1 Std (68%)"] = [marks[0] for marks in marks_dwa]
df_neu["DWA Mean + 2 Std (95%)"] = [marks[1] for marks in marks_dwa]
df_neu["DWA Mean + 3 Std (99%)"] = [marks[2] for marks in marks_dwa]

In [11]:
# Save the result table as a tab-separated file.
output_path = "/home/kopatsch/Masterarbeit/MA/masterarbeit/data/levenshtein_to_original/levenshtein_distance_to_abfragewort.csv"
df_neu.to_csv(output_path, sep="\t")