In [1]:
import pandas as pd
from Levenshtein import distance

# Lemmas to process; each one names an input CSV and an output directory.
words = [
    "ameise",
    "begräbnis",
    "deichsel",
    "elster",
    "fledermaus",
    "gurke",
    "hagebutte",
    "hebamme",
    "kartoffel",
    "maulwurf",
    "pflaume",
    "stecknadel",
    "ziege",
    "zimmerfliege",
]

In [2]:
def leven(val1, val2, ziel):
    """
    Compute the minimal Levenshtein distance between two values and record it.

    Each value may hold several variants. Every variant of val1 is compared
    with every variant of val2 and the smallest distance is kept. A pair in
    which either variant is a missing-value marker ("na", "999", "k.a.",
    "nan") counts as distance 999, so 999 is only the result when no pair of
    real values exists.

    Parameters:
    val1 (str or list): a single string, a string of variants separated by
    ", ", or a list/tuple/set of such strings. Other objects are converted
    with str() first.
    val2 (str or list): same conventions as val1.
    ziel (list): target list; the computed minimum is appended to it.

    Returns:
    minimum (int): minimal Levenshtein distance computed for the two given
    values (999 if no comparable pair exists, including empty list input).
    """
    missing = {"na", "999", "k.a.", "nan"}

    variants1 = _leven_variants(val1)
    variants2 = _leven_variants(val2)

    # Cross product of all variants; pairs involving a missing marker get the
    # sentinel 999 instead of a real distance.
    dist = [
        999 if (a in missing or b in missing) else distance(a, b)
        for a in variants1
        for b in variants2
    ]

    # default=999: an empty list/tuple/set argument yields no pairs at all and
    # is treated like a missing value instead of raising on min([]).
    minimum = min(dist, default=999)
    ziel.append(minimum)
    return minimum


def _leven_variants(value):
    """Split one leven() argument into its whitespace-stripped variants."""
    if isinstance(value, (list, tuple, set, frozenset)):
        # Real collections are taken element by element; str() on the whole
        # collection would leak brackets and quotes into the comparison.
        raw = [part for element in value for part in str(element).split(", ")]
    else:
        # split() returns [text] unchanged when no ", " separator is present.
        raw = str(value).split(", ")
    return [part.strip() for part in raw]

In [3]:
# Build the lexeme files: for every word and every lexeme type found in its
# CSV, group the Maurer and DWA records by coordinate, merge both surveys side
# by side, add the minimal Levenshtein distances and write one tab-separated
# file per lexeme type.
DATA_DIR = "/home/kopatsch/Masterarbeit/MA/masterarbeit/data"

# Cell values that stand for "no data".
NA_TOKENS = ("na", "nan")

# Names given (by position) to the columns left after the bookkeeping columns
# are dropped; "coord" is the column appended by this script.
FIELDS = ["Fragebogen", "Ort", "item", "phontype", "lextype", "erhebung"]
RECORD_COLUMNS = FIELDS + ["coord"]


def _aggregate_by_coord(records, suffix):
    """Collect, per coordinate, the set of values of every field as `<field>_<suffix>`."""
    return records.groupby("coord").agg(
        **{f"{field}_{suffix}": (field, set) for field in FIELDS}
    )


def _cell_tokens(cell):
    """Return the non-missing values of one merged cell as a sorted list of strings.

    A cell is a set (from the groupby aggregation), a plain placeholder value,
    or NaN (coordinate present in only one survey after the outer merge).
    """
    if isinstance(cell, (set, frozenset)):
        # Sorting makes the output reproducible; set iteration order is not.
        values = sorted(str(value).strip() for value in cell)
    elif pd.isna(cell):
        values = []
    else:
        values = [str(cell).strip()]
    # Compare whole tokens: substring replacement would damage words that
    # merely contain "nan" or end in "na".
    return [value for value in values if value not in NA_TOKENS]


def _cell_text(tokens):
    """Join tokens with ", "; an empty token list is rendered as "na"."""
    return ", ".join(tokens) if tokens else "na"


for word in words:
    word_upper = word[0].upper() + word[1:]
    df = pd.read_csv(f"{DATA_DIR}/csv/DWA_Maurer_{word_upper}.csv", sep="\t")
    df = df.fillna("na").replace("nan", "na")

    # All distinct lexeme types except the missing marker. discard() keeps a
    # file without any missing lextype from aborting the run.
    lexem = set(df["lextype"])
    lexem.discard("na")

    for lex in lexem:
        df1 = df[df["lextype"] == lex]
        df1 = df1.assign(coord=df1["LONG"].astype(str) + ", " + df1["LAT"].astype(str))
        df1 = df1.drop(columns=["korrektur", "org", "belegnr.", "LONG", "LAT", "GID", "lfd"])
        df1 = df1.reset_index(drop=True)

        # Rename the remaining columns by position.
        df1 = df1.rename(columns=dict(zip(df1.columns, RECORD_COLUMNS)))

        maurer = df1[df1["erhebung"] == "maurer"]
        dwa = df1[df1["erhebung"] == "DWA"]

        maurer_unique = _aggregate_by_coord(maurer, "maurer")

        if dwa.empty:
            # Because dwa is the right-hand frame of the merge, an empty DWA
            # side is replaced by a single all-"na" placeholder row. It is
            # indexed by coordinate like the grouped frames so that the merge
            # below treats both cases alike. iloc picks the first Maurer row
            # by position, whatever its index label is.
            dwa_unique = pd.DataFrame(
                {f"{field}_DWA": ["na"] for field in FIELDS},
                index=pd.Index([maurer["coord"].iloc[0]], name="coord"),
            )
        else:
            dwa_unique = _aggregate_by_coord(dwa, "DWA")

        # Both frames carry the coordinates in their index, so the merged
        # frame keeps them in its index as well.
        result = pd.merge(maurer_unique, dwa_unique, how="outer", on=["coord"])

        cleaned = {
            column: [_cell_text(_cell_tokens(cell)) for cell in result[column]]
            for column in result.columns
        }
        # Place names of both surveys, Maurer first, missing markers removed.
        ort = [
            _cell_text(_cell_tokens(ort_maurer) + _cell_tokens(ort_dwa))
            for ort_maurer, ort_dwa in zip(result["Ort_maurer"], result["Ort_DWA"])
        ]

        # leven() appends the minimal distance of each row to the given list.
        levenshtein = {}
        for field in ("item", "phontype", "lextype"):
            distances = []
            for value_maurer, value_dwa in zip(cleaned[f"{field}_maurer"], cleaned[f"{field}_DWA"]):
                leven(value_maurer, value_dwa, distances)
            levenshtein[field] = distances

        result_clean = pd.DataFrame({
            "Fragebogen_maurer": cleaned["Fragebogen_maurer"],
            "Fragebogen_DWA": cleaned["Fragebogen_DWA"],
            "Ort": ort,
            "Koordinaten": list(result.index),
            "item_maurer": cleaned["item_maurer"],
            "item_DWA": cleaned["item_DWA"],
            "Levenshtein_item": levenshtein["item"],
            "phontype_maurer": cleaned["phontype_maurer"],
            "phontype_DWA": cleaned["phontype_DWA"],
            "Levenshtein_phontype": levenshtein["phontype"],
            "lextype_maurer": cleaned["lextype_maurer"],
            "lextype_DWA": cleaned["lextype_DWA"],
            "Levenshtein_lextype": levenshtein["lextype"],
        })

        result_clean.to_csv(f"{DATA_DIR}/lexem/{word}/DWA_Maurer_lexem_{lex}.csv", sep="\t")