# Description
This notebook implements methods to find the AGS (Amtlicher Gemeindeschlüssel, the official municipality key) for industrial facilities.

# Open TODOs:
- What to do with AGS = DG00000?

In [None]:
import pandas as pd
import numpy as np

# Load data

In [None]:
def add_zeros(ags):
    """Return ``ags`` as a string, left-padded with zeros to at least 8 characters.

    The value is converted with ``str`` first. Values that are already
    8 characters or longer are returned without padding.
    """
    return str(ags).rjust(8, "0")

# Read the AGS key as text. It is sliced by character position further down, and
# its leading zeros would be lost if pandas inferred an integer column.
df_ags = pd.read_csv("../data/public/ags/master.csv", dtype={"ags": str})

# reading of old AGS List (deprecated)

# df_ags = pd.read_csv("ags_liste.csv", encoding="cp1252", engine="python", sep=";", dtype={"AGS": str})
# df_ags.rename(columns={"8-stellige AGS": "AGS", "Einheit": "Ort"}, inplace=True)
# df_ags.drop(
#     columns=["Zahl", "Insgesamt", "männlich", "weiblich", "Prüfzahl"], inplace=True
# )
# df_ags["AGS"] = df_ags["AGS"].apply(add_zeros)

# Facility list: read the semicolon-separated, cp1252-encoded CSV, give the
# columns used below short names and drop the checksum column.
df_anlagen = (
    pd.read_csv(
        "Anlagenliste_AGS_V2.csv",
        sep=";",
        encoding="cp1252",
        engine="python",
        converters={"Bundesland": str, "AGS_manual": str},
    )
    .rename(
        columns={
            "Standort der Anlage": "Ort",
            "AGS": "AGS_xls",
            "VET 2018 [t CO2 Äq]": "VET2018_t",
            "Haupttätigkeit nach TEHG": "TEHG_Nr",
            "Bezeichnung Haupttätigkeit nach TEHG": "TEHG_name",
        }
    )
    .drop(columns=["Prüfsumme"])
)

# Data Processing
## Add industry categories to facilities

In [None]:
# Maps each industry category key to the "TEHG_name" activity labels that belong
# to it. The category-assignment code walks this dict in insertion order and
# overwrites earlier assignments, so a label listed under several keys ends up
# with the LAST key that contains it. Do not reorder the entries casually.
dict_industry_categories = {
    "miner_cement": ["Herstellung von Zementklinker"],
    "miner_chalk": ["Herstellung von Kalk"],
    "miner_glas": ["Herstellung von Glas"],
    "miner_ceram": ["Herstellung von Keramik"],
    "chem_basic": [
        "Herstellung von Salpetersäure",
        "Herstellung organischer Grundchemikalien",
        "Herstellung von Soda",
        "Herstellung von Adipinsäure",
        "Herstellung von Industrieruß",
    ],
    "chem_ammonia": ["Herstellung von Ammoniak"],
    "chem_other": ["Herstellung von Glyoxal und Glyoxylsäure"],
    # NOTE(review): both labels of "metal_steel" are listed again under the
    # primary/secondary keys below, which come later. No facility therefore keeps
    # "metal_steel" as its final category - confirm that this is intended.
    "metal_steel": [
        "Herstellung von Roheisen und Stahl",
        "Verarbeitung von Eisenmetallen",
    ],
    "metal_steel_primary": [
        "Herstellung von Roheisen und Stahl"
    ],  # label is also listed under metal_steel
    "metal_steel_secondary": [
        "Verarbeitung von Eisenmetallen"
    ],  # label is also listed under metal_steel
    "metal_nonfe": [
        "Verarbeitung von Nichteisenmetallen",
        "Herstellung von Primäraluminium",
        "Verarbeitung von Metallerzen",
    ],
    "other_paper": ["Herstellung von Papier", "Herstellung von Zellstoff"],
    # No activity label is assigned to this category.
    "other_food": [],
    "other_further": ["Herstellung von Mineralfasern", "Herstellung von Gips"],
}

# Open question (presumably about their classification - confirm):
# "Herstellung von Mineralfasern" and "Herstellung von Gips".


In [None]:
# Flatten the category table into a label -> category lookup. For a label that
# appears under several categories the last category wins, which is the same
# outcome as assigning the categories one after another.
tehg_name_to_category = {
    tehg_name: category
    for category, tehg_names in dict_industry_categories.items()
    for tehg_name in tehg_names
}

# Facilities whose activity label is not in the lookup get the placeholder.
df_anlagen["i_categorie"] = (
    df_anlagen["TEHG_name"].map(tehg_name_to_category).fillna("No categorie")
)

## Select "Anlagen" from industry

In [None]:
# Keep only facilities that were assigned an industry category. The explicit
# copy turns the selection into an independent frame, so the column assignments
# made on it later do not operate on a slice of the unfiltered data
# (avoids SettingWithCopyWarning).
df_anlagen = df_anlagen[df_anlagen["i_categorie"] != "No categorie"].copy()

In [None]:
# Number of facilities left after the industry filter.
print(f"{df_anlagen.shape[0]} selected")

## Add state information to df_ags

In [None]:
# Federal-state abbreviations listed in the order of their two-digit AGS prefix:
# position 1 -> "01", position 2 -> "02", ..., position 16 -> "16".
_state_codes_in_ags_order = (
    "SH", "HH", "NI", "HB",
    "NW", "HE", "RP", "BW",
    "BY", "SL", "BE", "BB",
    "MV", "SN", "ST", "TH",
)

# Two-digit AGS prefix -> state abbreviation.
ags_to_state = {
    f"{number:02d}": state_code
    for number, state_code in enumerate(_state_codes_in_ags_order, start=1)
}

In [None]:
# The first two characters of the AGS select the federal state. The vectorised
# string accessor passes a missing key through as NaN instead of raising.
df_ags["ags_state_digits"] = df_ags["ags"].str[:2]
# Prefixes that are not in ags_to_state result in NaN.
df_ags["state"] = df_ags["ags_state_digits"].map(ags_to_state)

## Remove unnecessary parts of the names in the AGS list

In [None]:
# Keep only the text in front of the first comma of the description, which
# removes suffixes such as ", Stadt".
df_ags["name"] = df_ags["description"].str.split(",", n=1).str.get(0)

## Select duplicates in the AGS list (column: "name")

In [None]:
# Split the AGS list by whether a place name occurs more than once.
_name_occurs_repeatedly = df_ags.duplicated(subset="name", keep=False)

# All entries of a repeated name, ordered by name.
df_ags_duplicates = df_ags[_name_occurs_repeatedly].sort_values("name")
# Entries whose name occurs exactly once.
df_ags_no_duplicates = df_ags[~_name_occurs_repeatedly]

In [None]:
# Flag the rows of df_ags_duplicates whose place name is ambiguous in the whole
# list but can still be resolved through the federal state, i.e. all entries
# sharing that name lie in pairwise different states.
#
# The check is vectorised and therefore
#   * works for any number of entries per name (not only 2 or 3),
#   * copes with an empty frame,
#   * cannot get stuck on rows whose name is missing.

_duplicate_names = df_ags_duplicates["name"]

# Rows that share both name and a known state with another row. Missing states
# never count as a clash with each other, matching an element-wise `!=` on NaN.
_same_name_and_state = df_ags_duplicates["state"].notna() & df_ags_duplicates.duplicated(
    subset=["name", "state"], keep=False
)

# A single clash disqualifies every entry of that name.
_clashing_names = set(_duplicate_names[_same_name_and_state])

# Rows without a name can never be matched against a facility location.
# The mask is a plain list of bools, positionally aligned with df_ags_duplicates.
mask_duplicates_state_unique = (
    _duplicate_names.notna() & ~_duplicate_names.isin(_clashing_names)
).tolist()

In [None]:
# Repeated place names that are unambiguous once the federal state is known.
df_ags_duplicates_state_unique = df_ags_duplicates.loc[mask_duplicates_state_unique]

# Find AGS for "Ort"

In [None]:
# Preview the facility table instead of rendering the complete frame.
df_anlagen.head()

In [None]:
# Resolve the AGS of every facility. Rules, in order of precedence:
#   "manual_entry"                 - a value in "AGS_manual" is taken as is.
#   "direct"                       - "Ort" equals a place name that occurs once in
#                                    the AGS list and the federal state agrees.
#   "dublicate_match_state_unique" - the name occurs several times but at most
#                                    once per state; the facility's state picks
#                                    the entry.
# Facilities matched by no rule keep NaN in "AGS" and "match_type".

# Create both result columns up front (object dtype, all NaN). This keeps the
# cell re-runnable and guarantees that the columns exist even when nothing matches.
for _result_column in ("AGS", "match_type"):
    df_anlagen[_result_column] = pd.Series(np.nan, index=df_anlagen.index, dtype=object)

for index, row in df_anlagen.iterrows():
    state_anlage = row["Bundesland"]
    location_anlage = row["Ort"]
    manual_ags = row["AGS_manual"]

    # NOTE(review): "AGS_manual" is loaded through a `str` converter - confirm
    # that empty cells arrive here as NaN and not as "", because an empty string
    # counts as a manual entry.
    if pd.notna(manual_ags):
        df_anlagen.loc[index, "AGS"] = manual_ags
        df_anlagen.loc[index, "match_type"] = "manual_entry"
        continue

    # Place names that occur exactly once in the AGS list.
    unique_name_hits = df_ags_no_duplicates.loc[
        df_ags_no_duplicates["name"] == location_anlage
    ]
    if (
        unique_name_hits.shape[0] == 1
        and unique_name_hits["state"].iloc[0] == state_anlage
    ):
        df_anlagen.loc[index, "AGS"] = unique_name_hits["ags"].iloc[0]
        df_anlagen.loc[index, "match_type"] = "direct"

    # Repeated place names that are unique within each federal state.
    state_unique_hits = df_ags_duplicates_state_unique.loc[
        df_ags_duplicates_state_unique["name"] == location_anlage
    ]
    if state_unique_hits.shape[0] == 1:
        # Entries of a repeated name are selected as a whole group, so a lone
        # entry means the duplicate selection is inconsistent.
        raise ValueError("Duplication selection wrong. Single entry found in AGS list")
    for _, candidate in state_unique_hits.iterrows():
        if candidate["state"] == state_anlage:
            df_anlagen.loc[index, "AGS"] = candidate["ags"]
            df_anlagen.loc[index, "match_type"] = "dublicate_match_state_unique"

In [None]:

# Second pass over the facilities that are still unmatched: accept a place name
# from the unique-name list that merely starts with the facility location,
# provided exactly one such name exists and its federal state agrees.
for index, row in df_anlagen[df_anlagen["match_type"].isna()].iterrows():
    state_anlage = row["Bundesland"]
    location_anlage = row["Ort"]

    # A missing (non-string) location cannot serve as a prefix; str.startswith
    # would raise on it.
    if not isinstance(location_anlage, str):
        continue

    # na=False makes entries without a name count as "no match".
    prefix_hits = df_ags_no_duplicates[
        df_ags_no_duplicates["name"].str.startswith(location_anlage, na=False)
    ]
    if prefix_hits.shape[0] == 1 and prefix_hits["state"].iloc[0] == state_anlage:
        df_anlagen.loc[index, "AGS"] = prefix_hits["ags"].iloc[0]
        df_anlagen.loc[index, "match_type"] = "starts_with_only_one_match"

In [None]:
# Facilities per match rule; NaN counts the ones without any match.
df_anlagen.loc[:, "match_type"].value_counts(dropna=False)

In [None]:
# Facilities for which no AGS could be determined.
temp = df_anlagen.loc[df_anlagen["match_type"].isna()]