# Thesis intro

## Helper Functions

In [None]:
# Imports
import pandas as pd
import tempun
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from typing import List, Dict


# Constants
# Fabric codes analysed in this notebook; the order is reused for the colours.
fabric_types = ["esa", "esb", "esc", "esd", "its", "arsw", "lrd", "lrc"]

# Column names of the lower/upper bounds of the two dating schemes.
chronological_lower_date, chronological_upper_date = (
    "standard_typo_chronological_lower_date",
    "standard_typo_chronological_upper_date",
)
deposit_lower_date, deposit_upper_date = (
    "deposit_lower_date",
    "deposit_upper_date",
)

# Location of the merged table written by the preprocessing cell.
processed_data_path = "./data/processed/processed.csv"

# One line colour per fabric, paired positionally with ``fabric_types``.
colors_dict = dict(zip(
    fabric_types,
    ["black", "green", "blue", "red", "pink", "orange", "purple", "yellow"],
))

# Column names
id_column, fabric_column = "standard_form_id", "fabric_h1"

# Helper functions
def create_date_dict_and_remove_empty(data, lower_date, upper_date):
    """Build a zeroed per-year counter and drop rows with missing date bounds.

    Args:
        data: DataFrame containing the two date-bound columns.
        lower_date: Name of the column holding the lower date bound.
        upper_date: Name of the column holding the upper date bound.

    Returns:
        A ``(cleaned, date_dict)`` tuple. ``cleaned`` is ``data`` without the
        rows where either bound is missing. ``date_dict`` maps every year in
        ``range(int(min lower), int(max upper))`` to ``0``; the extremes are
        taken from ``data`` before any row is dropped, and the maximum year
        itself is not included. ``date_dict`` is empty when either bound
        column holds no values at all.
    """
    cleaned = data.dropna(subset=[lower_date, upper_date])

    minimum = data[lower_date].min()
    maximum = data[upper_date].max()

    # min()/max() return NaN for an empty or all-missing column and int(NaN)
    # raises ValueError, so an empty selection yields an empty year range.
    if pd.isna(minimum) or pd.isna(maximum):
        return cleaned, {}

    date_dict = dict.fromkeys(range(int(minimum), int(maximum)), 0)

    return cleaned, date_dict


def simulate_dates(data, lower_date, upper_date, size, column):
    """Run ``tempun.model_date`` once per row of ``data``.

    Args:
        data: DataFrame to iterate row-wise.
        lower_date: Column whose value is passed as ``start``.
        upper_date: Column whose value is passed as ``stop``.
        size: Passed through unchanged as ``size``.
        column: Column whose value is passed as ``count``.

    Returns:
        The result of the row-wise ``DataFrame.apply``.
    """
    def model_row(row):
        return tempun.model_date(
            start=row[lower_date],
            stop=row[upper_date],
            size=size,
            count=row[column],
        )

    return data.apply(model_row, axis=1)


def plot_graph(
    dicts_of_df: Dict[str, pd.Series],
    ax,
    palette: List[str],
    linewidth: int = 3,
    linestyle: str = "solid",
):
    """Draw one seaborn line per entry of ``dicts_of_df`` on ``ax``.

    Entries are paired positionally with ``palette``; pairing stops at the
    shorter of the two. The dictionary key is used as the line's label.
    """
    for (label, series), line_colour in zip(dicts_of_df.items(), palette):
        sns.lineplot(
            data=series,
            ax=ax,
            label=label,
            color=line_colour,
            linewidth=linewidth,
            linestyle=linestyle,
        )


def plot_and_save(observed, input_data, title, y_label, file_name, xlims):
    """Plot observed curves next to simulated densities and save the figure.

    Args:
        observed: Mapping of label -> series, drawn on the left axes via
            ``plot_graph``.
        input_data: DataFrame with a ``"fabric_h1"`` column and a
            ``"random_dates"`` column, used for the right axes.
        title: Title of the left axes; the right axes get
            ``"Simulated {title}"``.
        y_label: Y-axis label of the left axes.
        file_name: Name (without extension) of the PNG written to
            ``./data/figures/``.
        xlims: X-axis limits applied to both axes.
    """

    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=300)

    # Left panel: one line per entry of ``observed``, coloured in the order
    # of ``colors_dict``.
    plot_graph(dicts_of_df=observed,
               palette=colors_dict.values(),
               ax=axs[0],
               linewidth=2)

    axs[0].set_xlim(xlims)
    axs[1].set_xlim(xlims)

    # ``ax`` is the same object as ``axs[0]``.
    ax = axs.ravel()[0]
    ax.set_ylim(0, None)
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.0f"))
    ax.set_xlabel(None)
    # Drop the per-axes legend; a single figure-level legend is built below
    # from the handles that remain registered on the axes.
    ax.get_legend().remove()
    ax.set_ylabel(None)
    # The tick at 0 is relabelled as 1 (presumably because the calendar has
    # no year 0 -- TODO confirm).
    ax.set_xticklabels([int(i) if i != 0 else 1 for i in ax.get_xticks()])
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, bbox_to_anchor=(0.985, 0.957), fontsize=8)

    # Right panel: one curve per fabric drawn by tempun from the simulated dates.
    for fabric in fabric_types:
        color = colors_dict[fabric]
        # NOTE(review): the column name is hard-coded here although the
        # module-level ``fabric_column`` holds the same value.
        filtered_input = input_data[input_data["fabric_h1"]
                                    == fabric]["random_dates"]
        # Entries whose type is exactly ``float`` are skipped (presumably the
        # NaN placeholders of rows without simulated dates -- TODO confirm).
        filtered_input = [el for el in filtered_input if type(el) != float]

        tempun.kdeplot_from_randoms(filtered_input, ax=axs[1], color=color)

    axs[1].set_ylim(0, None)
    axs[1].yaxis.set_major_formatter(FormatStrFormatter("%.3f"))
    axs[1].set_xlabel(None)
    axs[1].set_ylabel(None)

    axs[0].set_ylabel(y_label)

    # Shared x-axis caption for both panels.
    fig.supxlabel("Year")

    axs[0].set_title(label=title, size=15, y=1.04)
    axs[1].set_title(label=f"Simulated {title}", size=15, y=1.04)

    plt.tight_layout()
    plt.savefig(f"./data/figures/{file_name}.png")

## Preprocessing

In [None]:
# Load the raw tables.
catalogue = pd.read_csv(
    "./data/ICRATES_CATALOGUE.csv",
    encoding="latin-1",
    usecols=[
        "ICRATES_ID",
        "Deposit_ID",
        "Location_ID",
        "Fabric_ID",
        "Fabric",
        "Standard_Form_ID",
        "Standard_Form_Publication_Uncertain",
        "Standard_Form_ICRATES",
    ],
)

# The deposit date bounds are renamed to the column names used by the
# calculation cells.
deposit = pd.read_csv(
    "./data/ICRATES_DEPOSIT.csv",
    encoding="latin-1",
    usecols=["Deposit_ID", "Lower_Date", "Upper_Date"],
).rename(columns={
    "Lower_Date": deposit_lower_date,
    "Upper_Date": deposit_upper_date,
})

standard = pd.read_csv("./data/ICRATES_STANDARD_FORM.csv", encoding="latin-1")
fabric = pd.read_csv("./data/fabric_h1.csv", sep=";")

# Left-join deposits, the fabric lookup and the standard forms onto the
# catalogue; the catalogue's own "Fabric" column is dropped after the lookup.
data = (
    catalogue
    .merge(deposit, on="Deposit_ID", how="left")
    .merge(fabric, left_on="Fabric", right_on="fabric", how="left")
    .drop(columns=["Fabric"])
    .merge(standard, on="Standard_Form_ID", how='left')
)

data.columns = data.columns.str.lower()

# Rename CRSW/PRSW to LRD/LRC, lower-case every fabric code and keep only the
# fabrics listed in ``fabric_types``.
data[fabric_column] = (
    data[fabric_column]
    .replace({"CRSW": "LRD", "PRSW": "LRC"})
    .map(lambda value: str(value).lower())
)
data = data[data[fabric_column].isin(fabric_types)]

data.to_csv(processed_data_path, index=False)

# Clean up values to not use space
catalogue = deposit = standard = fabric = data = None

## Calculations

### Frequency

In [None]:
# Names of the helper columns used by the frequency calculation.
frequency_column, summed_column = "frequency", "summed"

# Load only the columns the frequency calculation needs and give every row a
# frequency of 1.
processed = pd.read_csv(
    processed_data_path,
    usecols=[
        id_column,
        chronological_lower_date,
        chronological_upper_date,
        deposit_lower_date,
        deposit_upper_date,
        fabric_column,
    ],
    encoding="latin-1",
).assign(**{frequency_column: 1})


def calculate_summed_frequency(lower_date, upper_date):
    """Sum the frequencies per group and spread each sum over its date range.

    Rows of the module-level ``processed`` frame are grouped by
    ``id_column``, the two date bounds and ``fabric_column``.

    Args:
        lower_date: Name of the column holding the lower date bound.
        upper_date: Name of the column holding the upper date bound.

    Returns:
        DataFrame with one row per group, the summed ``frequency_column`` and
        a ``summed_column`` equal to that sum divided by
        ``upper_date - lower_date``. Groups whose two bounds are equal divide
        by zero and therefore do not get a finite value.
    """
    data_ = (
        processed
        .groupby([id_column, lower_date, upper_date, fabric_column])[frequency_column]
        .sum()
        .reset_index()
    )

    # The division is vectorised over the whole column, so it is done once;
    # this also creates the column when there are no groups at all.
    data_[summed_column] = data_[frequency_column] / \
        (data_[upper_date] - data_[lower_date])

    return data_


def calculate_frequency(summed, lower_date, upper_date):
    """Accumulate the per-year share of every row into a yearly series.

    Args:
        summed: DataFrame with the two date-bound columns and
            ``summed_column``.
        lower_date: Name of the column holding the lower date bound.
        upper_date: Name of the column holding the upper date bound.

    Returns:
        Series named ``"Frequency"`` indexed by ``"Year"``. Each row adds its
        ``summed_column`` value to every year in ``range(lower, upper)``, so
        the upper bound itself is excluded.
    """
    cleaned, date_dict = create_date_dict_and_remove_empty(
        summed, lower_date, upper_date)

    # Convert the bounds once instead of once per row.
    lowers = cleaned[lower_date].astype(int)
    uppers = cleaned[upper_date].astype(int)

    for lower, upper, share in zip(lowers, uppers, cleaned[summed_column]):
        for year in range(lower, upper):
            date_dict[year] += share

    series = pd.Series(date_dict, name="Frequency")
    series.index.name = "Year"

    return series


def calculate_frequency_data(lower_date, upper_date):
    """Return the yearly frequency series of every fabric.

    The result maps the upper-cased fabric code to the series computed by
    ``calculate_frequency`` from that fabric's rows of the summed frequencies.
    """
    summed = calculate_summed_frequency(lower_date, upper_date)
    return {
        name.upper(): calculate_frequency(
            summed[summed[fabric_column] == name], lower_date, upper_date)
        for name in fabric_types
    }


def full_frequency(title, file_name, lower_date, upper_date):
    """Simulate dates, compute the frequency curves and save the figure.

    Stores the simulated dates in ``processed["random_dates"]`` (100 draws
    per row, weighted by ``frequency_column``) before plotting.
    """
    simulated = simulate_dates(
        data=processed,
        lower_date=lower_date,
        upper_date=upper_date,
        size=100,
        column=frequency_column,
    )
    processed["random_dates"] = simulated

    observed = calculate_frequency_data(lower_date, upper_date)
    plot_and_save(
        observed=observed,
        input_data=processed,
        title=title,
        y_label="Frequency",
        file_name=file_name,
        xlims=[-300, 800],
    )


# Type Dates
full_frequency(
    title="Type Dates",
    file_name="type_dates_freq",
    lower_date=chronological_lower_date,
    upper_date=chronological_upper_date,
)

# Deposit Dates
full_frequency(
    title="Deposit dates",
    file_name="deposit_dates_freq",
    lower_date=deposit_lower_date,
    upper_date=deposit_upper_date,
)

# Clean up values to not use space
processed = None

In [None]:
location_id_column = "location_id"

processed = pd.read_csv(
    processed_data_path,
    usecols=[
        id_column,
        chronological_lower_date,
        chronological_upper_date,
        deposit_lower_date,
        deposit_upper_date,
        fabric_column,
        location_id_column,
    ],
    encoding="latin-1",
)

# Code breaks if location id is None, so missing ids are replaced with 1
# (NaN is the only value that differs from itself).
processed[location_id_column] = processed[location_id_column].map(
    lambda value: 1 if value != value else int(value))


def calculate_site_count(group, lower_date, upper_date):
    """Count, for every year, the distinct location ids dated to that year.

    Args:
        group: DataFrame with the two date-bound columns and a
            ``location_id_column`` whose cells are lists of location ids.
        lower_date: Name of the column holding the lower date bound.
        upper_date: Name of the column holding the upper date bound.

    Returns:
        Series named ``"Site Count"`` indexed by ``"Year"``. A row contributes
        its location ids to every year with ``lower <= year <= upper``; the
        years covered are the keys produced by
        ``create_date_dict_and_remove_empty``.
    """
    cleaned, date_dict = create_date_dict_and_remove_empty(
        group, lower_date, upper_date)

    # Convert the bounds once instead of once per (year, row) pair.
    intervals = list(zip(
        cleaned[lower_date].astype(int).tolist(),
        cleaned[upper_date].astype(int).tolist(),
        cleaned[location_id_column].tolist(),
    ))

    for year in date_dict:
        sites = set()
        for lower, upper, location_ids in intervals:
            if lower <= year <= upper:
                sites.update(location_ids)
        # Stays 0 when no row covers the year.
        date_dict[year] = len(sites)

    series = pd.Series(date_dict, name="Site Count")
    series.index.name = "Year"

    return series


def site_count_calculation(lower_date, upper_date):
    """Return the yearly site-count series of every fabric.

    Rows of ``processed`` with both date bounds present are grouped by
    ``id_column``, the bounds and ``fabric_column``; each group's location ids
    are collected into a list. The result maps the upper-cased fabric code to
    the series computed by ``calculate_site_count``.
    """
    dated = processed.dropna(subset=[lower_date, upper_date])
    sites_per_group = (
        dated
        .groupby([id_column, lower_date, upper_date, fabric_column])[location_id_column]
        .apply(list)
        .reset_index()
    )

    return {
        name.upper(): calculate_site_count(
            sites_per_group[sites_per_group[fabric_column] == name],
            lower_date, upper_date)
        for name in fabric_types
    }


def full_site_count(title, file_name, lower_date, upper_date):
    """Simulate dates, compute the site-count curves and save the figure.

    Only rows with both date bounds present are simulated (100 draws, count
    1); the result is stored in ``processed["random_dates"]``.
    """
    def model_row(row):
        return tempun.model_date(
            start=row[lower_date], stop=row[upper_date], size=100, count=1)

    dated = processed.dropna(subset=[lower_date, upper_date])
    processed["random_dates"] = dated.apply(model_row, axis=1)

    observed = site_count_calculation(lower_date, upper_date)
    plot_and_save(
        observed=observed,
        input_data=processed,
        title=title,
        y_label="Site Count",
        file_name=file_name,
        xlims=[-300, 800],
    )


# Type Dates
full_site_count(
    title="Type Dates",
    file_name="type_dates_site_count",
    lower_date=chronological_lower_date,
    upper_date=chronological_upper_date,
)

# Deposit Dates
full_site_count(
    title="Deposit dates",
    file_name="deposit_dates_site_count",
    lower_date=deposit_lower_date,
    upper_date=deposit_upper_date,
)

# Clean up values to not use space
processed = None

In [None]:

processed = pd.read_csv(
    processed_data_path,
    usecols=[
        id_column,
        chronological_lower_date,
        chronological_upper_date,
        deposit_lower_date,
        deposit_upper_date,
        fabric_column,
        "standard_form",
    ],
    encoding="latin-1",
)

# The location id column is not loaded in this cell, so the missing-id
# replacement used for the site count is not applied here.


def calculate_type_count(group, lower_date, upper_date):
    """Count, for every year, the distinct form ids dated to that year.

    Args:
        group: DataFrame with the two date-bound columns and an ``id_column``
            whose cells are lists of ids.
        lower_date: Name of the column holding the lower date bound.
        upper_date: Name of the column holding the upper date bound.

    Returns:
        Series named ``"Type Count"`` indexed by ``"Year"``. A row contributes
        its ids to every year with ``lower <= year <= upper``; the years
        covered are the keys produced by
        ``create_date_dict_and_remove_empty``.
    """
    cleaned, date_dict = create_date_dict_and_remove_empty(
        group, lower_date, upper_date)

    # Convert the bounds once instead of once per (year, row) pair.
    intervals = list(zip(
        cleaned[lower_date].astype(int).tolist(),
        cleaned[upper_date].astype(int).tolist(),
        cleaned[id_column].tolist(),
    ))

    for year in date_dict:
        type_ids = set()
        for lower, upper, ids in intervals:
            if lower <= year <= upper:
                type_ids.update(ids)
        # Stays 0 when no row covers the year.
        date_dict[year] = len(type_ids)

    series = pd.Series(date_dict, name="Type Count")
    series.index.name = "Year"

    return series


def type_count_calculation(lower_date, upper_date):
    """Return the yearly type-count series of every fabric.

    Rows of ``processed`` are grouped by ``"standard_form"``, the date bounds
    and ``fabric_column``; each group's ``id_column`` values are collected
    into a list. The result maps the upper-cased fabric code to the series
    computed by ``calculate_type_count``.
    """
    ids_per_group = (
        processed
        .groupby(["standard_form", lower_date, upper_date, fabric_column])[id_column]
        .apply(list)
        .reset_index()
    )

    return {
        name.upper(): calculate_type_count(
            ids_per_group[ids_per_group[fabric_column] == name],
            lower_date, upper_date)
        for name in fabric_types
    }


def full_type_count(title, file_name, lower_date, upper_date):
    """Simulate dates, compute the type-count curves and save the figure.

    Only rows with both date bounds present are simulated (100 draws, count
    1); the result is stored in ``processed["random_dates"]``.
    """
    def model_row(row):
        return tempun.model_date(
            start=row[lower_date], stop=row[upper_date], size=100, count=1)

    dated = processed.dropna(subset=[lower_date, upper_date])
    processed["random_dates"] = dated.apply(model_row, axis=1)

    observed = type_count_calculation(lower_date, upper_date)
    plot_and_save(
        observed=observed,
        input_data=processed,
        title=title,
        y_label="Type Count",
        file_name=file_name,
        xlims=[-300, 800],
    )


# Type Dates
full_type_count(
    title="Type Dates",
    file_name="type_dates_type_count",
    lower_date=chronological_lower_date,
    upper_date=chronological_upper_date,
)

# Deposit Dates
full_type_count(
    title="Deposit dates",
    file_name="deposit_dates_type_count",
    lower_date=deposit_lower_date,
    upper_date=deposit_upper_date,
)

# Clean up values to not use space
processed = None

In [None]:
# Plot all datasets
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the full table; the calculation functions read the module-level
# ``processed`` frame.
processed = pd.read_csv(processed_data_path)
processed[frequency_column] = 1

# Apply the same missing-location replacement as the site-count cell.
# Without it every NaN id is counted as a separate site, because NaN never
# compares equal to itself inside a set.
processed[location_id_column] = processed[location_id_column].apply(
    lambda x: int(x) if x == x else 1)

freq_deposit = calculate_frequency_data(deposit_lower_date, deposit_upper_date)
freq_chronological = calculate_frequency_data(
    chronological_lower_date, chronological_upper_date)

site_deposit = site_count_calculation(deposit_lower_date, deposit_upper_date)
site_chronological = site_count_calculation(
    chronological_lower_date, chronological_upper_date)

type_deposit = type_count_calculation(deposit_lower_date, deposit_upper_date)
type_chronological = type_count_calculation(
    chronological_lower_date, chronological_upper_date)

# Row-major panel order: (type dates, deposit dates) for frequency, site
# count and type count.
datasets = [freq_chronological, freq_deposit, site_chronological,
            site_deposit, type_chronological, type_deposit]

sns.set_style("white", {"font.family": "serif",
              "font.serif": "Times New Roman"})

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))

# Same x range on all six panels, set before anything is drawn.
for ax in axs.ravel():
    ax.set_xlim([-300, 700])

for ax, df in zip(axs.ravel(), datasets):
    plot_graph(dicts_of_df=df,
               palette=list(colors_dict.values()),
               ax=ax,
               linewidth=2)

    ax.set_ylim(0, None)
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.0f"))
    ax.set_xlabel(None)
    ax.get_legend().remove()
    ax.set_ylabel(None)
    ax.set_xticklabels([int(i) if i != 0 else 1 for i in ax.get_xticks()])

# Every panel carries the same labels, so one figure-level legend built from
# the first panel is enough; adding it per panel stacked six legends.
handles, labels = axs[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.985, 0.957), fontsize=8)

axs[0, 0].set_ylabel("Frequency")
axs[1, 0].set_ylabel("Site count")
axs[2, 0].set_ylabel("Type count")

axs[0, 0].set_title(label="Type dates", size=15, y=1.04)
axs[0, 1].set_title(label="Deposit dates", size=15, y=1.04)

# Panel letters; the y positions are hand-picked data coordinates per panel.
axs[0, 0].text(-280, 29, "A", fontsize=25)
axs[0, 1].text(-280, 21, "B", fontsize=25)
axs[1, 0].text(-280, 146, "C", fontsize=25)
axs[1, 1].text(-280, 27, "D", fontsize=25)
axs[2, 0].text(-280, 241, "E", fontsize=25)
axs[2, 1].text(-280, 169, "F", fontsize=25)

fig.supxlabel("Year")
plt.tight_layout()
plt.savefig("./data/figures/combined.png")