# Make data dictionaries

In [1]:
import sys
import utils
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [3]:
def make_corr_plots(df, figname):
    """Draw a correlation heatmap for ``df`` and save it under ``../data/``.

    Args:
        df: DataFrame whose pairwise column correlations are plotted.
        figname: Output image path, relative to ``../data/``.
    """
    # Restrict the correlation to numeric columns explicitly: since pandas 2.0
    # DataFrame.corr() no longer drops non-numeric columns on its own and
    # raises when it meets e.g. string columns.
    corr = df.corr(numeric_only=True)
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 10))
    # heatmap draws onto the Axes we pass in, so the figure to save is ``f``.
    sns.heatmap(corr, ax=ax)
    f.tight_layout()

    f.savefig("../data/" + figname)

In [4]:
def get_df_overview(df, basename, overview):
    """Fill ``overview`` with summary statistics of ``df`` and return it.

    The module-level ``overview_temp`` template is formatted with seven
    positional values: column count, row count, number of rows containing a
    missing value and its percentage, number of duplicated rows and its
    percentage, and the deep memory usage in MiB (rounded to 5 decimals).
    Every ``key: value`` line of the formatted text is then stored in
    ``overview``.

    Args:
        df: DataFrame to summarise.
        basename: Dataset name. Not used by this function; kept so existing
            callers keep working.
        overview: Dict that is updated in place with the parsed entries.

    Returns:
        The same ``overview`` dict that was passed in.
    """
    n_rows = len(df)
    # Compute each count once; it is reported both raw and as a percentage.
    rows_with_na = df.isna().any(axis=1).sum()
    duplicated_rows = df.duplicated().sum()

    def _percent(count):
        # An empty frame has no rows to take a share of; report 0 rather than
        # dividing by zero.
        return count / n_rows * 100 if n_rows else 0.0

    pd_overview = overview_temp.format(
        len(df.columns),
        n_rows,
        rows_with_na,
        _percent(rows_with_na),
        duplicated_rows,
        _percent(duplicated_rows),
        round(df.memory_usage(deep=True).sum() / (1024**2), 5),
    )

    # Parse each line into a key-value pair
    for line in pd_overview.strip().split("\n"):
        key, sep, value = line.strip().partition(":")
        if not sep:
            # Blank or colon-less template lines carry no key/value pair;
            # skip them instead of failing on the unpack.
            continue
        # Drop surrounding whitespace and single quotes from the key, and
        # additionally trailing commas from the value.
        overview[key.strip("' ")] = value.strip("' ,")
    return overview

In [5]:
def get_data_dict(var_list):
    """Render (name, description) pairs as the lines of a Markdown table.

    Args:
        var_list: Iterable of two-item pairs, e.g. ``dict.items()``.

    Returns:
        A list of strings: the table header, the separator row, then one row
        per pair with the name wrapped in backticks.
    """
    header = [
        "| Variable Name | Description |",
        "| ------------- | ----------- |",
    ]
    rows = [f"| `{name}` | {description} |" for name, description in var_list]
    return header + rows

In [6]:
# Pull the shared configuration out of the project-local ``utils`` module.
# ``data_dict`` is iterated per dataset name below, ``overview_temp`` is read
# as a global by get_df_overview(), and ``data_dict_temp`` is the template
# formatted into each generated Markdown file.
data_dict = utils.data_dict
overview_temp = utils.overview_temp
data_dict_temp = utils.data_dict_temp

In [None]:
# Generate one Markdown data-dictionary file per dataset listed in data_dict.
for base_name, entry in data_dict.items():
    # Markdown table rows describing each variable of the dataset.
    data_dict_str = get_data_dict(entry["var_list"].items())

    df = pd.read_csv(f"../{entry['data_path']}")

    overview = get_df_overview(df, base_name, entry["overview"])

    # The figure name is relative to ../data/, the directory that receives
    # both the saved heatmap and the Markdown file written below.
    figname = f"figs/corr_{base_name}.png"
    make_corr_plots(df, figname)
    # Template arguments, in order: overview table, variable table, figure
    # path, and the first rows of the data as a table.
    data_dict_md = data_dict_temp.format(
        pd.DataFrame(
            overview.items(),
            columns=["Properties", "Value"],
        ).to_markdown(index=False),
        "\n".join(data_dict_str),
        figname,
        df.head().to_markdown(index=False),
    )

    # Write as UTF-8 explicitly so non-ASCII text in the data or descriptions
    # does not depend on (or fail under) the platform's default encoding.
    with open(f"../data/dataset_{base_name}.md", "w", encoding="utf-8") as f:
        f.write(data_dict_md)