In [None]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats
import matplotlib.pyplot as plt

def handle_missing_data(df):
    """Return a new DataFrame with missing values replaced by 0.

    Cells equal to the string "*" are first converted to NaN, then every
    NaN (including ones already present in ``df``) is filled with 0. The
    input frame is not modified.

    Parameters
    ----------
    df : pandas DataFrame to clean.

    Returns
    -------
    pandas DataFrame with "*" and NaN cells set to 0.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df = df.replace("*", np.nan)
    return df.fillna(0)


def convert_feature_types(df):
    """Convert the zone columns to categoricals and the rate columns to floats.

    The frame is modified in place and nothing is returned.

    * "Data_Zone", "Intermediate_Zone" and "Council_area" are cast to the
      pandas ``category`` dtype.
    * Every column in ``to_float_features`` has trailing "%" characters
      stripped, is parsed as float and is divided by 100.

    Parameters
    ----------
    df : pandas DataFrame containing all of the columns named below.

    Raises
    ------
    KeyError if one of the expected columns is missing.
    ValueError if a value cannot be parsed as a float after stripping "%".
    """
    to_category_features = ["Data_Zone", "Intermediate_Zone", "Council_area"]
    # NOTE(review): every column listed here is divided by 100, including
    # "crime_count" and "Attainment" -- confirm that scaling is intended.
    to_float_features = [
        "income_rate",
        "employment_rate",
        "crime_rate",
        "DEPRESS",
        "LBWT",
        "Attendance",
        "not_participating",
        "University",
        "overcrowded_rate",
        "nocentralheating_rate",
        "broadband",
        "crime_count",
        "Attainment",
    ]
    for feature in to_category_features:
        df[feature] = df[feature].astype("category")
    for feature in to_float_features:
        # Cast to str before using the .str accessor: on a mixed column the
        # accessor yields NaN for non-string cells (e.g. a numeric 0 written
        # by fillna), and it raises outright on a purely numeric column.
        as_text = df[feature].astype(str)
        df[feature] = as_text.str.rstrip("%").astype(float) / 100


def get_data(data_fpath="../data/glasgow-litter-simd2020v2.csv"):
    """Load the CSV at ``data_fpath`` and return it cleaned and typed.

    The file is read with its first row as the header, passed through
    ``handle_missing_data`` and then converted in place by
    ``convert_feature_types``.

    Parameters
    ----------
    data_fpath : path of the CSV file to read.

    Returns
    -------
    pandas DataFrame produced by the steps above.
    """
    cleaned = handle_missing_data(pd.read_csv(data_fpath, header=0))
    # convert_feature_types mutates its argument and returns nothing.
    convert_feature_types(cleaned)
    return cleaned


def get_data_split(valid_frac=0.2, test_frac=0.1):
    """Shuffle the data set and split it into train, validation and test parts.

    The rows returned by ``get_data()`` are shuffled with a fixed seed
    (``random_state=1``) and cut at two positions derived from the fractions.

    Parameters
    ----------
    valid_frac : fraction of rows placed in the validation part.
    test_frac : fraction of rows placed in the test part; the training
        fraction is ``1 - valid_frac - test_frac``.

    Returns
    -------
    list of three pandas DataFrames: ``[train, valid, test]``.
    """
    train_frac = 1 - valid_frac - test_frac
    df = get_data()
    shuffled = df.sample(frac=1, random_state=1)
    train_end = int(train_frac * len(df))
    valid_end = int((train_frac + valid_frac) * len(df))
    # Slice with iloc instead of np.split: np.split on a DataFrame goes through
    # DataFrame.swapaxes, which pandas has deprecated. The cut points and the
    # resulting parts are the same.
    return [
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:valid_end],
        shuffled.iloc[valid_end:],
    ]

def get_predicted_counts(model, X):
    """Return the model's predicted means for ``X``, rounded to whole numbers.

    Parameters
    ----------
    model : fitted model exposing ``get_prediction(X)`` whose result has a
        ``summary_frame()`` method returning a frame with a "mean" column.
    X : predictors passed straight to ``model.get_prediction``.

    Returns
    -------
    The "mean" column of the prediction summary, rounded to 0 decimals.
    """
    summary = model.get_prediction(X).summary_frame()
    return summary["mean"].round()

def scatter_plot(model, X, y):
    """Show a scatter plot of predicted against actual litter counts.

    Parameters
    ----------
    model : fitted model handed to ``get_predicted_counts``.
    X : predictors handed to ``get_predicted_counts``.
    y : object indexable by "litter" that yields the actual counts.
    """
    predicted = get_predicted_counts(model, X)
    actual = y["litter"]

    figure = plt.figure()
    figure.suptitle("Predicted vs Actual Litter Counts in Glasgow City")
    # Predictions go on the horizontal axis, observations on the vertical one.
    plt.scatter(x=predicted, y=actual)
    plt.xlabel("Predicted Litter")
    plt.ylabel("Actual Litter")
    plt.show()

def bar_plot(model, X, y, n_bars=20, bar_width=0.2):
    """Show grouped bars of predicted litter, actual litter and their error.

    For the first ``n_bars`` rows of ``X`` three bars are drawn side by side
    per data zone: the rounded prediction, the actual count and the absolute
    difference between the two. Each bar is annotated with its height.

    Parameters
    ----------
    model : fitted model handed to ``get_predicted_counts``.
    X : predictors; its index is used to look up the "Data_Zone" labels.
    y : object indexable by "litter" that yields the actual counts.
    n_bars : number of leading rows to plot.
    bar_width : width of each individual bar.
    """
    indices = X.index[:n_bars]
    df = get_data()
    # Positional lookup with the index values of X.
    # NOTE(review): this assumes X keeps the default integer row labels of the
    # frame returned by get_data() -- confirm against the caller.
    labels = df.iloc[indices]["Data_Zone"]
    predicted_counts, actual_counts = get_predicted_counts(model, X), y["litter"]
    y_predicted = predicted_counts[:n_bars]
    y_actual = actual_counts[:n_bars]
    y_err = abs(y_predicted - y_actual)

    fig, ax = plt.subplots()
    fig.set_figwidth(10)
    fig.set_figheight(10)
    x = np.arange(len(labels))
    # Bars are centred on their x position, so offsets of -width, 0 and +width
    # place the three bars edge to edge around each tick. The previous offsets
    # (-w/2, +w/2, +w) made the error bar overlap half of the actual bar.
    rects1 = ax.bar(x - bar_width, y_predicted, bar_width, label="Predicted Litter")
    rects2 = ax.bar(x, y_actual, bar_width, label="Actual Litter")
    rects3 = ax.bar(x + bar_width, y_err, bar_width, label="Error")
    ax.set_ylabel("Litter")
    ax.set_title("Predicted vs Actual Litter in Glasgow City by Data Zone")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=90)
    ax.legend()
    for rects in [rects1, rects2, rects3]:
        for rect in rects:
            height = rect.get_height()
            ax.annotate(
                f"{height}",
                # Anchor at the horizontal centre of the bar (left edge plus
                # half the width); the text itself is centred there by `ha`.
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
            )
    fig.tight_layout()
    plt.show()

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Starting from no predictors, each round fits one OLS model per remaining
    column (added to the already selected ones) and keeps the column whose
    model has the highest adjusted R-squared. Selection stops as soon as the
    best candidate does not strictly improve on the current score, or when
    no columns remain.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        # Tuples compare by score first, then by column name on equal scores.
        best_new_score, best_candidate = max(scores_with_candidates)
        # Stop unless the score strictly improves. The previous loop condition
        # (current_score == best_new_score) never terminated when the best
        # candidate exactly tied the current score, because nothing changed
        # between iterations.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model