In [3]:
# Data
import pandas as pd
import numpy as np

# Models
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
import warnings
warnings.filterwarnings("ignore")

# Metrics
from sklearn.metrics import mean_absolute_error

# Plot
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Load the demographics CSV from the data_clean folder (path is relative to the working directory).
demographics = pd.read_csv("../data_clean/demographics.csv")

In [5]:
# Names of the demographic indicators to forecast; each entry is used as a
# column label of the demographics frame by the forecasting loops.
INDICATORS = [
    "Total population",
    "Population| Age group |0 tot 15 year",
    "Population| Age group |15 tot 25 year",
    "Population| Age group |25 tot 45 year",
    "Population| Age group |45 tot 65 year",
    "Population| Age group |65+ year",
    "Total households",
    "Single-person households",
    "Households with children",
    "Households without children",
    "Births"
]


In [6]:
# Cast Year to integer, then order rows by neighbourhood and year and renumber
# the index. Each step returns a new frame, so the chain replaces the explicit copy.
demographics = (
    demographics
    .astype({"Year": int})
    .sort_values(["Districts & Neigbourhoods", "Year"])
    .reset_index(drop=True)
)


In [7]:
# Inspect the Year column after the integer cast and the sort.
demographics["Year"]

0       2020
1       2021
2       2022
3       2023
4       2024
        ... 
3561    2025
3562    2020
3563    2021
3564    2022
3565    2025
Name: Year, Length: 3566, dtype: int64

In [8]:
def get_valid_neighbourhoods(df, indicator, min_non_zero=3, min_obs=4):
    """Return the neighbourhoods that have enough data for `indicator`.

    A neighbourhood qualifies when its `indicator` column holds at least
    `min_non_zero` strictly positive values and at least `min_obs`
    non-missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Districts & Neigbourhoods" and `indicator`.
    indicator : str
        Column to inspect.
    min_non_zero : int, default 3
        Minimum number of values greater than zero.
    min_obs : int, default 4
        Minimum number of non-missing values.

    Returns
    -------
    list
        Qualifying neighbourhood names, in sorted group order.
    """
    key = df["Districts & Neigbourhoods"]
    values = df[indicator]

    # Both counts are order-independent, so no per-group sort by year is needed.
    # Missing values compare as False in `gt(0)` and therefore never count as positive.
    positive_counts = values.gt(0).groupby(key).sum()
    observed_counts = values.notna().groupby(key).sum()

    keep = (positive_counts >= min_non_zero) & (observed_counts >= min_obs)
    return keep[keep].index.tolist()


In [9]:
# Spot-check: first ten neighbourhoods that pass the data filter for total population.
get_valid_neighbourhoods(demographics, "Total population")[:10]

['AMC',
 'Aalsmeerwegbuurt Oost',
 'Aalsmeerwegbuurt West',
 'Afrikahaven',
 'Alexanderplein e.o.',
 'Amerikahaven',
 'Amstel III deel A/B Noord',
 'Amstel III deel A/B Zuid',
 'Amstel III deel C/D Noord',
 'Amstel III deel C/D Zuid']

In [None]:
# NOTE(review): this unexecuted cell repeats the spot-check of the preceding cell;
# candidate for removal.
get_valid_neighbourhoods(demographics, "Total population")[:10]


In [10]:
def drift_model(series, steps=3):
    """Forecast by extending the line through the first and last observations.

    The slope is the average change per position between the first and the
    last non-missing value; the forecast continues that line past the end of
    `series`.

    Parameters
    ----------
    series : pandas.Series or array-like of numbers
        Observations in chronological order; may contain NaN.
    steps : int, default 3
        Number of future positions to forecast.

    Returns
    -------
    list of float
        `steps` forecasts. All NaN when `series` has no observed value; the
        single observed value repeated when there is only one.
    """
    values = np.asarray(series, dtype=float)
    observed = np.flatnonzero(~np.isnan(values))

    # Empty or all-missing input: there is nothing to extrapolate from.
    if observed.size == 0:
        return [np.nan] * steps

    first_pos, last_pos = observed[0], observed[-1]
    last_value = values[last_pos]

    # A single observation carries no trend, so repeat it.
    if first_pos == last_pos:
        return [last_value] * steps

    # Divide by the positional distance so interior gaps do not inflate the slope.
    slope = (last_value - values[first_pos]) / (last_pos - first_pos)

    # Trailing missing entries still count as elapsed periods before the forecast starts.
    offset = len(values) - 1 - last_pos
    return [last_value + slope * (offset + i + 1) for i in range(steps)]


In [11]:
def ols_model(series, steps=3):
    """Forecast with a least-squares straight line fitted over position.

    Parameters
    ----------
    series : pandas.Series or array-like of numbers
        Observations in chronological order; may contain NaN.
    steps : int, default 3
        Number of future positions to forecast.

    Returns
    -------
    numpy.ndarray
        `steps` forecasts for the positions directly after the end of
        `series`. All NaN when nothing is observed; a constant when only one
        value is observed.
    """
    y = np.asarray(series, dtype=float)
    x = np.arange(len(y))
    future_x = np.arange(len(y), len(y) + steps)

    # Fit only on observed points, but keep their original positions so gaps
    # do not compress the time axis.
    observed = ~np.isnan(y)
    n_observed = int(observed.sum())

    if n_observed == 0:
        return np.full(steps, np.nan)
    if n_observed == 1:
        # A line through one point is under-determined; fall back to a flat forecast.
        return np.full(steps, y[observed][0])

    slope, intercept = np.polyfit(x[observed], y[observed], 1)
    return slope * future_x + intercept


In [16]:
def exp_model(series, steps=3):
    """Forecast `steps` periods ahead with simple exponential smoothing.

    Fits statsmodels' SimpleExpSmoothing on `series` with the fit's default
    settings and returns the forecast values as a NumPy array.
    """
    smoother = SimpleExpSmoothing(series)
    fitted = smoother.fit()
    horizon = fitted.forecast(steps)
    return horizon.values


In [17]:
def rolling_error(series, model_func, min_train=3):
    """Mean absolute one-step-ahead error over an expanding training window.

    For every split point `i` from `min_train` to the end of `series`, the
    model is given `series[:i]` and asked for one forecast, which is compared
    with `series[i]`.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    model_func : callable
        Called as `model_func(train, 1)`; the first element of its result is
        taken as the forecast.
    min_train : int, default 3
        Number of leading observations in the first training window.

    Returns
    -------
    float
        Mean absolute error over the usable splits, or NaN when there is none.
    """
    abs_errors = []

    for split in range(min_train, len(series)):
        train = series.iloc[:split]
        actual = series.iloc[split]

        try:
            pred = float(model_func(train, 1)[0])
        except Exception:
            # Best effort: skip a window the model cannot handle. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit still stop a long run.
            continue

        # Skip splits where either side is missing.
        if not np.isnan(pred) and not np.isnan(actual):
            abs_errors.append(abs(actual - pred))

    return np.mean(abs_errors) if abs_errors else np.nan


In [None]:
def evaluate_models(df, indicator):
    """Score the three candidate models per neighbourhood for one indicator.

    Neighbourhoods are first filtered with get_valid_neighbourhoods. For each
    remaining neighbourhood that has at least four rows, the rolling
    one-step-ahead error of the drift, OLS and exponential-smoothing models is
    computed on the year-indexed indicator series.

    Returns a DataFrame with columns "Neighbourhood", "Drift", "OLS", "EXP".
    """
    group_col = "Districts & Neigbourhoods"

    keep = get_valid_neighbourhoods(df, indicator)
    eligible = df[df[group_col].isin(keep)]

    candidates = (drift_model, ols_model, exp_model)
    records = []

    for neighbourhood, frame in eligible.groupby(group_col):
        history = frame.sort_values("Year").set_index("Year")[indicator]

        # Too short for any rolling evaluation.
        if len(history) >= 4:
            scores = [rolling_error(history, candidate) for candidate in candidates]
            records.append([neighbourhood, *scores])

    return pd.DataFrame(records, columns=["Neighbourhood","Drift","OLS","EXP"])


In [18]:
# Preview the per-neighbourhood rolling errors of each model for total population.
evaluate_models(demographics, "Total population").head()

Unnamed: 0,Neighbourhood,Drift,OLS,EXP
0,AMC,4.722222,4.277778,3.333333
1,Aalsmeerwegbuurt Oost,22.5,30.0,0.0
2,Aalsmeerwegbuurt West,32.5,26.666667,10.0
3,Afrikahaven,1.805556,2.611111,2.960595e-16
4,Alexanderplein e.o.,3.472222,3.888889,3.267442


In [19]:
def choose_best_model(result_df):
    """Print and return the model with the lowest average error.

    `result_df` must have the columns "Drift", "OLS" and "EXP"; each column
    is averaged and the name of the column with the smallest mean is
    returned. The winner and the three averages are printed as a side effect.
    """
    model_columns = ["Drift", "OLS", "EXP"]
    mean_by_model = result_df.loc[:, model_columns].mean()
    winner = mean_by_model.idxmin()

    print("Best model:", winner)
    print(mean_by_model)

    return winner


In [20]:
# Evaluate all three models on total population and report the one with the lowest mean error.
res = evaluate_models(demographics, "Total population")
choose_best_model(res)

Best model: EXP
Drift    166.140826
OLS      149.548962
EXP      115.422165
dtype: float64


'EXP'

In [21]:
# Evaluate the models on births once and reuse the result; the evaluation
# refits every model per neighbourhood, so running it twice doubles the work.
births_errors = evaluate_models(demographics, "Births")
choose_best_model(births_errors)

Best model: EXP
Drift    10.559034
OLS       9.772397
EXP       6.542272
dtype: float64


'EXP'

In [25]:
def best_model_for_indicator(df, indicator):
    """Return the name of the model with the lowest mean rolling error.

    Runs evaluate_models for `indicator`, averages the "Drift", "OLS" and
    "EXP" columns over all neighbourhoods and returns the label of the
    smallest average. Nothing is printed.
    """
    per_neighbourhood = evaluate_models(df, indicator)
    mean_error = per_neighbourhood.loc[:, ["Drift", "OLS", "EXP"]].mean()
    winner = mean_error.idxmin()
    return winner


In [22]:
def forecast_series(series, model, steps=3):
    """Forecast `steps` periods of `series` with the named model.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    model : str
        One of "Drift", "OLS" or "EXP".
    steps : int, default 3
        Number of periods to forecast.

    Returns
    -------
    list or numpy.ndarray
        The forecasts produced by the selected model function.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    if model == "Drift":
        return drift_model(series, steps)

    if model == "OLS":
        return ols_model(series, steps)

    if model == "EXP":
        return exp_model(series, steps)

    # Fail loudly: falling through would return None and make callers fail
    # later with an unrelated-looking error.
    raise ValueError(
        f"Unknown model {model!r}; expected one of 'Drift', 'OLS', 'EXP'"
    )


In [23]:
def forecast_all_indicators(df, indicators=None, forecast_years=(2026, 2027, 2028)):
    """Forecast every indicator per neighbourhood with one model per indicator.

    For each indicator the model with the lowest mean rolling error across
    neighbourhoods (best_model_for_indicator) is used for all neighbourhoods
    that pass get_valid_neighbourhoods.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Districts & Neigbourhoods", "Year" and the indicator columns.
    indicators : iterable of str, optional
        Indicator columns to forecast. Defaults to the module-level INDICATORS.
    forecast_years : sequence of int, default (2026, 2027, 2028)
        Year labels for the forecast steps; its length is the forecast horizon.

    Returns
    -------
    pandas.DataFrame
        Columns "Neighbourhood", "Indicator", "Year", "Forecast", "Model".
    """
    if indicators is None:
        indicators = INDICATORS

    forecast_years = list(forecast_years)
    # Derive the horizon from the labels so the two cannot drift apart.
    horizon = len(forecast_years)

    rows = []

    for indicator in indicators:

        print("Processing:", indicator)

        model = best_model_for_indicator(df, indicator)

        valid_neigh = get_valid_neighbourhoods(df, indicator)
        sub = df[df["Districts & Neigbourhoods"].isin(valid_neigh)]

        for name, g in sub.groupby("Districts & Neigbourhoods"):

            series = g.sort_values("Year").set_index("Year")[indicator]

            pred = forecast_series(series, model, horizon)

            # NOTE(review): labels are applied positionally. A series whose last
            # observation is not the year before forecast_years[0] still gets
            # these labels - confirm that is intended.
            for yr, val in zip(forecast_years, pred):
                rows.append([name, indicator, yr, val, model])

    return pd.DataFrame(rows, columns=[
        "Neighbourhood","Indicator","Year","Forecast","Model"
    ])


In [26]:
# Run the per-indicator forecast for every indicator and preview the long-format result.
forecast_demographic = forecast_all_indicators(demographics)

forecast_demographic.head()


Processing: Total population
Processing: Population| Age group |0 tot 15 year
Processing: Population| Age group |15 tot 25 year
Processing: Population| Age group |25 tot 45 year
Processing: Population| Age group |45 tot 65 year
Processing: Population| Age group |65+ year
Processing: Total households
Processing: Single-person households
Processing: Households with children
Processing: Households without children
Processing: Births


Unnamed: 0,Neighbourhood,Indicator,Year,Forecast,Model
0,AMC,Total population,2026,31.521975,EXP
1,AMC,Total population,2027,31.521975,EXP
2,AMC,Total population,2028,31.521975,EXP
3,Aalsmeerwegbuurt Oost,Total population,2026,2165.0,EXP
4,Aalsmeerwegbuurt Oost,Total population,2027,2165.0,EXP


In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [30]:
def evaluate_series(series, model_func):
    """Rolling one-step-ahead MAE and RMSE of a model on a single series.

    Starting with the first three observations as training data, the model
    forecasts the next value, the window grows by one, and so on to the end
    of the series.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    model_func : callable
        Called as `model_func(train, 1)`; the first element of its result is
        taken as the forecast.

    Returns
    -------
    (float, float)
        Mean absolute error and root mean squared error over the usable
        splits, or (NaN, NaN) when no split produced a usable pair.
    """
    preds = []
    trues = []

    for i in range(3, len(series)):
        train = series.iloc[:i]
        test = series.iloc[i]

        try:
            pred = float(model_func(train, 1)[0])
        except Exception:
            # Same best-effort policy as rolling_error: one window the model
            # cannot handle should not abort the whole evaluation.
            continue

        # Skip splits where either side is missing.
        if not np.isnan(pred) and not np.isnan(test):
            preds.append(pred)
            trues.append(test)

    if not preds:
        return np.nan, np.nan

    # Same definitions as sklearn's mean_absolute_error and mean_squared_error.
    residuals = np.asarray(trues, dtype=float) - np.asarray(preds, dtype=float)
    mae = np.mean(np.abs(residuals))
    rmse = np.sqrt(np.mean(residuals ** 2))

    return mae, rmse


In [31]:
def best_model_with_metrics(series):
    """Pick the model with the lowest rolling MAE for one series.

    Each of the drift, OLS and exponential-smoothing models is scored with
    evaluate_series. A model whose MAE is NaN is ranked last via a large
    sentinel value. Returns `(model_name, mae, rmse)` for the winner.
    """
    candidates = (
        ("Drift", drift_model),
        ("OLS", ols_model),
        ("EXP", exp_model),
    )
    scores = {label: evaluate_series(series, func) for label, func in candidates}

    def _mae_rank(entry):
        # entry is (label, (mae, rmse)); NaN MAE sorts behind any real score.
        mae = entry[1][0]
        return 1e18 if np.isnan(mae) else mae

    label, metrics = min(scores.items(), key=_mae_rank)
    return label, metrics[0], metrics[1]


In [36]:
def forecast_all_indicators_with_metrics(df, indicators=None, forecast_years=(2026, 2027, 2028)):
    """Forecast every indicator per neighbourhood, choosing a model per series.

    For each neighbourhood that passes get_valid_neighbourhoods, the model
    with the lowest rolling MAE on that series (best_model_with_metrics) is
    used, and its MAE and RMSE are reported next to every forecast row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Districts & Neigbourhoods", "Year" and the indicator columns.
    indicators : iterable of str, optional
        Indicator columns to forecast. Defaults to the module-level INDICATORS.
    forecast_years : sequence of int, default (2026, 2027, 2028)
        Year labels for the forecast steps; its length is the forecast horizon.

    Returns
    -------
    pandas.DataFrame
        Columns "Neighbourhood", "Indicator", "Year", "Forecast", "Actual",
        "MAE", "RMSE", "Model". "Actual" is always NaN.
    """
    if indicators is None:
        indicators = INDICATORS

    forecast_years = list(forecast_years)
    # Derive the horizon from the labels so the two cannot drift apart.
    horizon = len(forecast_years)

    rows = []

    for indicator in indicators:

        valid_neigh = get_valid_neighbourhoods(df, indicator)
        sub = df[df["Districts & Neigbourhoods"].isin(valid_neigh)]

        for name, g in sub.groupby("Districts & Neigbourhoods"):

            series = g.sort_values("Year").set_index("Year")[indicator]

            model, mae, rmse = best_model_with_metrics(series)

            pred = forecast_series(series, model, horizon)

            # NOTE(review): labels are applied positionally. A series whose last
            # observation is not the year before forecast_years[0] still gets
            # these labels - confirm that is intended.
            for yr, val in zip(forecast_years, pred):

                rows.append([
                    name,
                    indicator,
                    yr,
                    val,
                    np.nan,     # actual future unknown
                    mae,
                    rmse,
                    model
                ])

    return pd.DataFrame(rows, columns=[
        "Neighbourhood","Indicator","Year",
        "Forecast","Actual","MAE","RMSE","Model"
    ])


In [37]:
# Rebuild forecast_demographic with a model chosen per series and its error metrics.
# This replaces whatever the name forecast_demographic held before.
forecast_demographic = forecast_all_indicators_with_metrics(demographics)

forecast_demographic.head()


Unnamed: 0,Neighbourhood,Indicator,Year,Forecast,Actual,MAE,RMSE,Model
0,AMC,Total population,2026,31.521975,,3.333333,4.082483,EXP
1,AMC,Total population,2027,31.521975,,3.333333,4.082483,EXP
2,AMC,Total population,2028,31.521975,,3.333333,4.082483,EXP
3,Aalsmeerwegbuurt Oost,Total population,2026,2165.0,,0.0,0.0,EXP
4,Aalsmeerwegbuurt Oost,Total population,2027,2165.0,,0.0,0.0,EXP


In [38]:
# Drop the "Actual" column before export. errors="ignore" keeps this cell
# re-runnable: without it a second run raises KeyError because the column
# is already gone.
forecast_demographic = forecast_demographic.drop(columns=["Actual"], errors="ignore")


In [39]:
# Write the forecast table to CSV without the row index (path is relative to the working directory).
forecast_demographic.to_csv("../TRANSFORMATION/forecast_demographic.csv", index=False)