In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the monthly measurements and build the analysis frame `df`.
monthly_raw = pd.read_csv("podatki/df_mesecne.csv", parse_dates=["Datum"])

# Strict inequalities against partial date strings: pandas reads "2002" as
# 2002-01-01 00:00, so a row stamped exactly at that instant is excluded.
# NOTE(review): if monthly rows are stamped on the first of the month this
# drops January 2002 - confirm that is intended.
in_study_window = (monthly_raw["Datum"] > "2002") & (monthly_raw["Datum"] < "2024")

# Build a fresh frame in one chain instead of assigning columns onto a
# boolean-filtered slice (which raises SettingWithCopyWarning) and mutating
# it in place; re-running the cell always starts again from the CSV.
df = (
    monthly_raw.loc[in_study_window]
    .assign(
        Year=lambda frame: frame["Datum"].dt.year,
        Month=lambda frame: frame["Datum"].dt.month,
    )
    .drop(columns=["Datum", "Postaja"])
)

# Calendar months assigned to each season (LETO = 3..10, ZIMA = 11, 12, 1, 2).
LETO = [3, 4, 5, 6, 7, 8, 9, 10]
ZIMA = [11, 12, 1, 2]
# Pollutant columns that are averaged and later used as regressors.
POLLUTANTS = ["SO2", "PM10", "NO2"]


def seasonal_average(df: pd.DataFrame, pollutants=None, include_seasons: bool = False):
    """Average each pollutant per region and year.

    The result has one row per ("Regija", "Year") pair present in ``df`` and
    one column per pollutant holding the mean of that pair's monthly values.
    Rows where the pollutant or "Month" is missing are ignored; a pair with
    no usable value gets NaN.

    This equals the count-weighted average of per-month means, because
    sum(n_m * mean_m) / sum(n_m) is the plain mean of the pooled values, so a
    single grouped mean replaces the per-group Python loop.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "Regija", "Year", "Month" and one column per pollutant.
    pollutants : sequence of str, optional
        Pollutant columns to average. Defaults to the module-level
        ``POLLUTANTS`` list.
    include_seasons : bool, default False
        When True, additionally emit "<pollutant>_l" and "<pollutant>_z":
        the same mean restricted to months listed in the module-level
        ``LETO`` and ``ZIMA`` respectively.

    Returns
    -------
    pd.DataFrame
        Columns "Regija", "Year", then the averages in pollutant order,
        sorted by region and year with a fresh integer index.

    Raises
    ------
    ValueError
        If ``pollutants`` is given but empty.
    """
    if pollutants is None:
        pollutants = POLLUTANTS
    pollutants = list(pollutants)
    if not pollutants:
        raise ValueError("at least one pollutant column is required")

    month = df["Month"]
    has_month = month.notna()

    # Blank out (NaN) every value that must not contribute; mean() skips NaN,
    # so masking is equivalent to dropping those rows within each group.
    value_columns = {}
    for pollutant in pollutants:
        valid = df[pollutant].where(has_month)
        if include_seasons:
            value_columns[f"{pollutant}_l"] = valid.where(month.isin(LETO))
            value_columns[f"{pollutant}_z"] = valid.where(month.isin(ZIMA))
        value_columns[pollutant] = valid

    masked = df[["Regija", "Year"]].assign(**value_columns)

    # One grouped mean for all columns; groups whose values are all NaN are
    # kept and reported as NaN.
    return masked.groupby(["Regija", "Year"], as_index=False).mean()


# Per-region yearly pollutant averages, ordered by region and then year.
season_df = seasonal_average(df).sort_values(by=["Regija", "Year"])

In [None]:
# Mortality table in wide form: a "Regija" column plus one column per year.
mrd_df_wide = pd.read_csv("podatki/umrljivost/umrljivost_regije.csv")

# Reshape to long form (one row per region and year). Year labels come from
# the column headers, so convert them to numbers; anything that does not
# parse becomes NaN.
mrd_df = mrd_df_wide.melt(
    id_vars="Regija", var_name="Year", value_name="MRD"
).assign(Year=lambda long_df: pd.to_numeric(long_df["Year"], errors="coerce"))

# Default (inner) merge: only region/year pairs present in both tables survive.
panel_df = pd.merge(season_df, mrd_df, on=["Regija", "Year"]).sort_values(
    by=["Regija", "Year"]
)

# Persist the merged panel without the index column.
panel_df.to_csv("podatki/panel_df.csv", index=False)

In [None]:
import statsmodels.formula.api as sm 
from statsmodels.iolib.summary2 import summary_col
# Two-way fixed-effects regression of MRD on pollutant levels lagged by two
# years, with interaction terms that switch on from each NECD threshold year.

# sort_values returns a new frame, so panel_df itself is left untouched.
df = panel_df.sort_values(by=["Regija", "Year"])

# shift(2) moves values two ROWS within a region, which is a two-year lag only
# when the row two places earlier is exactly Year - 2. Lag BEFORE dropping
# incomplete rows and blank the lag wherever a year is missing, so a gap can
# never pair an observation with data from three or more years earlier.
is_two_year_lag = df.groupby("Regija")["Year"].shift(2) == df["Year"] - 2
for pollutant in ("SO2", "PM10", "NO2"):
    lagged = df.groupby("Regija")[pollutant].shift(2)
    df[f"{pollutant}_lag2"] = lagged.where(is_two_year_lag)

# Keep only rows where the outcome, the pollutants and all lags are present.
df = df.dropna().copy()

# Post-threshold dummies. NECD_2010 previously reused the 2005 cut-off, which
# made it an exact duplicate of NECD_2005.
df["NECD_2005"] = np.where(df["Year"] >= 2005, 1, 0)
df["NECD_2010"] = np.where(df["Year"] >= 2010, 1, 0)

# Interactions: SO2 and PM10 use the 2005 dummy, NO2 uses the 2010 dummy.
df["SO2_lag2_NECD"] = df["SO2_lag2"] * df["NECD_2005"]
df["PM10_lag2_NECD"] = df["PM10_lag2"] * df["NECD_2005"]
df["NO2_lag2_NECD"] = df["NO2_lag2"] * df["NECD_2010"]

# C(Regija) and C(Year) add region and year fixed effects as dummy sets.
formula = 'MRD ~ SO2_lag2 + PM10_lag2 + NO2_lag2 + SO2_lag2_NECD + PM10_lag2_NECD + NO2_lag2_NECD + C(Regija) + C(Year)'

# Same specification fitted twice: default standard errors, then standard
# errors clustered by region.
model_fe = sm.ols(formula, data=df).fit()
print(model_fe.summary())

model_fe_clustered = sm.ols(formula, data=df).fit(
    cov_type='cluster', cov_kwds={'groups': df['Regija']}
)
print(model_fe_clustered.summary())

# Extra fit statistics appended below the coefficient rows.
info_dict = {
    'R-squared': lambda x: f"{x.rsquared:.2f}",
    'Adj. R-squared': lambda x: f"{x.rsquared_adj:.2f}",
    'No. observations': lambda x: f"{int(x.nobs):d}",
}

# Side-by-side comparison of the two fits.
results_table = summary_col(
    results=[model_fe, model_fe_clustered],
    float_format='%0.2f',
    stars=True,
    info_dict=info_dict,
    regressor_order=[
        'SO2_lag2',
        'PM10_lag2',
        'NO2_lag2',
        'SO2_lag2_NECD',
        'PM10_lag2_NECD',
        'NO2_lag2_NECD',
        'C(Regija)',
        'C(Year)',
    ],
)

results_table.add_title('Fixed Effects Results (2-Year Lags)')

print(results_table)
