# Regresja liniowa

In [39]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.regression.quantile_regression import QuantReg

In [40]:
# Load the cleaned panel data.
# The file is UTF-8 with a byte-order mark (BOM). Decoding it as latin1 turned
# the BOM into the three junk characters "ï»¿" glued to the first header (which
# then had to be renamed by hand) and would also mis-decode any non-ASCII text
# in the data. "utf-8-sig" decodes the file correctly and strips the BOM, so
# the first column arrives already named "Kraj" and no rename is needed.
clean_data = pd.read_csv("clean_data.csv", encoding="utf-8-sig")

In [41]:
# Preview the first five rows as a quick sanity check of the loaded frame.
clean_data.head(n=5)

Unnamed: 0,Kraj,Rok,Stopa_bezrobocia,Edukacja,Dlugosc_zycia,Zdrowie,Inflacja,Gini,Wspolczynnik_urbanizacji,Skolaryzacja,Rolnictwo,Dzieci_0-14,p0p50,p90p100,p99p100,PKB_pc,Indeks_nierow,Kontynent
0,Albania,1996,13.928,3.08351,74.113,,12.725478,27.0,38.899604,,33.561661,31.123281,0.1883,0.3256,0.0814,6741.8751,0.432289,Europe
1,Albania,2002,17.891,3.1178,75.299,5.664645,7.770526,31.7,43.152042,110.338921,19.763916,28.284316,0.1803,0.3362,0.0862,9001.8889,0.478092,Europe
2,Albania,2005,15.966,3.28155,76.427,5.729545,2.366582,30.6,45.993767,107.016144,17.124718,25.870379,0.1846,0.3311,0.0842,10336.2767,0.456121,Europe
3,Albania,2008,13.06,,78.248,5.509003,3.320871,30.0,49.38507,100.383003,15.4221,23.363673,0.186,0.3306,0.0857,12396.2443,0.460753,Europe
4,Albania,2012,13.376,3.29534,78.084,6.157462,2.031593,29.0,53.863755,102.067361,17.591369,20.292086,0.1903,0.3301,0.0855,13472.0161,0.449291,Europe


In [42]:
# Convert Gini from the 0-100 scale used in the source file to a 0-1 fraction.
# The column is overwritten in place, so an unguarded division would shrink the
# values again every time this cell is re-run. The guard makes the cell
# idempotent: once rescaled, no value exceeds 1 and the division is skipped.
if clean_data['Gini'].max() > 1:
    clean_data['Gini'] = clean_data['Gini'] / 100

In [43]:
# Natural log of GDP per capita (the standard transformation in the literature).
clean_data['ln_pkb_pc'] = np.log(clean_data['PKB_pc'])

# Columns carried into the modelling frame: the country key, the two
# dependent variables, and then the explanatory variables.
vars_model = [
    'Kraj',
    'Indeks_nierow', 'Gini',
    'Zdrowie', 'Edukacja', 'Stopa_bezrobocia', 'Dlugosc_zycia', 'Inflacja',
    'ln_pkb_pc', 'Wspolczynnik_urbanizacji', 'Skolaryzacja', 'Rolnictwo',
    'Dzieci_0-14',
]

# Listwise deletion: keep only observations complete on every model variable.
data_model = clean_data.loc[:, vars_model].dropna()


In [44]:
# Pooled OLS of the Gini coefficient on the explanatory variables.
Y = data_model['Gini']

# Design matrix: the ten explanatory variables plus an intercept column.
X = sm.add_constant(
    data_model.loc[:, [
        'Zdrowie', 'Edukacja', 'Stopa_bezrobocia', 'Dlugosc_zycia',
        'Inflacja', 'ln_pkb_pc', 'Wspolczynnik_urbanizacji',
        'Skolaryzacja', 'Rolnictwo', 'Dzieci_0-14',
    ]]
)

ols_model = sm.OLS(Y, X).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Gini   R-squared:                       0.547
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                     149.3
Date:                Sun, 04 Jan 2026   Prob (F-statistic):          1.17e-204
Time:                        20:08:06   Log-Likelihood:                 1888.3
No. Observations:                1249   AIC:                            -3755.
Df Residuals:                    1238   BIC:                            -3698.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [45]:
# Pooled OLS of the inequality index (Indeks_nierow) on the same explanatory
# variables as the Gini model.
Y = data_model['Indeks_nierow']

# Design matrix: the ten explanatory variables plus an intercept column.
X = sm.add_constant(
    data_model.loc[:, [
        'Zdrowie', 'Edukacja', 'Stopa_bezrobocia', 'Dlugosc_zycia',
        'Inflacja', 'ln_pkb_pc', 'Wspolczynnik_urbanizacji',
        'Skolaryzacja', 'Rolnictwo', 'Dzieci_0-14',
    ]]
)

ols_model = sm.OLS(Y, X).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:          Indeks_nierow   R-squared:                       0.471
Model:                            OLS   Adj. R-squared:                  0.467
Method:                 Least Squares   F-statistic:                     110.3
Date:                Sun, 04 Jan 2026   Prob (F-statistic):          1.75e-163
Time:                        20:08:06   Log-Likelihood:                -1304.3
No. Observations:                1249   AIC:                             2631.
Df Residuals:                    1238   BIC:                             2687.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [46]:
# Mean GDP per capita for each country across all of its observations.
country_pkb = clean_data.groupby('Kraj', as_index=False)['PKB_pc'].mean()

# Split countries into three equal-sized groups (terciles) by that mean;
# the labels read: low / middle / high.
country_pkb['tercyl_pkb'] = pd.qcut(
    country_pkb['PKB_pc'], 3, labels=['niski', 'średni', 'wysoki']
)



In [47]:
# Attach each country's GDP tercile to the modelling frame.
# data_model is reassigned from itself, so the cell must be safe to re-run:
# any 'tercyl_pkb' column left by a previous run is dropped first. Without
# that, a second merge would produce 'tercyl_pkb_x' / 'tercyl_pkb_y' and the
# later per-tercile loops would fail with a KeyError on 'tercyl_pkb'.
# validate="many_to_one" raises if country_pkb ever has a duplicated country,
# which would otherwise silently duplicate observations.
data_model = (
    data_model
    .drop(columns='tercyl_pkb', errors='ignore')
    .merge(
        country_pkb[['Kraj', 'tercyl_pkb']],
        on='Kraj',
        how='left',
        validate='many_to_one',
    )
)


In [52]:
# Re-estimate the Gini regression separately within each GDP tercile.
for tercyl in ('niski', 'średni', 'wysoki'):
    # Observations belonging to countries in the current tercile.
    subset = data_model.loc[data_model['tercyl_pkb'].eq(tercyl)]

    Y = subset['Gini']
    # Design matrix: the ten explanatory variables plus an intercept column.
    X = sm.add_constant(
        subset.loc[:, [
            'Zdrowie', 'Edukacja', 'Stopa_bezrobocia', 'Dlugosc_zycia',
            'Inflacja', 'ln_pkb_pc', 'Wspolczynnik_urbanizacji',
            'Skolaryzacja', 'Rolnictwo', 'Dzieci_0-14',
        ]]
    )
    model = sm.OLS(Y, X).fit()

    # Header line followed by the full regression table.
    print(f"\n===== Tercyl PKB: {tercyl.upper()} =====", model.summary(), sep="\n")



===== Tercyl PKB: NISKI =====
                            OLS Regression Results                            
Dep. Variable:                   Gini   R-squared:                       0.422
Model:                            OLS   Adj. R-squared:                  0.371
Method:                 Least Squares   F-statistic:                     8.322
Date:                Sun, 04 Jan 2026   Prob (F-statistic):           4.93e-10
Time:                        20:09:25   Log-Likelihood:                 193.86
No. Observations:                 125   AIC:                            -365.7
Df Residuals:                     114   BIC:                            -334.6
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------

In [54]:
# Re-estimate the inequality-index regression separately within each GDP tercile.
for tercyl in ('niski', 'średni', 'wysoki'):
    # Observations belonging to countries in the current tercile.
    subset = data_model.loc[data_model['tercyl_pkb'].eq(tercyl)]

    Y = subset['Indeks_nierow']
    # Design matrix: the ten explanatory variables plus an intercept column.
    X = sm.add_constant(
        subset.loc[:, [
            'Zdrowie', 'Edukacja', 'Stopa_bezrobocia', 'Dlugosc_zycia',
            'Inflacja', 'ln_pkb_pc', 'Wspolczynnik_urbanizacji',
            'Skolaryzacja', 'Rolnictwo', 'Dzieci_0-14',
        ]]
    )
    model = sm.OLS(Y, X).fit()

    # Header line followed by the full regression table.
    print(f"\n===== Tercyl PKB: {tercyl.upper()} =====", model.summary(), sep="\n")



===== Tercyl PKB: NISKI =====
                            OLS Regression Results                            
Dep. Variable:          Indeks_nierow   R-squared:                       0.280
Model:                            OLS   Adj. R-squared:                  0.217
Method:                 Least Squares   F-statistic:                     4.438
Date:                Sun, 04 Jan 2026   Prob (F-statistic):           2.81e-05
Time:                        20:17:31   Log-Likelihood:                -83.417
No. Observations:                 125   AIC:                             188.8
Df Residuals:                     114   BIC:                             219.9
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------