In [1]:
import sys
sys.path.append('../src')
import json
import pandas as pd
from src.predictionHelper import add_predictions_gauss_regr
from src.mapCreator import filter_data_by_geometry
from src.mainController import MainController
from src.MapTypes import MapTypes
import statsmodels.api as sm

# Load the JSON data from a file
with open('../data/db.json') as f:
    json_data = json.load(f)

state_names = ['Baden-Württemberg', 'Bavaria', 'Berlin', 'Brandenburg', 'Bremen', 'Hamburg', 'North Rhine-Westphalia']

# Create a dictionary that maps each state to a unique ID
state_to_id = {state: i for i, state in enumerate(state_names)}


chosen_states = ["Baden-Württemberg", "Bavaria"]
special_ids = None

# Get the IDs of the selected states
chosen_states_ids = [state_to_id[state] for state in chosen_states]

# Filter data using GeoPandas
gdf = filter_data_by_geometry(json_data, chosen_states_ids, forexp=True)

# seems like mainController.setData(gdf, MapTypes.Gauss, special_ids) does not work
# run add_predictions_gauss_regr on the data to add column 'uncertainty'
mainController = MainController()
mainController.setData(gdf, MapTypes.Gauss, special_ids)
filtered_data = add_predictions_gauss_regr(mainController.dto.gdf)

# Initialize 'isMeasured' with a default value
filtered_data['isMeasured'] = None  

# Use boolean indexing to set values in 'isMeasured' column
filtered_data.loc[filtered_data['all_measurements'].isnull() | (filtered_data['all_measurements'] == 0), 'isMeasured'] = 1
filtered_data.loc[filtered_data['all_measurements'].notnull() & (filtered_data['all_measurements'] != 0), 'isMeasured'] = 0

# Convert 'isMeasured' to numeric (if it's not already)
filtered_data['isMeasured'] = pd.to_numeric(filtered_data['isMeasured'], errors='coerce')

# Add a constant term for the logistic regression
filtered_data['const'] = 1
X = filtered_data[['const', 'all_stability']]
y = filtered_data['isMeasured']

# Logistic regression model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit()

# Display the summary
print(result.summary())



Optimization terminated successfully.
         Current function value: 0.639165
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             isMeasured   No. Observations:                 6795
Model:                          Logit   Df Residuals:                     6793
Method:                           MLE   Df Model:                            1
Date:                Tue, 23 Jan 2024   Pseudo R-squ.:                 0.02909
Time:                        19:53:51   Log-Likelihood:                -4343.1
converged:                       True   LL-Null:                       -4473.2
Covariance Type:            nonrobust   LLR p-value:                 1.523e-58
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             2.6934      0.233     11.575      0.000       2.237       3.149
all_stability    -3.

In [2]:
# Null hypothesis (H₀): There is no significant association between the presence of a measurement and stability greater than 0.5.
# Alternative hypothesis (H₁): There is a significant association between the presence of a measurement and stability greater than 0.5.


# Creating binary variables 'Measured' and 'High_Stability'.
filtered_data['Measured'] = (filtered_data['all_measurements'] > 0).astype(int)  # 1 for measurement, 0 for no measurement
filtered_data['High_Stability'] = (filtered_data['all_stability'] > 0.5).astype(int)  # 1 for high_stability > 0.5, 0 otherwise

# Adding a constant term for logistic regression.
filtered_data['const'] = 1

# Setting up independent variables 'X' (constant and 'Measured') and dependent variable 'y' ('High_Stability').
X = filtered_data[['const', 'Measured']]
y = filtered_data['High_Stability']

# Fitting logistic regression model using Statsmodels.
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Printing summary of logistic regression results.
print(result.summary())

# Concluding that the presence of a measurement ("Measured") is significantly associated with higher likelihood of high stability (>0.5).

Optimization terminated successfully.
         Current function value: 0.089381
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:         High_Stability   No. Observations:                 6795
Model:                          Logit   Df Residuals:                     6793
Method:                           MLE   Df Model:                            1
Date:                Tue, 23 Jan 2024   Pseudo R-squ.:                 0.02560
Time:                        19:53:51   Log-Likelihood:                -607.34
converged:                       True   LL-Null:                       -623.30
Covariance Type:            nonrobust   LLR p-value:                 1.618e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.4514      0.116     29.817      0.000       3.225       3.678
Measured       1.0299      0.