In [51]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [52]:
# Step 1: Load the dataset into a pandas DataFrame
data = pd.read_csv('Year 6 Post Covid.csv')

# Step 2: Display the first few rows of the dataset to verify its contents
print(data.head())

        CODE               BOROUGH  YEAR     AGE  % Healthy  % Unhealthy  \
0  E09000002  Barking and Dagenham  2021  Year 6       49.9         50.0   
1  E09000003                Barnet  2021  Year 6       63.1         36.9   
2  E09000004                Bexley  2021  Year 6       57.8         42.2   
3  E09000005                 Brent  2021  Year 6       56.8         43.3   
4  E09000006               Bromley  2021  Year 6       64.7         35.3   

   Green cover  Blue cover  Open Space  Local Parks  District Parks  \
0        35.34        6.10        56.2         30.0            46.4   
1        56.38        0.69        33.4         34.3            42.3   
2        49.04        6.81        49.7         35.6            15.9   
3        36.94        1.03        29.5         40.4            28.9   
4        73.65        0.38        41.6         31.0            39.0   

   Metropolitan Parks  Regional Parks  Percentage with private outdoor space  \
0                62.6             0.

In [53]:
# Step 3: Check the columns in the dataset to ensure all necessary columns are present
print(data.columns)

# Step 4: Create 'HealthCategory' column based on '% Healthy' and '% Unhealthy'
def categorize_health(row):
    if row['% Healthy'] > 63:
        return 'High'
    elif row['% Healthy'] > 58:
        return 'Medium'
    else:
        return 'Low'

data['HealthCategory'] = data.apply(categorize_health, axis=1)

# Ensure 'HealthCategory' is in a categorical format
data['HealthCategory'] = data['HealthCategory'].astype('category')

Index(['CODE', 'BOROUGH', 'YEAR', 'AGE', '% Healthy', '% Unhealthy',
       'Green cover', 'Blue cover', 'Open Space', 'Local Parks',
       'District Parks', 'Metropolitan Parks', 'Regional Parks',
       'Percentage with private outdoor space', 'Tree Cover',
       'Income Deprivation', 'Employment Deprivation', 'Crime Deprivation'],
      dtype='object')


In [54]:
# Step 5: Clean and preprocess the data
# Drop rows with missing values (if needed)
data = data.dropna()

# Convert 'YEAR' to numeric, handling errors if necessary
data['YEAR'] = pd.to_numeric(data['YEAR'], errors='coerce')

# Step 6: Standardize the deprivation factors: Income, Employment, and Crime Deprivation
deprivation_columns = ['Income Deprivation', 'Employment Deprivation', 'Crime Deprivation']
scaler = StandardScaler()

# Standardizing the deprivation columns
data[deprivation_columns] = scaler.fit_transform(data[deprivation_columns])

In [55]:
# Step 7: Specify the independent variables (natural assets) and the dependent variable (HealthCategory)
natural_assets_columns = ['Green cover', 'Blue cover', 'Open Space', 'Local Parks', 'District Parks', 
                          'Metropolitan Parks', 'Regional Parks', 'Percentage with private outdoor space', 'Tree Cover']
deprivation_columns = ['Income Deprivation', 'Employment Deprivation', 'Crime Deprivation']

# Combine independent variables
independent_vars = natural_assets_columns + deprivation_columns

In [57]:
# Step 8: Ordinal Logistic Regression (using OrderedModel from statsmodels)
def ordinal_logistic_regression(df, dependent_var, independent_vars):
    # Ensure you're modifying a copy of the DataFrame to avoid SettingWithCopyWarning
    df_copy = df.copy()  # Create a copy of the DataFrame
    
    # Select independent variables
    X = df_copy[independent_vars]
    y = df_copy[dependent_var]
    
    # Fit an ordinal logistic regression model using OrderedModel
    model = OrderedModel(y, X, distr='logit')  # Logistic distribution for ordinal logistic regression
    result = model.fit(method='bfgs', disp=False)
    return result

In [58]:
# Step 9: Run ordinal logistic regression
result_reception = ordinal_logistic_regression(data, 'HealthCategory', independent_vars)



In [50]:
# Step 10: Print the regression results
print("Ordinal Logistic Regression for Reception:")
print(result_reception.summary())

Ordinal Logistic Regression for Reception:
                             OrderedModel Results                             
Dep. Variable:         HealthCategory   Log-Likelihood:                -88.223
Model:                   OrderedModel   AIC:                             204.4
Method:            Maximum Likelihood   BIC:                             240.3
Date:                Fri, 17 Jan 2025                                         
Time:                        16:16:42                                         
No. Observations:                  96                                         
Df Residuals:                      82                                         
Df Model:                          12                                         
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Green cover                       