In [3]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.preprocessing import StandardScaler

# Step 1: Read the Reception (pre-Covid) CSV into a pandas DataFrame
data = pd.read_csv('Reception Pre Covid.csv')

# Steps 2-3: Sanity-check the load by printing the first rows, then the
# column index, so missing or misnamed columns are spotted early
for preview in (data.head(), data.columns):
    print(preview)

# Step 4: Create 'HealthCategory' column based on '% Healthy' (> 79: High, > 75: Medium, otherwise Low)
def categorize_health(row):
    """Return the ordinal health band for one row from its '% Healthy' value.

    Args:
        row: A mapping-like object with a '% Healthy' entry, e.g. the row
            passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        'High' when '% Healthy' is above 79, 'Medium' when it is above 75
        (up to and including 79), 'Low' for any other present value, and
        ``None`` when the value is missing.
    """
    pct_healthy = row['% Healthy']

    # NaN compares False against every threshold and would otherwise fall
    # through to 'Low'; report a missing measurement as missing instead.
    if pd.isna(pct_healthy):
        return None

    if pct_healthy > 79:
        return 'High'
    if pct_healthy > 75:
        return 'Medium'
    return 'Low'

# Derive each row's health band and store it as an ordered categorical
# (Low < Medium < High) so it can serve as an ordinal outcome.
data['HealthCategory'] = pd.Categorical(
    data.apply(categorize_health, axis=1),
    categories=['Low', 'Medium', 'High'],
    ordered=True,
)

# Step 5: Clean and preprocess the data
# Convert 'YEAR' to numeric first: unparseable values become NaN and are then
# removed by the dropna() below instead of being introduced after the clean-up.
data['YEAR'] = pd.to_numeric(data['YEAR'], errors='coerce')

# Drop rows with a missing value in any column
data = data.dropna()

# Step 6: Standardize the deprivation factors: Income, Employment, and Crime Deprivation
deprivation_columns = ['Income Deprivation', 'Employment Deprivation', 'Crime Deprivation']
scaler = StandardScaler()

# Fit the scaler on the cleaned rows and overwrite the columns with the scaled values
data[deprivation_columns] = scaler.fit_transform(data[deprivation_columns])

# Step 9: Specify the independent variables (natural assets) and the dependent variable (HealthCategory)
natural_assets_columns = [
    'Green cover',
    'Blue cover',
    'Open Space',
    'Local Parks',
    'District Parks',
    'Metropolitan Parks',
    'Regional Parks',
    'Percentage with private outdoor space',
    'Tree Cover',
]

# Combine independent variables: natural assets followed by the (already
# standardized) deprivation factors defined in Step 6
independent_vars = natural_assets_columns + deprivation_columns

# Step 10: Calculate Variance Inflation Factor (VIF)
def calculate_vif(df, independent_vars):
    """Compute a variance inflation factor for every design-matrix column.

    Args:
        df: DataFrame holding the predictor columns.
        independent_vars: Labels of the columns in ``df`` to include.

    Returns:
        DataFrame with a 'Variable' column (the column labels of the
        predictors after ``sm.add_constant``) and a 'VIF' column holding the
        factor computed for the column at the same position.
    """
    # Predictors are passed through add_constant before the VIF computation
    design = sm.add_constant(df[independent_vars])
    matrix = design.values

    # One factor per column position of the design matrix
    factors = [
        variance_inflation_factor(matrix, position)
        for position in range(matrix.shape[1])
    ]
    return pd.DataFrame({'Variable': design.columns, 'VIF': factors})

# Compute the VIF table for the chosen predictors and print it under a heading
vif_results = calculate_vif(df=data, independent_vars=independent_vars)
print("\nVariance Inflation Factor (VIF):", vif_results, sep="\n")

# Step 11: Ordinal Logistic Regression (using OrderedModel from statsmodels)
def ordinal_logistic_regression(df, dependent_var, independent_vars):
    """Fit an ordered-logit model of ``dependent_var`` on ``independent_vars``.

    Args:
        df: Source DataFrame; the model is built from a copy of it.
        dependent_var: Label of the outcome column.
        independent_vars: Labels of the predictor columns.

    Returns:
        The results object returned by ``OrderedModel.fit`` when called with
        ``method='bfgs'`` and ``disp=False``.
    """
    frame = df.copy()
    predictors, outcome = frame[independent_vars], frame[dependent_var]

    # distr='logit' selects the logistic link, i.e. ordinal logistic regression
    ordered_logit = OrderedModel(outcome, predictors, distr='logit')
    return ordered_logit.fit(method='bfgs', disp=False)

# Step 12: Fit the ordinal logit of HealthCategory on the combined predictors
result_reception = ordinal_logistic_regression(
    df=data,
    dependent_var='HealthCategory',
    independent_vars=independent_vars,
)

# Step 13: Print a heading followed by the fitted model's summary table
print("\nOrdinal Logistic Regression for Reception:")
print(result_reception.summary())


        CODE               BOROUGH  YEAR        AGE  % Healthy  % Unhealthy  \
0  E09000002  Barking and Dagenham  2013  Reception      72.20        27.80   
1  E09000003                Barnet  2013  Reception      77.68        22.32   
2  E09000004                Bexley  2013  Reception      72.11        27.89   
3  E09000005                 Brent  2013  Reception      71.03        28.97   
4  E09000006               Bromley  2013  Reception      78.01        21.99   

   Green cover  Blue cover  Open Space  Local Parks  District Parks  \
0        39.20        6.10        56.2         30.0            46.4   
1        57.99        0.69        33.4         34.3            42.3   
2        42.92        6.81        49.7         35.6            15.9   
3        39.94        1.03        29.5         40.4            28.9   
4        68.36        0.38        41.6         31.0            39.0   

   Metropolitan Parks  Regional Parks  Percentage with private outdoor space  \
0                6