In [1]:
import pandas as pd
import numpy as np

from edinburgh_challenge.simulation import *
from edinburgh_challenge.processing import *

# sklearn
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Firstly combine the two versions of estates df
# To get the information about finances while retaining
# the new Deployment data
estates_df = load_estates_df("data/Estate_Informationv3.xlsx")

In [3]:
# Get in use police stations only
# 
#
police_station_filter = estates_df["Property Classification"] == "Police Station"
local_policing_filter = estates_df["Primary Classification (Use)"] == "Local Policing"
deployment_filter = estates_df["Deployment station? Y/N"] == "Yes"
estates_df = estates_df[police_station_filter & local_policing_filter & deployment_filter]



In [4]:
# Cleaning column - Condition Grade
estates_df['Condition Grade'] = estates_df['Condition Grade'].replace(
    ['Not Surveyed', 'Not yet undertaken', 'Not surveyed', 'No grade available', np.nan],
    np.nan
)

# Cleaning column - Ownership
estates_df["Ownership"] = estates_df["Ownership"].replace(
['Leased In', 'Leased in'], "Leased In")

# Cleaning the Property Size column
estates_df["Property Size GIA sq.m"] = estates_df["Property Size GIA sq.m"].replace(
['TBC', 'Variable'], np.nan)

# Cleaning column - Year of Construction
def clean_year_value(year):
    if pd.isna(year) or year in ['Unknown', 'No survey available', 'no info.']:
        return np.nan
    elif year == "1900s / 1995":
        year = 1995
    elif isinstance(year, str):
        year = year.lower().strip()
        if 'circa' in year or 'c.' in year:
            year = year.replace('circa', '').replace('c.', '').replace("'", '').replace('s', '').strip()
        if 'refurbished' in year:
            year = year.split(' ')[-1]  # Take the year after 'refurbished'
        if '/' in year:
            parts = year.split('/')
            year = str(int(sum(int(part.strip()) for part in parts) / len(parts)))  # Average the years
        if 'post' in year.lower():
            year = '1945'
        if 's' in year:
            year = year.replace('s', '')
        year = year.replace('+', '').replace("'", '').replace(' ', '')
    try:
        return int(year)
    except ValueError:
        return np.nan

# Year
estates_df["Year of Construction"] = estates_df["Year of Construction"].apply(clean_year_value)



#no_condition_filter = estates_df["Condition Grade"].isna()
#estates_df.loc[no_condition_filter]

In [5]:
estates_df[estates_df["Ownership"].isna()]

Unnamed: 0,Activity code,Property Classification,Record Level,Division Name,Division,Unitary Authority,Site Name,Property Name,X (Easting),Y (Northing),...,Mountain Rescue (Yes / No),Dogs / Horses (Yes / No),Fleet workshop (Yes / No),Mortuary (Yes / No),Stores (Yes / No),Critical IT infrastructure (Yes / No),Accommodation (Yes / No),Intelligence Support (Yes / No),Number of Parking Spaces,EV Infrastructure


In [6]:
estates_df.head()

Unnamed: 0,Activity code,Property Classification,Record Level,Division Name,Division,Unitary Authority,Site Name,Property Name,X (Easting),Y (Northing),...,Mountain Rescue (Yes / No),Dogs / Horses (Yes / No),Fleet workshop (Yes / No),Mortuary (Yes / No),Stores (Yes / No),Critical IT infrastructure (Yes / No),Accommodation (Yes / No),Intelligence Support (Yes / No),Number of Parking Spaces,EV Infrastructure
2,P0926,Police Station,Building,Greater Glasgow,G,Glasgow City Council,"Baird Street, Glasgow",Baird Street Police station,260122,666304,...,No,No,No,No,No,,No,Yes,113.0,4
5,P1302,Police Station,Building,Greater Glasgow,G,East Renfrewshire Council,Barrhead,Barrhead Police station,250255,658880,...,No,No,No,No,No,,No,No,14.0,0
7,P0902,Police Station,Building,Greater Glasgow,G,East Dunbartonshire Council,Bishopbriggs,Bishopbriggs Police station,260903,670097,...,No,No,No,No,No,,No,No,12.0,0
12,P0925,Police Station,Building,Greater Glasgow,G,Glasgow City Council,"Cathcart, Glasgow",Cathcart Police station (stewart street),259277,661816,...,No,No,No,No,No,,No,Yes,70.0,4
16,P0905,Police Station,Building,Greater Glasgow,G,Glasgow City Council,Cowcaddens,Glasgow City Centre Police station,258808,666230,...,No,No,No,No,No,,No,No,58.0,3


In [7]:
# Encoding columns
def create_mappings(df, column):
    unique_values = pd.Series(df[column].unique())
    unique_values = unique_values[unique_values.notna()].sort_values()
    encoding = {value: idx for idx, value in enumerate(unique_values, start=1)}  # Start from 1 for easier decoding
    decoding = {idx: value for value, idx in encoding.items()}
    # Explicitly include NaN handling
    encoding[np.nan] = np.nan
    decoding[np.nan] = np.nan
    return encoding, decoding




# Creating Mappings
condition_encoding, condition_decoding = create_mappings(estates_df, 'Condition Grade')
classification_encoding, classification_decoding = create_mappings(estates_df, 'Primary Classification (Use)')
year_encoding, year_decoding = create_mappings(estates_df, 'Year of Construction')
division_encoding, division_decoding = create_mappings(estates_df, 'Division')
location_encoding, location_decoding = create_mappings(estates_df, 'Location Category (A=Urban, B=Rural, C=Remote, D=Islands)')
ownership_encoding, ownership_decoding = create_mappings(estates_df, 'Ownership')
building_encoding, building_decoding = create_mappings(estates_df, 'Record Level')


def encode_df(estates_df):
    estates_df_encoded = estates_df.copy()
    estates_df_encoded["Condition Grade"] = estates_df_encoded["Condition Grade"].map(condition_encoding)
    estates_df_encoded["Primary Classification (Use)"] = estates_df_encoded["Primary Classification (Use)"].map(classification_encoding)
    estates_df_encoded["Year of Construction"] = estates_df_encoded["Year of Construction"].map(year_encoding)
    estates_df_encoded["Division"] = estates_df_encoded["Division"].map(division_encoding)
    estates_df_encoded["Location Category (A=Urban, B=Rural, C=Remote, D=Islands)"] = estates_df_encoded["Location Category (A=Urban, B=Rural, C=Remote, D=Islands)"].map(location_encoding)
    estates_df_encoded["Ownership"] = estates_df_encoded["Ownership"].map(ownership_encoding)
    estates_df_encoded["Record Level"] = estates_df_encoded["Ownership"].map(building_encoding)
    return estates_df_encoded

In [8]:
decoding_dicts = {
    "Property Size GIA sq.m": None,
    "Primary Classification (Use)":classification_decoding,
    "Ownership":ownership_decoding,
    "Year of Construction":year_decoding,
    "Division":division_decoding,
    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)":location_decoding, 
    "Condition Grade":condition_decoding
}

#decoding_dicts = [
#    None,
#    classification_decoding,
#    ownership_decoding,
#    year_decoding,
#    division_decoding,
#    location_decoding,
#    condition_decoding
#]


def categorical_imputation(df, column_name, columns_for_imputation, thresh=1):
    """
    Imputes missing categorical values in a specified column using KNN classifier.
    
    Args:
    - df (DataFrame): Input DataFrame containing categorical columns with missing values.
    - column_name (str): Name of the column to impute.
    - columns_for_imputation (list): List of columns to consider for imputation, including column_name.
    
    Returns:
    - DataFrame: DataFrame with imputed values.
    """
    # Make a copy of the original DataFrame
    df_copy = encode_df(df).copy()
    df_master = encode_df(df).copy()
    df_copy = df_copy[columns_for_imputation]
    
    
    X_cols = [c for c in columns_for_imputation if c != column_name]
    
    # Calculate the number of nans in the imputation column
    print(f"No. of empty columns = {df_copy[column_name].isnull().sum()}")
    
    # Filter rows where more than one column is missing (other than column_name)
    df_filtered = df_copy.dropna(subset=X_cols, thresh=thresh)
    
    if df_filtered.empty:
        print(f"No valid rows found for imputation in DataFrame. Returning original DataFrame.")
        return df_copy
    
    # Separate data into features (X) and target (y)
    X = df_filtered.dropna()[X_cols]  # Features (without target column and without NaNs)
    y = df_filtered.dropna()[column_name]  # Target (non-NaN values of target column)
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize KNeighborsClassifier
    knn_imputer = KNeighborsClassifier(n_neighbors=6)
    
    # Fit KNN model on training data
    knn_imputer.fit(X_train, y_train)
    
    # Evaluate accuracy on testing data
    y_pred = knn_imputer.predict(X_test)
    accuracy = accuracy_score(list(y_test.values), y_pred)
    print(f"Accuracy of KNN classifier: {accuracy:.2f}")
    
    # Predict missing values in testing data
    X_missing = df_copy.loc[df_copy[column_name].isnull(), X_cols]
    
    # Loop through each row in X_missing
    imputed_values = []
    for index, row in X_missing.iterrows():
        # Check if more than one column is NaN in the row
        if row.isnull().sum() > 0:
            continue  # Skip imputation for this row
        
        # Predict and save imputed value
        imputed_value = knn_imputer.predict([row])[0]
        imputed_values.append(imputed_value)
        
        # Assign imputed value back to df_copy
        df_master.loc[index, column_name] = imputed_value
    
    # Decoding
    def decode_column(df, column, decoding_dict):
        return df[column].map(decoding_dict)


    for col in decoding_dicts.keys():
        d = decoding_dicts[col]
        if d is None:
            continue
        df_master[col] = decode_column(df_master, col, d)



    return df_master

In [9]:
#columns_for_imputation = [
#    "Ownership",
#    "Record Level"
#]

# Imputing Condition Grade

columns_for_imputation = [
    "Year of Construction",
    "Primary Classification (Use)",
    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)", 
    "Condition Grade"
]

estates_imputed_df = categorical_imputation(estates_df, "Condition Grade", columns_for_imputation)

No. of empty columns = 7
Accuracy of KNN classifier: 0.85




In [10]:
# Imputing 
estates_imputed_df[estates_imputed_df["Condition Grade"].isnull()]

Unnamed: 0,Activity code,Property Classification,Record Level,Division Name,Division,Unitary Authority,Site Name,Property Name,X (Easting),Y (Northing),...,Mountain Rescue (Yes / No),Dogs / Horses (Yes / No),Fleet workshop (Yes / No),Mortuary (Yes / No),Stores (Yes / No),Critical IT infrastructure (Yes / No),Accommodation (Yes / No),Intelligence Support (Yes / No),Number of Parking Spaces,EV Infrastructure


In [11]:
estates_imputed_df.to_csv("data/Estate_Informationv3_imputed.csv")

In [11]:
estates_imputed_df[estates_imputed_df["Division"] == "U"]["Year of Construction"].mean()

1970.375

In [12]:
# Imputing 

#columns_for_imputation = [
#    "Property Size GIA sq.m",
#    "Primary Classification (Use)",
#    "Ownership",
#    "Year of Construction",
#    "Division",
#    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)", 
#    "Condition Grade"
#]

columns_for_imputation = [
    "Year of Construction",
    "Primary Classification (Use)",
    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)", 
    "Condition Grade",
    "Division"
]

estates_imputed_df = categorical_imputation(estates_df, "Primary Classification (Use)", columns_for_imputation)

No. of empty columns = 0
Accuracy of KNN classifier: 1.00


In [13]:
estates_imputed_df.isnull().any()

Activity code                                                False
Property Classification                                      False
Record Level                                                  True
Division Name                                                False
Division                                                     False
Unitary Authority                                            False
Site Name                                                    False
Property Name                                                False
X (Easting)                                                  False
Y (Northing)                                                 False
Location Category (A=Urban, B=Rural, C=Remote, D=Islands)    False
Property Size GIA sq.m                                       False
Primary Classification (Use)                                 False
Ownership                                                    False
Co-Located                                                   F

In [14]:
estates_imputed_df[estates_imputed_df["Condition Grade"].isnull()]

Unnamed: 0,Activity code,Property Classification,Record Level,Division Name,Division,Unitary Authority,Site Name,Property Name,X (Easting),Y (Northing),...,Mountain Rescue (Yes / No),Dogs / Horses (Yes / No),Fleet workshop (Yes / No),Mortuary (Yes / No),Stores (Yes / No),Critical IT infrastructure (Yes / No),Accommodation (Yes / No),Intelligence Support (Yes / No),Number of Parking Spaces,EV Infrastructure
66,P1322,Police Station,,Renfrew & Inverclyde,K,Renfrewshire Council,Johnstone,Johnstone Police station-New,242735,663066,...,No,No,No,No,No,,No,No,,2
135,P1452,Police Station,,Lanarkshire,Q,South Lanarkshire Council,Lesmahagow,Lesmahagow Fire Station,281719,640378,...,No,No,No,No,No,,No,No,,0
148,P1028,Police Station,,Ayrshire,U,South Ayrshire Council,Ayr (new),Newton House,233755,622657,...,No,No,No,No,No,,No,Yes,,0
163,P1027,Police Station,,Ayrshire,U,South Ayrshire Council,Girvan,Girvan Police station,219135,598598,...,No,No,No,No,No,,No,No,,0
172,P1441,Police Station,,Ayrshire,U,North Ayrshire Council,Largs,Largs Police station,220416,659790,...,No,No,No,No,No,,No,No,,0
174,P1463,Police Station,,Ayrshire,U,South Ayrshire Council,Maybole (new),Maybole Police Station,229286,609632,...,No,No,No,No,No,,No,No,1.0,0
181,P1007,Police Station,,Ayrshire,U,East Ayrshire Council,Stewarton,Stewarton Police station,241983,645992,...,No,No,No,No,No,,No,No,0.0,1


In [15]:
#columns_for_imputation = [
#    "Property Size GIA sq.m",
#    "Primary Classification (Use)",
#    "Ownership",
#    "Year of Construction",
#    "Division",
#    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)", 
#    "Condition Grade"
#]

columns_for_imputation = [
    "Primary Classification (Use)",
    "Location Category (A=Urban, B=Rural, C=Remote, D=Islands)", 
    "Condition Grade"
]
estates_imputed_df = categorical_imputation(estates_df_imputed, "Condition Grade", columns_for_imputation)


NameError: name 'estates_df_imputed' is not defined

In [None]:
_

In [None]:
estates_df_encoded[columns_for_imputation[1:]] = estates_df_encoded[columns_for_imputation[1:]].astype('Int64')

In [None]:
# Imputing using a KNN Imputer


#knn_imputer = KNNImputer(n_neighbors=3)
#estates_df_imputed = estates_df_encoded.copy()

#estates_df_imputed[columns_for_imputation] = knn_imputer.fit_transform(estates_df_encoded[columns_for_imputation])

In [None]:
# Filters
police_station_filter = estates_df["Property Classification"] == "Police Station"
local_policing_filter = estates_df["Primary Classification (Use)"] == "Local Policing"
deployment_filter = estates_df["Deployment station? Y/N"] == "Yes"
estates_df = estates_df[police_station_filter & local_policing_filter & deployment_filter]

In [None]:
estates_df.columns

In [None]:
# Officer Count
# Property Square
# Co located
# Condition grade