# Cleaning the Data

In [1]:
import pandas as pd


In [2]:
# Load the raw dataset; index_col=False tells pandas not to use the first CSV column as the index
dataset = pd.read_csv('../../Data/Data Sets/raw_dataset.csv', index_col=False)

# Look at the data (pandas shows only the first and last rows of a long frame)
display(dataset)


Unnamed: 0,StopCount,StopCountDrugs,LSOA21CD,Borough,PopulationLSOA,EthnicMinority,IncomeDomainScore,CrimeSum,DrugCrimeSum,MeanHousePrice,PoliceDept
0,127.0,111.0,E01006434,Knowsley,1518,1.91,932.0,228.0,23.0,123955.0,Merseyside
1,98.0,83.0,E01006435,Knowsley,1524,3.35,782.0,149.0,18.0,134664.0,Merseyside
2,373.0,326.0,E01006436,Knowsley,1457,3.29,613.0,307.0,58.0,111733.0,Merseyside
3,534.0,434.0,E01006437,Knowsley,1387,3.03,1649.0,921.0,72.0,119648.0,Merseyside
4,38.0,32.0,E01006438,Knowsley,1153,5.29,17818.0,91.0,7.0,331221.0,Merseyside
...,...,...,...,...,...,...,...,...,...,...,...
5912,4.0,0.0,E01004660,Westminster,1430,41.54,31573.0,344.0,1.0,1655186.0,London
5913,278.0,119.0,E01004661,Westminster,1998,49.80,14052.0,737.0,17.0,1283005.0,London
5914,23.0,9.0,E01004662,Westminster,1318,31.64,29619.0,306.0,2.0,1427386.0,London
5915,108.0,49.0,E01004663,Westminster,1417,37.47,19021.0,333.0,15.0,1817948.0,London


# Addressing Missing Data

In [3]:
def missing_rows(df=None):
    """
    Print the number of missing (NaN) values for every column that has any.

    Parameters:
    df (pd.DataFrame, optional): The DataFrame to inspect. When omitted, the
        notebook-level `dataset` is used, so existing `missing_rows()` calls
        keep working unchanged.

    Returns:
    None. One line is printed per column containing at least one missing
    value; columns without missing values are not mentioned.
    """
    if df is None:
        # Looked up at call time, so the latest `dataset` binding is reported
        df = dataset

    # Vectorised per-column NaN count (column order is preserved)
    na_counts = df.isna().sum()

    for col, na_count in na_counts[na_counts > 0].items():
        print(f'\nColumn : {col} has: {na_count} values')

# Report which columns contain missing values before any imputation is applied
missing_rows()


Column : StopCount has: 284 values

Column : StopCountDrugs has: 284 values

Column : IncomeDomainScore has: 369 values

Column : CrimeSum has: 21 values

Column : DrugCrimeSum has: 636 values

Column : MeanHousePrice has: 524 values


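NAs in the count columns (StopCount, StopCountDrugs, CrimeSum and DrugCrimeSum) can all be replaced with 0, on the assumption that a missing count means no events were recorded for that LSOA.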

In [4]:
# Zero-fill the count columns only; the other columns are left untouched.
# The result is reassigned rather than using inplace=True, so the cell
# reads as a plain transformation of `dataset`.
dataset = dataset.fillna({'StopCount': 0,
                          'StopCountDrugs': 0,
                          'CrimeSum': 0,
                          'DrugCrimeSum': 0})

# Only IncomeDomainScore and MeanHousePrice should still be reported as missing
missing_rows()


Column : IncomeDomainScore has: 369 values

Column : MeanHousePrice has: 524 values


In [5]:
# Impute the remaining gaps with the mean of the row's own police force area,
# so London and Merseyside are averaged separately.
columns_to_impute = ['IncomeDomainScore', 'MeanHousePrice']

# transform('mean') returns one row per original row holding that row's group
# mean (missing values are skipped when averaging), so the result is aligned
# with `dataset` and can be passed straight to fillna.
mean_values = dataset.groupby('PoliceDept')[columns_to_impute].transform('mean')

# Only the NaN cells are replaced; existing values are kept as they are
dataset[columns_to_impute] = dataset[columns_to_impute].fillna(mean_values)

# Verify the dataset
display(dataset)
missing_rows()

Unnamed: 0,StopCount,StopCountDrugs,LSOA21CD,Borough,PopulationLSOA,EthnicMinority,IncomeDomainScore,CrimeSum,DrugCrimeSum,MeanHousePrice,PoliceDept
0,127.0,111.0,E01006434,Knowsley,1518,1.91,932.0,228.0,23.0,123955.0,Merseyside
1,98.0,83.0,E01006435,Knowsley,1524,3.35,782.0,149.0,18.0,134664.0,Merseyside
2,373.0,326.0,E01006436,Knowsley,1457,3.29,613.0,307.0,58.0,111733.0,Merseyside
3,534.0,434.0,E01006437,Knowsley,1387,3.03,1649.0,921.0,72.0,119648.0,Merseyside
4,38.0,32.0,E01006438,Knowsley,1153,5.29,17818.0,91.0,7.0,331221.0,Merseyside
...,...,...,...,...,...,...,...,...,...,...,...
5912,4.0,0.0,E01004660,Westminster,1430,41.54,31573.0,344.0,1.0,1655186.0,London
5913,278.0,119.0,E01004661,Westminster,1998,49.80,14052.0,737.0,17.0,1283005.0,London
5914,23.0,9.0,E01004662,Westminster,1318,31.64,29619.0,306.0,2.0,1427386.0,London
5915,108.0,49.0,E01004663,Westminster,1417,37.47,19021.0,333.0,15.0,1817948.0,London


# Normalising Scores

In [6]:
def convert_to_z_scores(df, column_names, group_col='PoliceDept', decimals=3):
    """
    Add group-wise z-score columns for the given columns.

    For every requested column a new column named '<column>_z' is added that
    holds (value - group mean) / group standard deviation, rounded to
    `decimals` places. Mean and standard deviation are computed separately
    within each group of `group_col` (e.g., London and Merseyside).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns to transform.
        It is not modified; the work is done on a copy.
    column_names (list or str): Column name(s) to convert to z-scores.
        A single name may be passed as a plain string.
    group_col (str): Column whose values define the groups. Defaults to
        'PoliceDept', the grouping this notebook uses.
    decimals (int): Number of decimal places kept in the z-scores. Defaults to 3.

    Returns:
    pd.DataFrame: A copy of `df` with one extra '<column>_z' column per
    requested column. Groups whose standard deviation is zero or undefined
    (e.g. a single row) do not produce finite z-scores.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(column_names, str):
        column_names = [column_names]

    # Work on a copy so the caller's DataFrame is left untouched
    df_copy = df.copy()

    for column_name in column_names:
        grouped = df_copy.groupby(group_col)[column_name]

        # Per-row group statistics, aligned with df_copy's index
        group_mean = grouped.transform('mean')
        group_std = grouped.transform('std')

        df_copy[column_name + '_z'] = (
            (df_copy[column_name] - group_mean) / group_std
        ).round(decimals)

    return df_copy


In [7]:
# Numeric columns that each get a per-police-force z-score companion column
columns_to_convert = [
    'StopCount',
    'EthnicMinority',
    'MeanHousePrice',
    'CrimeSum',
    'DrugCrimeSum',
    'IncomeDomainScore',
]

# Append a '<column>_z' column to `dataset` for every entry listed above
dataset = convert_to_z_scores(dataset, columns_to_convert)

dataset



Unnamed: 0,StopCount,StopCountDrugs,LSOA21CD,Borough,PopulationLSOA,EthnicMinority,IncomeDomainScore,CrimeSum,DrugCrimeSum,MeanHousePrice,PoliceDept,StopCount_z,EthnicMinority_z,MeanHousePrice_z,CrimeSum_z,DrugCrimeSum_z,IncomeDomainScore_z
0,127.0,111.0,E01006434,Knowsley,1518,1.91,932.0,228.0,23.0,123955.0,Merseyside,0.574,-0.673,-0.769,0.151,0.382,-1.087
1,98.0,83.0,E01006435,Knowsley,1524,3.35,782.0,149.0,18.0,134664.0,Merseyside,0.345,-0.505,-0.660,-0.146,0.204,-1.102
2,373.0,326.0,E01006436,Knowsley,1457,3.29,613.0,307.0,58.0,111733.0,Merseyside,2.511,-0.512,-0.893,0.449,1.629,-1.119
3,534.0,434.0,E01006437,Knowsley,1387,3.03,1649.0,921.0,72.0,119648.0,Merseyside,3.779,-0.542,-0.813,2.759,2.128,-1.015
4,38.0,32.0,E01006438,Knowsley,1153,5.29,17818.0,91.0,7.0,331221.0,Merseyside,-0.127,-0.279,1.342,-0.364,-0.188,0.605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5912,4.0,0.0,E01004660,Westminster,1430,41.54,31573.0,344.0,1.0,1655186.0,London,-0.364,-0.195,1.973,0.514,-0.373,1.830
5913,278.0,119.0,E01004661,Westminster,1998,49.80,14052.0,737.0,17.0,1283005.0,London,3.168,0.246,1.220,1.868,0.607,-0.181
5914,23.0,9.0,E01004662,Westminster,1318,31.64,29619.0,306.0,2.0,1427386.0,London,-0.119,-0.724,1.512,0.384,-0.312,1.606
5915,108.0,49.0,E01004663,Westminster,1417,37.47,19021.0,333.0,15.0,1817948.0,London,0.977,-0.413,2.302,0.477,0.484,0.390


# Add Gini Coefficient

In [8]:
# Load the per-LSOA Gini coefficients (columns: County, LSOA, gini)
gini = pd.read_csv('../../Data/Data Sets/gini_dataset.csv')
gini

Unnamed: 0,County,LSOA,gini
0,GREATER LONDON,E01000001,0.216543
1,GREATER LONDON,E01000002,0.224353
2,GREATER LONDON,E01000003,0.230507
3,GREATER LONDON,E01000005,0.631699
4,GREATER LONDON,E01000006,0.261445
...,...,...,...
5916,MERSEYSIDE,E01034837,0.158510
5917,MERSEYSIDE,E01034838,0.394225
5918,MERSEYSIDE,E01034839,0.613060
5919,MERSEYSIDE,E01034840,0.183836


In [9]:
# Left join on the LSOA code: every row of `dataset` is kept, and rows with
# no matching Gini entry end up with NaN in the columns coming from `gini`
dataset = dataset.merge(gini, how='left', left_on='LSOA21CD', right_on='LSOA')
dataset

Unnamed: 0,StopCount,StopCountDrugs,LSOA21CD,Borough,PopulationLSOA,EthnicMinority,IncomeDomainScore,CrimeSum,DrugCrimeSum,MeanHousePrice,PoliceDept,StopCount_z,EthnicMinority_z,MeanHousePrice_z,CrimeSum_z,DrugCrimeSum_z,IncomeDomainScore_z,County,LSOA,gini
0,127.0,111.0,E01006434,Knowsley,1518,1.91,932.0,228.0,23.0,123955.0,Merseyside,0.574,-0.673,-0.769,0.151,0.382,-1.087,MERSEYSIDE,E01006434,0.110904
1,98.0,83.0,E01006435,Knowsley,1524,3.35,782.0,149.0,18.0,134664.0,Merseyside,0.345,-0.505,-0.660,-0.146,0.204,-1.102,MERSEYSIDE,E01006435,0.219380
2,373.0,326.0,E01006436,Knowsley,1457,3.29,613.0,307.0,58.0,111733.0,Merseyside,2.511,-0.512,-0.893,0.449,1.629,-1.119,MERSEYSIDE,E01006436,0.139740
3,534.0,434.0,E01006437,Knowsley,1387,3.03,1649.0,921.0,72.0,119648.0,Merseyside,3.779,-0.542,-0.813,2.759,2.128,-1.015,MERSEYSIDE,E01006437,0.594827
4,38.0,32.0,E01006438,Knowsley,1153,5.29,17818.0,91.0,7.0,331221.0,Merseyside,-0.127,-0.279,1.342,-0.364,-0.188,0.605,MERSEYSIDE,E01006438,0.247459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5912,4.0,0.0,E01004660,Westminster,1430,41.54,31573.0,344.0,1.0,1655186.0,London,-0.364,-0.195,1.973,0.514,-0.373,1.830,GREATER LONDON,E01004660,0.301121
5913,278.0,119.0,E01004661,Westminster,1998,49.80,14052.0,737.0,17.0,1283005.0,London,3.168,0.246,1.220,1.868,0.607,-0.181,GREATER LONDON,E01004661,0.755591
5914,23.0,9.0,E01004662,Westminster,1318,31.64,29619.0,306.0,2.0,1427386.0,London,-0.119,-0.724,1.512,0.384,-0.312,1.606,GREATER LONDON,E01004662,0.346453
5915,108.0,49.0,E01004663,Westminster,1417,37.47,19021.0,333.0,15.0,1817948.0,London,0.977,-0.413,2.302,0.477,0.484,0.390,GREATER LONDON,E01004663,0.416258


In [10]:
# Drop the helper columns brought in by the Gini merge: LSOA repeats the
# LSOA21CD join key and County is not used again in this notebook.
# A list keeps the labels ordered, and reassigning avoids inplace=True.
dataset = dataset.drop(columns=['County', 'LSOA'])

In [11]:
# Fill missing Gini values with the mean Gini of the row's own police force area
gini_by_force = dataset.groupby('PoliceDept')['gini']
dataset['gini'] = gini_by_force.transform(lambda grp: grp.fillna(grp.mean()))

dataset

Unnamed: 0,StopCount,StopCountDrugs,LSOA21CD,Borough,PopulationLSOA,EthnicMinority,IncomeDomainScore,CrimeSum,DrugCrimeSum,MeanHousePrice,PoliceDept,StopCount_z,EthnicMinority_z,MeanHousePrice_z,CrimeSum_z,DrugCrimeSum_z,IncomeDomainScore_z,gini
0,127.0,111.0,E01006434,Knowsley,1518,1.91,932.0,228.0,23.0,123955.0,Merseyside,0.574,-0.673,-0.769,0.151,0.382,-1.087,0.110904
1,98.0,83.0,E01006435,Knowsley,1524,3.35,782.0,149.0,18.0,134664.0,Merseyside,0.345,-0.505,-0.660,-0.146,0.204,-1.102,0.219380
2,373.0,326.0,E01006436,Knowsley,1457,3.29,613.0,307.0,58.0,111733.0,Merseyside,2.511,-0.512,-0.893,0.449,1.629,-1.119,0.139740
3,534.0,434.0,E01006437,Knowsley,1387,3.03,1649.0,921.0,72.0,119648.0,Merseyside,3.779,-0.542,-0.813,2.759,2.128,-1.015,0.594827
4,38.0,32.0,E01006438,Knowsley,1153,5.29,17818.0,91.0,7.0,331221.0,Merseyside,-0.127,-0.279,1.342,-0.364,-0.188,0.605,0.247459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5912,4.0,0.0,E01004660,Westminster,1430,41.54,31573.0,344.0,1.0,1655186.0,London,-0.364,-0.195,1.973,0.514,-0.373,1.830,0.301121
5913,278.0,119.0,E01004661,Westminster,1998,49.80,14052.0,737.0,17.0,1283005.0,London,3.168,0.246,1.220,1.868,0.607,-0.181,0.755591
5914,23.0,9.0,E01004662,Westminster,1318,31.64,29619.0,306.0,2.0,1427386.0,London,-0.119,-0.724,1.512,0.384,-0.312,1.606,0.346453
5915,108.0,49.0,E01004663,Westminster,1417,37.47,19021.0,333.0,15.0,1817948.0,London,0.977,-0.413,2.302,0.477,0.484,0.390,0.416258


In [12]:
# Persist the cleaned frame in the same folder as the raw data; index=False leaves the row index out of the file
dataset.to_csv('../../Data/Data Sets/cleaned_dataset.csv', index=False)
