<a href="https://colab.research.google.com/github/JosephFalconio/Joseph-Falconio_dissertation/blob/main/cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# How each ChangeType code in the 2011 -> 2021 LSOA lookup is handled:
#   U (unchanged) / S (split): each 2021 LSOA inherits the score of its
#                              2011 parent, row by row.
#   M (merged) / X (irregular): the scores of all 2011 parents are averaged
#                               per 2021 LSOA.
_INHERIT_TYPES = ('U', 'S')
_AVERAGE_TYPES = ('M', 'X')
_OUTPUT_COLUMN = 'IMD_2010_in_2021_LSOA'


def convert_imd_to_2021_lsoa(imd: pd.DataFrame, lookup: pd.DataFrame) -> pd.DataFrame:
    """Re-base IMD scores keyed by 2011 LSOA onto 2021 LSOA codes.

    Args:
        imd: Frame with at least the columns ``LSOA11CD`` and ``IMD_Score``.
        lookup: Frame with at least the columns ``LSOA11CD``, ``LSOA21CD``
            and ``ChangeType``.

    Returns:
        A frame with one row per ``LSOA21CD`` and the averaged score in the
        column ``IMD_2010_in_2021_LSOA``. The frame is empty (same columns)
        when no lookup row carries a recognised change type.

    Raises:
        KeyError: If a required column is missing from either input.

    Warns:
        UserWarning: If lookup rows carry a change type other than U/S/M/X
            (they are excluded from the result), or if lookup rows end up
            without an IMD score after the join.
    """
    import warnings

    # Keep only the columns the conversion needs.
    scores = imd[['LSOA11CD', 'IMD_Score']]
    links = lookup[['LSOA11CD', 'LSOA21CD', 'ChangeType']]

    # Left join so that every lookup row survives even without a score.
    merged = pd.merge(links, scores, on='LSOA11CD', how='left')

    # Normalise the codes so ' U' or 'u' is not lost to an exact-match filter.
    change = merged['ChangeType'].astype(str).str.strip().str.upper()

    unknown = change[~change.isin(_INHERIT_TYPES + _AVERAGE_TYPES)]
    if not unknown.empty:
        warnings.warn(
            f"{len(unknown)} lookup row(s) have an unrecognised ChangeType "
            f"{sorted(unknown.unique())} and are excluded from the result.",
            stacklevel=2,
        )

    n_missing = int(merged['IMD_Score'].isna().sum())
    if n_missing:
        warnings.warn(
            f"{n_missing} lookup row(s) have no IMD score after the join "
            "(unmatched LSOA11CD or missing value).",
            stacklevel=2,
        )

    parts = []
    for code in _INHERIT_TYPES + _AVERAGE_TYPES:
        rows = merged.loc[change == code, ['LSOA21CD', 'IMD_Score']]
        if rows.empty:
            # Skip empty pieces: concatenating empty frames is deprecated
            # in recent pandas releases.
            continue
        if code in _AVERAGE_TYPES:
            rows = rows.groupby('LSOA21CD')['IMD_Score'].mean().reset_index()
        parts.append(rows)

    if not parts:
        return pd.DataFrame(columns=['LSOA21CD', _OUTPUT_COLUMN])

    combined = pd.concat(parts, ignore_index=True)

    # Split parts (and any LSOA21CD present under several change types)
    # yield repeated codes; collapse them to a single averaged row each.
    result = combined.groupby('LSOA21CD')['IMD_Score'].mean().reset_index()
    return result.rename(columns={'IMD_Score': _OUTPUT_COLUMN})


# In a notebook or when run as a script, __name__ is "__main__", so the
# conversion still runs as before; the guard only keeps the file I/O from
# firing when this code is imported.
if __name__ == "__main__":
    # Load the IMD data and the 2011 -> 2021 lookup table
    imd = pd.read_csv('IMD_2010_converted.csv')
    lookup = pd.read_csv('lookup_2011_2021_leeds.csv')

    # Convert and save to CSV
    final_imd = convert_imd_to_2021_lsoa(imd, lookup)
    final_imd.to_csv('IMD_2010_converted_2021_LSOA.csv', index=False)

    print(final_imd.head())


    LSOA21CD  IMD_2010_in_2021_LSOA
0  E01011264                    7.0
1  E01011265                    9.0
2  E01011266                   10.0
3  E01011267                    5.0
4  E01011268                    4.0
