# Making the Gini Variable




Built with data from the UK government's Price Paid dataset:

https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads

In [1]:
import pandas as pd
import numpy as np


In [2]:


# Source file: Land Registry price-paid transactions for 2022
url = 'http://prod2.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2022.txt'

# Column labels in file order (based on official schema)
price_paid_columns = [
    "Transaction unique identifier",
    "Price",
    "Date of Transfer",
    "Postcode",
    "Property Type",
    "Old/New",
    "Duration",
    "PAON",
    "SAON",
    "Street",
    "Locality",
    "Town/City",
    "District",
    "County",
    "PPD Category Type",
    "Record Status",
]

# The file is read without a header row, so the labels are attached afterwards
data = pd.read_csv(url, sep=',', header=None, encoding='latin1')
data.columns = price_paid_columns

# Render the full frame (the notebook shows a truncated head/tail view)
display(data)


Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Postcode,Property Type,Old/New,Duration,PAON,SAON,Street,Locality,Town/City,District,County,PPD Category Type,Record Status
0,{045A1898-4ABF-9A24-E063-4804A8C048EA},407400,2022-04-28 00:00,LU7 3FZ,S,Y,F,68,,RAMSAY DRIVE,,LEIGHTON BUZZARD,CENTRAL BEDFORDSHIRE,CENTRAL BEDFORDSHIRE,A,A
1,{045A1898-4AC1-9A24-E063-4804A8C048EA},357000,2022-05-27 00:00,LU7 3QS,S,Y,F,44,,CHADWICK CRESCENT,,LEIGHTON BUZZARD,CENTRAL BEDFORDSHIRE,CENTRAL BEDFORDSHIRE,A,A
2,{045A1898-4AC2-9A24-E063-4804A8C048EA},372950,2022-04-28 00:00,LU5 6TD,S,Y,F,11,,SKYE GARDENS,HOUGHTON REGIS,DUNSTABLE,CENTRAL BEDFORDSHIRE,CENTRAL BEDFORDSHIRE,A,A
3,{045A1898-4AC4-9A24-E063-4804A8C048EA},570000,2022-05-31 00:00,SG18 9RF,D,Y,F,8,,HARVEST MOUSE PLACE,LANGFORD,BIGGLESWADE,CENTRAL BEDFORDSHIRE,CENTRAL BEDFORDSHIRE,A,A
4,{045A1898-4AC5-9A24-E063-4804A8C048EA},570000,2022-05-27 00:00,LU7 3QS,D,Y,F,42,,CHADWICK CRESCENT,,LEIGHTON BUZZARD,CENTRAL BEDFORDSHIRE,CENTRAL BEDFORDSHIRE,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068640,{E2D14905-55E3-4C2D-E053-6B04A8C0422B},432000,2022-05-20 00:00,RM9 5UL,T,N,F,59,,CONNOR ROAD,,DAGENHAM,BARKING AND DAGENHAM,GREATER LONDON,A,A
1068641,{E2D14905-55E4-4C2D-E053-6B04A8C0422B},510000,2022-04-29 00:00,E4 8PH,T,N,F,27,,SINCLAIR ROAD,,LONDON,WALTHAM FOREST,GREATER LONDON,A,A
1068642,{E2D14905-55E5-4C2D-E053-6B04A8C0422B},635000,2022-05-23 00:00,E17 4PN,T,N,F,133,,CHINGFORD ROAD,,LONDON,WALTHAM FOREST,GREATER LONDON,A,A
1068643,{E2D14905-55E6-4C2D-E053-6B04A8C0422B},895000,2022-04-01 00:00,IG8 9EH,T,N,F,35,,ST ALBANS CRESCENT,,WOODFORD GREEN,REDBRIDGE,GREATER LONDON,A,A


## Filter to Greater London and Merseyside
Restricting the data to these two counties keeps it small enough to wrangle easily.

In [3]:
# Keep only sales in the two counties of interest that have a postcode, and
# only the columns needed downstream (Price, Postcode, County).
#
# This is written as one chain that builds a new frame instead of calling
# dropna(..., inplace=True) on the result of a boolean filter: mutating a
# filtered frame in place can raise SettingWithCopyWarning on pandas < 3.
# The resulting rows, columns and index labels are the same as before.
data = (
    data.loc[
        data['County'].isin(['GREATER LONDON', 'MERSEYSIDE']),
        ['Price', 'Postcode', 'County'],
    ]
    # Rows without a postcode cannot be matched to an LSOA later on
    .dropna(subset=['Postcode'])
)

# Display the filtered DataFrame
data


Unnamed: 0,Price,Postcode,County
30,215000,SM6 8AR,GREATER LONDON
31,315000,CR4 1FD,GREATER LONDON
32,890000,BR4 9NW,GREATER LONDON
33,225000,CR2 6AE,GREATER LONDON
34,412500,BR1 1AG,GREATER LONDON
...,...,...,...
1068640,432000,RM9 5UL,GREATER LONDON
1068641,510000,E4 8PH,GREATER LONDON
1068642,635000,E17 4PN,GREATER LONDON
1068643,895000,IG8 9EH,GREATER LONDON


## Map to LSOA

Postcode-to-LSOA (2021) lookup table: https://geoportal.statistics.gov.uk/datasets/80592949bebd4390b2cbe29159a75ef4/about

In [4]:
# Read the postcode lookup table from the local data directory
lookup_csv = '../../Data/LSOA_data/PostCodes/PCD_OA21_LSOA21_MSOA21_LAD_FEB25_UK_LU.csv'
post_code_map = pd.read_csv(lookup_csv)

post_code_map

  post_code_map = pd.read_csv('../../Data/LSOA_data/PostCodes/PCD_OA21_LSOA21_MSOA21_LAD_FEB25_UK_LU.csv')


Unnamed: 0,pcd7,pcd8,pcds,dointr,doterm,usertype,oa21cd,lsoa21cd,msoa21cd,ladcd,lsoa21nm,msoa21nm,ladnm,ladnmw
0,AB1 0AA,AB1 0AA,AB1 0AA,198001,199606.0,0,S00137176,S01013490,S02002516,S12000033,"Cults, Bieldside and Milltimber West - 02","Cults, Bieldside and Milltimber West",Aberdeen City,
1,AB1 0AB,AB1 0AB,AB1 0AB,198001,199606.0,0,S00137176,S01013490,S02002516,S12000033,"Cults, Bieldside and Milltimber West - 02","Cults, Bieldside and Milltimber West",Aberdeen City,
2,AB1 0AD,AB1 0AD,AB1 0AD,198001,199606.0,0,S00137176,S01013490,S02002516,S12000033,"Cults, Bieldside and Milltimber West - 02","Cults, Bieldside and Milltimber West",Aberdeen City,
3,AB1 0AE,AB1 0AE,AB1 0AE,199402,199606.0,0,S00138891,S01013856,S02002577,S12000034,"Dunecht, Durris and Drumoak - 01","Dunecht, Durris and Drumoak",Aberdeenshire,
4,AB1 0AF,AB1 0AF,AB1 0AF,199012,199207.0,1,S00137241,S01013487,S02002515,S12000033,Culter - 06,Culter,Aberdeen City,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712501,ZE3 9JW,ZE3 9JW,ZE3 9JW,198001,,0,S00174882,S01019720,S02003651,S12000027,Shetland South - 01,Shetland South,Shetland Islands,
2712502,ZE3 9JX,ZE3 9JX,ZE3 9JX,198001,,0,S00174882,S01019720,S02003651,S12000027,Shetland South - 01,Shetland South,Shetland Islands,
2712503,ZE3 9JY,ZE3 9JY,ZE3 9JY,198001,,0,S00174880,S01019720,S02003651,S12000027,Shetland South - 01,Shetland South,Shetland Islands,
2712504,ZE3 9JZ,ZE3 9JZ,ZE3 9JZ,198001,,0,S00174880,S01019720,S02003651,S12000027,Shetland South - 01,Shetland South,Shetland Islands,


In [5]:
# Narrow the lookup to the postcode column used as the join key and the LSOA code
lookup_columns = ['pcds', 'lsoa21cd']
post_code_map = post_code_map[lookup_columns]

post_code_map

Unnamed: 0,pcds,lsoa21cd
0,AB1 0AA,S01013490
1,AB1 0AB,S01013490
2,AB1 0AD,S01013490
3,AB1 0AE,S01013856
4,AB1 0AF,S01013487
...,...,...
2712501,ZE3 9JW,S01019720
2712502,ZE3 9JX,S01019720
2712503,ZE3 9JY,S01019720
2712504,ZE3 9JZ,S01019720


In [6]:
# Attach an LSOA code to every sale by matching its postcode against the lookup.
# A left join keeps every sale row, including any whose postcode has no match.
sales_with_lsoa = data.merge(
    post_code_map,
    how='left',
    left_on='Postcode',
    right_on='pcds',
)

# Keep the three analysis columns, exposing the LSOA code under a shorter label
gini_data = (
    sales_with_lsoa[['lsoa21cd', 'Price', 'County']]
    .rename(columns={'lsoa21cd': 'LSOA'})
)

gini_data

Unnamed: 0,LSOA,Price,County
0,E01004172,215000,GREATER LONDON
1,E01003435,315000,GREATER LONDON
2,E01000840,890000,GREATER LONDON
3,E01001040,225000,GREATER LONDON
4,E01034386,412500,GREATER LONDON
...,...,...,...
146383,E01000065,432000,GREATER LONDON
146384,E01004454,510000,GREATER LONDON
146385,E01004347,635000,GREATER LONDON
146386,E01003670,895000,GREATER LONDON


In [7]:


def gini(x):
    """Return the Gini coefficient of a one-dimensional collection of values.

    Implements the mean-absolute-difference definition

        G = sum_i sum_j |x_i - x_j| / (2 * n**2 * mean(x))

    The double sum is evaluated from the sorted values as
    ``2 * sum_i (2*i - n - 1) * x_(i)`` for ``i = 1..n``. This is the same
    quantity, but it needs O(n log n) time and O(n) memory instead of an
    n-by-n matrix of pairwise differences, so large inputs no longer exhaust
    memory.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric values (for example a pandas Series of prices).

    Returns
    -------
    float
        The Gini coefficient. A single value yields 0.0. ``nan`` is returned
        when ``x`` is empty or its mean is zero, where the coefficient is
        undefined.
    """
    values = np.sort(np.asarray(x))
    n = len(values)
    if n == 0:
        # No observations: nothing to measure
        return float('nan')

    mu = np.mean(values)
    if mu == 0:
        # Normalising by a zero mean is undefined
        return float('nan')

    # In the sorted sequence the i-th smallest value is the larger element in
    # (i - 1) pairs and the smaller one in (n - i) pairs, so over all unordered
    # pairs it contributes with weight (i - 1) - (n - i) = 2*i - n - 1.
    weights = 2 * np.arange(1, n + 1) - n - 1
    # Factor 2: the definition sums over ordered pairs (i, j) and (j, i)
    diff_sum = 2 * np.sum(weights * values)
    return diff_sum / (2 * n**2 * mu)


# Compute one Gini coefficient per (County, LSOA) group of sale prices.
# NOTE: the assignment below rebinds the name `gini` from the function to the
# resulting DataFrame, so the function definition has to run again before this
# statement can be re-executed.
prices_by_lsoa = gini_data.groupby(['County', 'LSOA'])['Price']
gini = prices_by_lsoa.apply(gini).reset_index(name='gini')


In [8]:
# Display the resulting table: one row per (County, LSOA) with its Gini value
gini

Unnamed: 0,County,LSOA,gini
0,GREATER LONDON,E01000001,0.216543
1,GREATER LONDON,E01000002,0.224353
2,GREATER LONDON,E01000003,0.230507
3,GREATER LONDON,E01000005,0.631699
4,GREATER LONDON,E01000006,0.261445
...,...,...,...
5916,MERSEYSIDE,E01034837,0.158510
5917,MERSEYSIDE,E01034838,0.394225
5918,MERSEYSIDE,E01034839,0.613060
5919,MERSEYSIDE,E01034840,0.183836


In [9]:
# Persist the per-LSOA Gini table as CSV, leaving out the DataFrame index
gini_csv_path = '../../Data/Data Sets/gini_dataset.csv'
gini.to_csv(gini_csv_path, index=False)