# Synthetic Nottingham Income Estimates

This code creates synthetic income estimates for LSOAs across Nottingham (UK) 

In [157]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
#import the dataset
data = pd.read_csv('notts_lsoa_income_params.csv')
#drop unwanted cols
#data = data.drop(['Unnamed: 0', 'Transaction'], axis=1)
data.head()

Unnamed: 0,LSOA_code,mean,households,upper,lower,sd
0,E01013810,24.173,643,43.337,3.74,9.582
1,E01013811,22.592,526,41.756,3.74,9.582
2,E01013812,22.957,554,42.121,3.74,9.582
3,E01013813,15.186,578,34.350,3.74,9.582
4,E01013814,25.569,715,44.733,3.74,9.582
...,...,...,...,...,...,...
177,E01033407,20.761,646,39.925,3.74,9.582
178,E01033408,27.280,670,46.444,3.74,9.582
179,E01033409,12.340,486,31.504,3.74,9.582
180,E01033410,25.634,814,44.798,3.74,9.582


In [158]:
# setting lsoa's as the index
LSOA_data = data.set_index('LSOA_code')
LSOA_data.head(3)

Unnamed: 0_level_0,mean,households,upper,lower,sd
LSOA_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E01013810,24.173,643,43.337,3.74,9.582
E01013811,22.592,526,41.756,3.74,9.582
E01013812,22.957,554,42.121,3.74,9.582
E01013813,15.186,578,34.350,3.74,9.582
E01013814,25.569,715,44.733,3.74,9.582
...,...,...,...,...,...
E01033407,20.761,646,39.925,3.74,9.582
E01033408,27.280,670,46.444,3.74,9.582
E01033409,12.340,486,31.504,3.74,9.582
E01033410,25.634,814,44.798,3.74,9.582


# Bounded Range Estiamte
Creating an estimate based on the bounding range of upper and lower income bounds

In [166]:
#sample across the first observation (0)
mean = LSOA_data.iloc[0]['mean'] 
lower_bound = LSOA_data.iloc[0]['lower'] 
upper_bound = LSOA_data.iloc[0]['upper'] + 1  # + 1 to keep allow for the generation of the highest observed value

# Generate a random number within the Bounded Range
bounded_random_income = np.random.uniform(lower_bound, upper_bound, size=int(LSOA_data.iloc[0]['households']))
bounded_random_income


array([18.26882901,  8.5466673 , 16.35554344, 23.99852738, 12.5144462 ,
       17.20777754, 15.61769667, 25.00920166, 42.97683991, 39.73586743,
       12.66451175, 18.16216877, 39.74740326, 37.82884466, 44.02766634,
       11.65945497, 31.56173731, 24.83212118, 24.81067157, 43.82967832,
       12.63686906, 19.92161242, 10.63269171, 10.04222913, 25.79472929,
       38.70058023, 17.42076962,  5.6467308 , 14.47579852, 35.04204568,
       15.61844112, 12.59979023, 32.62111808, 17.85340738, 40.71587649,
       16.37815712, 27.62080855, 26.15845677, 40.19553705,  4.38361727,
       36.48671648, 38.7011101 , 43.72184614, 33.02068072, 16.60007138,
       11.24701453, 12.34541651, 36.17779273, 12.90891192, 11.32714713,
       29.32738652, 17.13479201, 30.71861229, 36.76011296, 25.71795523,
       25.35339333, 12.88319047, 31.18078461, 20.55502605, 16.59678094,
       38.48588559, 19.7181836 , 30.2381614 , 39.14185839, 42.29468421,
        9.28549113, 21.38210575, 21.10487758, 21.17863488, 32.41

# With SD
Creating an estimate based on the standard deviation

In [167]:
#sample across the first observation (0)
mean = LSOA_data.iloc[0]['mean'] 
sd = LSOA_data.iloc[0]['sd'] 
random_income = np.random.normal(mean, sd, size=int(LSOA_data.iloc[0]['households']))
random_income

array([25.30634763, 13.70287595, 11.14419998, 24.97978196, 17.50785695,
       13.5339838 , 17.55697402, 16.7325433 , 27.98768865, 15.13339394,
       29.05901148, 15.99685585, 18.56107113, 37.81900795, 25.84792468,
       23.93798303, 32.20254046, 33.46899142, 40.2541476 , 18.06246251,
       34.38137891, 25.67915497, 22.17216973, 19.71128382, 25.0085161 ,
       36.86852814, 43.58525131, 17.9015637 , 31.1296791 , 41.05191688,
       15.49353991,  7.84231471, 39.98771147, 28.23102513, 26.50549844,
       38.63900148, 20.45428385, 27.70403919, 20.94380117, 19.32050901,
       13.61166926, 22.38195945, 15.46720504, 22.63753334, 30.44411596,
       24.91625135,  6.16774129, 15.00869334, 29.76196952, 34.60775472,
       35.41102156, 33.89210511, 18.79689004, 15.46796255, 24.74460463,
       26.80448104, 27.05514548, 20.23162434, 41.14899117, 12.30650358,
       29.92166853, 39.22957635, 23.44830253, 38.68987906, 19.42147768,
        9.55007383, 20.20112618, 22.94974799, 26.25162803, 29.67

In [161]:
# defining extra contextual columns in the dataframe
synthetic_dataframe = pd.DataFrame(columns=[['house_number','synthetic_data']])

# Fill the DataFrame with values from the list
for i in range(int(LSOA_data.iloc[0]['households'])):
    synthetic_dataframe.loc[i, 'house_number'] = i
    synthetic_dataframe.loc[i, 'synthetic_data'] = random_income[i]
synthetic_dataframe

Unnamed: 0,house_number,synthetic_data
0,0,30.086955
1,1,21.824815
2,2,38.934447
3,3,26.034546
4,4,22.870879
...,...,...
638,638,35.87638
639,639,7.142589
640,640,25.079773
641,641,19.472715


# The Function the Creates Synthetic Data and Exports to CSVs

In [171]:

def create_sythetic_data(dataframe):
    '''
    Creates synthetic income data for regions and outputs them as CSVs.

    Args:
      dataframe: the dataframe that contains the mean and standard deviation of income across an area.

    Returns:
      CSV(s) containing the income estimates for that area.
    '''
    for i in range(len(dataframe)):
        mean = dataframe.iloc[i]['mean'] 
        sd = dataframe.iloc[i]['sd']
        
        random_income = np.random.normal(mean, sd, size=int(dataframe.iloc[i]['households']))
        
        synthetic_dataframe = pd.DataFrame(columns=[['lsoa_code','synthetic_data']])

            # Fill the DataFrame with values from the list
        for j in range(int(LSOA_data.iloc[i]['households'])):
            synthetic_dataframe.loc[j, 'lsoa_code'] = data['LSOA_code'][i]
            synthetic_dataframe.loc[j, 'synthetic_data'] = random_income[j]
            value_for_filename =  i
            synthetic_dataframe.to_csv(f"/Users/gregormilligan/Desktop/semple_code_help/{data['LSOA_code'][i]}_synthetic_dataframe_{value_for_filename}.csv")
    print(f'number of csvs create {i+1}') 


In [172]:
create_sythetic_data(LSOA_data)

number of csvs create 182
