# Consumer's dataset merging process

In [1]:
import folium
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [96]:
# Consumer records (curated parquet directory) and the postcode -> SA2 lookup.
consumer = pd.read_parquet("../data/curated/consumer/")
postcode_lookup_columns = ['postcode', 'SA2_code']
processed_postcode = pd.read_csv('../data/curated/processed_postcode.csv').loc[:, postcode_lookup_columns]
processed_postcode

Unnamed: 0,postcode,SA2_code
0,200,801051049.0
1,800,701011002.0
2,801,701011002.0
3,804,701011007.0
4,810,701021010.0
...,...,...
3162,9013,305011105.0
3163,9015,305011105.0
3164,9464,302031038.0
3165,9726,309101268.0


In [115]:
# Second read of the same consumer parquet directory that `consumer` is loaded from;
# `df` is the working copy inspected below and later rebound to the merged frame.
df = pd.read_parquet("../data/curated/consumer/")
df

Unnamed: 0,consumer_id,user_id,state,postcode,gender
0,28,458885,WA,6176,Male
1,78,319257,SA,5410,Male
2,101,9180,SA,5554,Female
3,108,191536,SA,5052,Female
4,133,234634,WA,6985,Male
...,...,...,...,...,...
499994,1499869,381121,QLD,4403,Female
499995,1499885,325002,TAS,7163,Male
499996,1499910,414057,NSW,2090,Female
499997,1499911,56561,WA,6105,Female


In [98]:
# Raw ABS sheet: it is read without a header row, so the title/preamble lines
# and the real column names arrive as ordinary data rows.
income_workbook = '../data/external/total_income.xlsx'
income_df = pd.read_excel(income_workbook, sheet_name='Table 1.4')
income_df

Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,Personal Income in Australia Table 1. Total In...,,,,,,,,,,...,,,,,,,,,,
1,Released at 11:30 am (Canberra time) 22/01/2021,,,,,,,,,,...,,,,,,,,,,
2,Table 1.4,Total Income by Statistical Area Level 2 (2014...,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,Earners (persons),,,,,Median age of earners (years),,,...,,,,,Mean ($),,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2301,801111140,ACT - South West,349,289,347,364,355,40,39,39,...,62592,55384,61096,64227,62505,72858,70503,67445,73435,
2302,801111141,Namadgi,np,np,21,19,18,np,np,34,...,np,49981,58498,40479,np,np,57309,56357,39278,
2303,,,,,,,,,,,...,,,,,,,,,,
2304,Totals may not align with the sum of their com...,,,,,,,,,,...,,,,,,,,,,


In [99]:
# Build the SA2-level income table from the raw sheet.
# Rows 0-5 of the sheet are title/preamble lines; row 5 carries the real column names.
income_header = income_df.iloc[5].tolist()
income_df1 = income_df.drop(income_df.index[0:6]).reset_index(drop=True)
income_df1.columns = income_header

# Keep the SA2 code, the SA2 name and the '2018-19' column at position 26
# (presumably the mean total income series - TODO confirm against the sheet layout).
# `.copy()` gives an independent frame so the steps below never act on a slice
# of the wider table (avoids SettingWithCopyWarning).
income_df1 = income_df1.iloc[:, [0, 1, 26]].copy()

# Positions 2297-2299 are trailing footer rows of the sheet, not SA2 records.
income_df1 = income_df1.drop(income_df1.index[2297:2300])

income_df1 = income_df1.rename(columns={'2018-19': 'Mean_Total_Income', 'SA2': 'SA2_code'})

# Rows with any missing field (e.g. the national/state summary rows, which have
# no SA2 name) are removed here.
income_df1 = income_df1.dropna().reset_index(drop=True)
#income_df1.astype({'SA2': 'int'})
#income_df1.to_csv("../data/curated/income.csv")

In [100]:
# 'np' is the sheet's placeholder for a value that is not available (presumably
# "not published"). Blank those cells out and convert the column to float
# explicitly instead of relying on `replace` to downcast an object column.
# Note: the `np.NAN` alias used previously was removed in NumPy 2.0.
mean_income_raw = income_df1['Mean_Total_Income']
income_df1['Mean_Total_Income'] = pd.to_numeric(
    mean_income_raw.mask(mean_income_raw == 'np')
).astype('float64')

In [70]:
# Sanity check: inspect the Python type of the first income value after the 'np' cleanup.
type(income_df1["Mean_Total_Income"].iloc[0])

numpy.float64

In [101]:
# SA2 areas that still have at least one missing field after the cleanup.
rows_with_gaps = income_df1.isna().any(axis=1)
SA2_missing = income_df1.loc[rows_with_gaps]
SA2_missing.shape

(44, 3)

In [116]:
# Postcode 2611 is shared by several areas; check whether any consumer
# belongs to the one in the ACT.
df[df['postcode'].eq(2611) & df['state'].eq('ACT')]

Unnamed: 0,consumer_id,user_id,state,postcode,gender


In [117]:
# Attach an SA2 code to every consumer via postcode; a left join keeps consumers
# whose postcode has no match (their SA2_code is NaN).
df = pd.merge(consumer, processed_postcode, how='left', on='postcode')
# True if any cell in the merged frame is missing.
df.isna().to_numpy().any()

True

In [118]:
# Spot-check one consumer after the postcode merge.
df.loc[df['user_id'].eq(18666)]

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code
157529,1420365,18666,VIC,3989,Male,


In [74]:
# Display the cleaned SA2 income table (SA2_code, SA2 NAME, Mean_Total_Income).
income_df1

Unnamed: 0,SA2_code,SA2 NAME,Mean_Total_Income
0,101021007,Braidwood,51149.0
1,101021008,Karabar,66335.0
2,101021009,Queanbeyan,65874.0
3,101021010,Queanbeyan - East,69860.0
4,101021011,Queanbeyan Region,81919.0
...,...,...,...
2283,801101137,Molonglo,
2284,801101138,Molonglo - North,
2285,801101139,Wright,86007.0
2286,801111140,ACT - South West,73435.0


In [119]:
# Add each consumer's SA2-level mean total income, matched on SA2 code.
# Left join: consumers without a matching SA2 code keep NaN in the income columns.
df = pd.merge(df, income_df1, how='left', on='SA2_code')

In [120]:
# Same consumer again, now with the income columns attached.
df.loc[df['user_id'].eq(18666)]

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code,SA2 NAME,Mean_Total_Income
157529,1420365,18666,VIC,3989,Male,,,


### Missing values

#### Find the mean income per state to replace missing values in 'Mean_Total_Income'

In [106]:
# Start again from the raw sheet and skip its six leading preamble/header rows;
# the national and state summary rows are still present in this frame.
state_income = income_df.iloc[6:].reset_index(drop=True)
state_income

Unnamed: 0,Australian Bureau of Statistics,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,Australia,,13102895,13358252,13678024,14069082,14425037,42,42,42,...,47692,48360,49805,51389,61036,61975,62594,64246,65953,
1,New South Wales,,4091347,4191542,4344997,4466941,4569650,42,42,42,...,48085,48700,50153,51818,62798,64493,65196,67200,68816,
2,101021007,Braidwood,2133,2153,2262,2315,2361,50,50,50,...,39716,41288,42003,41593,47741,51074,51090,51594,51149,
3,101021008,Karabar,4866,4937,4988,5059,5100,43,42,42,...,55870,57880,59295,61777,59548,61093,62535,63790,66335,
4,101021009,Queanbeyan,6194,6419,6486,6595,6697,40,39,39,...,54999,55376,57848,60119,59310,60764,60958,62863,65874,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2295,801111140,ACT - South West,349,289,347,364,355,40,39,39,...,62592,55384,61096,64227,62505,72858,70503,67445,73435,
2296,801111141,Namadgi,np,np,21,19,18,np,np,34,...,np,49981,58498,40479,np,np,57309,56357,39278,
2297,,,,,,,,,,,...,,,,,,,,,,
2298,Totals may not align with the sum of their com...,,,,,,,,,,...,,,,,,,,,,


In [107]:
states = [
    'New South Wales',
    'Victoria',
    'Queensland',
    'South Australia',
    'Western Australia',
    'Tasmania',
    'Northern Territory',
    'Australian Capital Territory',
]
# Keep only the state/territory summary rows, then the label column (0)
# and the income column at position 26.
is_state_row = state_income.iloc[:, 0].isin(states)
state_income = state_income.loc[is_state_row].iloc[:, [0, 26]]

In [108]:
abbrv = ['NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']

# Pair every full state name with its abbreviation (both lists share the same order).
state_to_abbrv = dict(zip(states, abbrv))

# Build the {abbreviation: income} lookup directly from the two remaining columns
# (column 0 = state name, column 1 = income value) instead of assigning through
# `.iloc` on a filtered slice, which can raise SettingWithCopyWarning.
# NOTE: `state_income` is rebound from a DataFrame to a dict here, so the cell that
# creates the DataFrame must be re-run before this cell can be executed again.
state_income = dict(
    zip(state_income.iloc[:, 0].map(state_to_abbrv), state_income.iloc[:, 1])
)


In [109]:
# Sanity check: look up the NSW entry of the state-level income dict.
state_income['NSW']

68816

In [121]:
# Fallback for consumers without an SA2-level income: use their state's value and
# record the state abbreviation in 'SA2 NAME' so the substitution stays visible.
# The null mask can be taken once up front because each pass only fills rows
# belonging to its own state.
income_is_missing = df['Mean_Total_Income'].isna()
fallback_columns = ['SA2 NAME', 'Mean_Total_Income']
for state in abbrv:
    rows_to_fill = df['state'].eq(state) & income_is_missing
    df.loc[rows_to_fill, fallback_columns] = [state, state_income[state]]

In [122]:
# Spot-check one consumer after the state-level fallback has been applied.
df.loc[df['user_id'].eq(18662)]

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code,SA2 NAME,Mean_Total_Income
496379,1411657,18662,SA,5110,Female,402041039.0,SA,59875.0


In [91]:
# Column dtypes of the merged consumer frame.
df.dtypes

consumer_id            int32
user_id                int32
state                 object
postcode               int32
gender                object
SA2_code              object
SA2 NAME              object
Mean_Total_Income    float64
dtype: object

In [82]:
# Display the merged consumer frame (pandas truncates the rendering).
df

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code,SA2 NAME,Mean_Total_Income
0,28,458885,WA,6176,Male,507051185.0,Baldivis,70678.0
1,78,319257,SA,5410,Male,405011111.0,Light,55650.0
2,101,9180,SA,5554,Female,405041125.0,Moonta,46898.0
3,108,191536,SA,5052,Female,403031065.0,Belair,73814.0
4,133,234634,WA,6985,Male,506021122.0,Rivervale - Kewdale - Cloverdale,61942.0
...,...,...,...,...,...,...,...,...
499994,1499869,381121,QLD,4403,Female,307021179.0,Crows Nest - Rosalie,44615.0
499995,1499885,325002,TAS,7163,Male,603021069.0,Bruny Island - Kettering,53377.0
499996,1499910,414057,NSW,2090,Female,121041413.0,Cremorne - Cammeray,123563.0
499997,1499911,56561,WA,6105,Female,506021122.0,Rivervale - Kewdale - Cloverdale,61942.0


### Drop rows with missing SA2_code (currently disabled: the `dropna` call below is commented out)

In [83]:
#df = df.dropna()

In [84]:
#df

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code,SA2 NAME,Mean_Total_Income
0,28,458885,WA,6176,Male,507051185.0,Baldivis,70678.0
1,78,319257,SA,5410,Male,405011111.0,Light,55650.0
2,101,9180,SA,5554,Female,405041125.0,Moonta,46898.0
3,108,191536,SA,5052,Female,403031065.0,Belair,73814.0
4,133,234634,WA,6985,Male,506021122.0,Rivervale - Kewdale - Cloverdale,61942.0
...,...,...,...,...,...,...,...,...
499994,1499869,381121,QLD,4403,Female,307021179.0,Crows Nest - Rosalie,44615.0
499995,1499885,325002,TAS,7163,Male,603021069.0,Bruny Island - Kettering,53377.0
499996,1499910,414057,NSW,2090,Female,121041413.0,Cremorne - Cammeray,123563.0
499997,1499911,56561,WA,6105,Female,506021122.0,Rivervale - Kewdale - Cloverdale,61942.0


In [123]:
# Final spot-check of the same consumer before the frame is written out.
df.loc[df['user_id'].eq(18662)]

Unnamed: 0,consumer_id,user_id,state,postcode,gender,SA2_code,SA2 NAME,Mean_Total_Income
496379,1411657,18662,SA,5110,Female,402041039.0,SA,59875.0


In [124]:
# Persist the merged consumer/income frame; the row index is written as the
# first (unnamed) CSV column because `index` is left at its default.
curated_output_path = '../data/curated/clean_consumer_post_total_income.csv'
df.to_csv(curated_output_path)

## Model Features selection

Transaction dollar * take rate = BNPL revenue

Level a-e ?

Transaction Volume

Average transaction amount per merchant

Growth rate per merchant (Transaction Volume + amount)

Number of consumers (more -> Riskless?)

Brand loyalty


### Revenue prediction
y = next year revenue
x = past data

new merchants are not considered