In [3]:
# import required libries
import pandas as pd

# define required path
raw_data = '../data/raw/'
curated_data = '../data/curated/'

In [4]:
# read in datasets
crime = pd.read_csv(curated_data + 'crime_rate.csv')
suburb = pd.read_csv(raw_data + 'suburb.csv')
traffic = pd.read_csv(curated_data + 'traffic_dataset.csv')
income = pd.read_csv(curated_data + 'income.csv')

In [5]:
# discard duplicate index
del crime['Unnamed: 0']
del suburb['Unnamed: 0']
del traffic['Unnamed: 0']
del income['Unnamed: 0']

In [6]:
# drop total data
crime = crime.loc[crime["Local Government Area"] != 'Total']

In [7]:
# drop duplicate suburbs
suburb['Suburb'] = suburb['Suburb'].drop_duplicates()
suburb = suburb.dropna()

In [8]:
# select crime rate in 2021
crime = crime.loc[crime['Year'] == 2021]

In [9]:
# preprocessing the column to drop space
crime['Local Government Area'] = crime['Local Government Area'].str[1:]

In [10]:
# left join crime dataset to suburb dataset
crime2021 = suburb[['Suburb','LGA']].join(crime[['Local Government Area','crime_rate']].set_index('Local Government Area'), \
    on = 'LGA').reset_index(drop=True)

In [11]:
crime2021.to_csv(curated_data + 'crime2021.csv')

In [12]:
# extract suburb from stop name
traffic['Suburb'] = traffic['STOP_NAME'].str.extract(r"\(([A-Za-z ]+)\)", expand=False)

In [13]:
# calculate the number of station each suburb
traffic_count = traffic.groupby(['Suburb'], as_index=False)['STOP_ID'].count()

In [14]:
traffic_count.columns = ['Suburb','traffic_count']

In [15]:
# calculate the annual traffic
traffic_sum = traffic.groupby(['Suburb'], as_index=False)['Pax_annual'].sum()

In [16]:
traffic_sum.columns = ['Suburb','traffic_sum']

In [17]:
traffic_count.to_csv(curated_data + 'traffic_count.csv')
traffic_sum.to_csv(curated_data + 'traffic_sum.csv')

In [18]:
income = pd.read_csv(curated_data + 'income.csv')

In [19]:
# allocate data in sa2 to suburbs
suburb_income = suburb.join(income.set_index('SA2_code'), on = 'SA2_code').dropna().reset_index(drop=True)

In [20]:
print(suburb_income)

      Postcode                   Suburb        LGA     SA2_code  \
0         3000                MELBOURNE  Melbourne  206041122.0   
1         3002           EAST MELBOURNE      Yarra  206041119.0   
2         3004    ST KILDA ROAD CENTRAL      Yarra  206041125.0   
3         3004  ST KILDA ROAD MELBOURNE      Yarra  206041125.0   
4         3005       WORLD TRADE CENTRE  Melbourne  206041118.0   
...        ...                      ...        ...          ...   
3317      8007      COLLINS STREET WEST      Yarra  206041122.0   
3318      8009            FLINDERS LANE      Yarra  206041122.0   
3319      8010               LAW COURTS      Yarra  210021235.0   
3320      8011   LITTLE LONSDALE STREET      Yarra  206041122.0   
3321      9999               NORTH POLE      Yarra  206041122.0   

                SA2_name  Unnamed: 0  2014_15_median  2015_16_median  \
0              Melbourne       697.0         32697.0         31242.0   
1         East Melbourne       694.0         67966.

In [20]:
suburb_income.to_csv(curated_data + 'suburb_income.csv')