In [1]:
# Dependencies
import json
import requests
import pprint
from scipy.stats import linregress
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load every census extract into its own pandas DataFrame, grouped by census year.
Education_2016_df, Income_2016_df, VehicleCount_2016_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Resources/2016_Education_Postcode.csv",
        "Resources/2016_Income_Postcode.csv",
        "Resources/2016_VehicleCount_Postcode.csv",
    )
)
Education_2021_df, Income_2021_df, VehicleCount_2021_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Resources/2021_Education_Postcode.csv",
        "Resources/2021_Income_Postcode.csv",
        "Resources/2021_VehicleCount_Postcode.csv",
    )
)

# TODO: adjust this variable name once the year of the fuel-type extract is confirmed.
# A 2016 extract would be read from "Resources/2016_FuelType_Postcode.csv".
FuelType_2021_df = pd.read_csv("Resources/FuelType_Postcode.csv")

In [3]:
def create_scatter(df, y_variable, x_variable):
    """Scatter-plot two columns of ``df`` and overlay their least-squares line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    y_variable : str
        Name of the column plotted on the y-axis; also used as the y-axis label.
    x_variable : str
        Name of the column plotted on the x-axis; also used as the x-axis label.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Raises
    ------
    KeyError
        If either column name is not present in ``df``.
    """
    x_values = df[x_variable]
    y_values = df[y_variable]

    # Fit y = slope * x + intercept; only slope, intercept and r are used below.
    slope, intercept, rvalue, _pvalue, _stderr = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

    plt.scatter(
        x_values,
        y_values,
        color='blue',
        marker='o',
        s=50,
        edgecolors='black')

    plt.plot(x_values, regress_values, "r-", linewidth=1)

    # Label the x-axis with the column actually plotted instead of a fixed name.
    plt.xlabel(x_variable)
    plt.ylabel(y_variable)
    plt.suptitle(f"The r-value is: {rvalue}", fontsize=8, x=0.2, y=0.95)
    # Axes-fraction coordinates place the equation just above the top-right of
    # the axes, independent of the data range.
    plt.annotate(line_eq, xy=(0.8, 1.05), fontsize=8, color="red",
                 xycoords='axes fraction', textcoords='axes fraction')

    plt.show()


In [4]:
def reshape_df(df):
    """Pivot a long fuel-type table into one column per ``Fuel_type`` value.

    Rows of the result are the distinct (``Vehicle_type``, ``Postcode``) pairs,
    returned as ordinary columns; cells hold the aggregated ``Count`` values
    (``pivot_table`` uses its default aggregation).
    """
    wide = pd.pivot_table(
        df,
        values='Count',
        index=['Vehicle_type', 'Postcode'],
        columns='Fuel_type',
    )
    # Turn the index levels back into regular columns.
    flat = wide.reset_index()
    # Drop the leftover "Fuel_type" label from the column axis.
    return flat.rename_axis(None, axis=1)

In [5]:
def clean_headers(df):
    """Replace spaces in the column names of ``df`` with underscores.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.
    """
    underscored = df.columns.str.replace(" ", "_")
    df.columns = underscored
    return df

Clean Data

In [6]:
# Normalise the column headers of every loaded frame (spaces -> underscores).
(
    Education_2016_df,
    Income_2016_df,
    VehicleCount_2016_df,
    Education_2021_df,
    Income_2021_df,
    VehicleCount_2021_df,
    FuelType_2021_df,
) = map(
    clean_headers,
    (
        Education_2016_df,
        Income_2016_df,
        VehicleCount_2016_df,
        Education_2021_df,
        Income_2021_df,
        VehicleCount_2021_df,
        FuelType_2021_df,
    ),
)

In [7]:
# Pivot the fuel-type table so that each fuel type becomes its own column.
FuelType_2021_df = FuelType_2021_df.pipe(reshape_df)
# TODO: apply reshape_df to a FuelType_2016_df as well once that frame exists.


In [8]:
# Keep only rows where neither the vehicle type nor the postcode is the
# aggregate "Total" entry.
not_total = (
    FuelType_2021_df["Vehicle_type"].ne("Total")
    & FuelType_2021_df["Postcode"].ne("Total")
)
FuelType_2021_df = FuelType_2021_df[not_total]


Merge the DataFrames so that education, income, vehicle count, and fuel type are in one DataFrame keyed on postcode. Handle postcodes that aren't present in all datasets.

In [9]:
# Combine the 2016 education, income and vehicle-count tables into one frame.
# Inner joins on the single "Postcode" key keep only postcodes that appear in
# every table.
data_education_income_2016 = pd.merge(
    Education_2016_df, Income_2016_df, how="inner", on="Postcode"
)
# Column names shared by both inputs of a merge receive pandas' default
# _x / _y suffixes; the listed duplicate State/Total columns are dropped.
data_complete_2016 = pd.merge(
    data_education_income_2016, VehicleCount_2016_df, how="inner", on="Postcode"
).drop(columns=["State", "Total_y", "Total"])

data_complete_2016.head()

Unnamed: 0,Postcode,State_x,Postgraduate_Degree_Level,Graduate_Diploma_and_Graduate_Certificate_Level,Bachelor_Degree_Level,Advanced_Diploma_and_Diploma_Level,Certificate_III_&_IV_Level,Secondary_Education_-_Years_10_and_above,Certificate_I_&_II_Level,Secondary_Education_-_Years_9_and_below,...,"$3,000_or_more_($156,000_or_more)",Not_stated_y,Not_applicable_y,No_motor_vehicles,One_motor_vehicle,Two_motor_vehicles,Three_motor_vehicles,Four_or_more_motor_vehicles,Not_stated,Not_applicable
0,2000,NSW,2836,307,7278,2852,931,6241,9,443,...,1796,4115,1251,5196,3229,672,63,41,2163,2473
1,2006,NSW,34,4,114,12,10,912,0,5,...,11,156,15,0,4,0,0,0,3,6
2,2007,NSW,903,95,2537,622,275,2496,0,200,...,196,1072,375,1656,974,144,16,20,607,247
3,2008,NSW,1387,174,3549,727,389,3269,0,142,...,359,1359,407,2738,1402,298,34,21,704,428
4,2009,NSW,1583,281,3653,1099,659,2387,3,244,...,1271,1254,1187,1617,2793,792,83,26,629,527


In [10]:
# Cast the 2021 education postcodes to int before joining
# (presumably so the key dtype matches the other 2021 tables -- TODO confirm).
Education_2021_df['Postcode'] = Education_2021_df['Postcode'].astype(int)

# Combine the 2021 education, income and vehicle-count tables into one frame.
# Inner joins on the single "Postcode" key keep only postcodes that appear in
# every table.
data_education_income_2021 = pd.merge(
    Education_2021_df, Income_2021_df, how="inner", on="Postcode"
)
# Column names shared by both inputs of a merge receive pandas' default
# _x / _y suffixes; the listed duplicate State/Total columns are dropped.
data_complete_2021 = pd.merge(
    data_education_income_2021, VehicleCount_2021_df, how="inner", on="Postcode"
).drop(columns=["State", "Total_y", "Total"])

Merge the DataFrames so that education, income, vehicle count, and fuel type are in one DataFrame keyed on postcode. Handle postcodes that aren't present in all datasets.

In [11]:
# Display the merged 2021 frame; the rendered output elides the middle rows and columns.
data_complete_2021

Unnamed: 0,Postcode,State_x,Postgraduate_Degree_Level,Graduate_Diploma_and_Graduate_Certificate_Level,Bachelor_Degree_Level,Advanced_Diploma_and_Diploma_Level,Certificate_III_&_IV_Level,Secondary_Education_-_Years_10_and_above,Certificate_I_&_II_Level,Secondary_Education_-_Years_9_and_below,...,"$3,500_or_more_($182,000_or_more)",Not_stated_y,Not_applicable_y,No_motor_vehicles,One_motor_vehicle,Two_motor_vehicles,Three_motor_vehicles,Four_or_more_motor_vehicles,Not_stated,Not_applicable
0,2000,NSW,3813,489,8530,3814,1137,4761,16,397,...,2059,2402,1418,6503,4125,742,89,38,1183,3948
1,2007,NSW,1134,134,2165,722,347,1426,0,163,...,220,565,485,1333,1182,222,23,6,362,617
2,2008,NSW,1731,228,3450,752,404,2265,0,97,...,439,845,409,2489,1650,317,59,23,426,995
3,2009,NSW,1872,346,3673,1162,712,2007,12,223,...,1243,931,1298,1612,2974,848,97,26,459,933
4,2010,NSW,4003,796,9068,2185,1402,4145,6,454,...,3049,2279,1434,6241,5654,1064,140,58,977,2548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2636,4377,QLD/NSW,9,3,31,35,120,184,0,52,...,4,64,114,3,66,84,50,18,30,37
2637,4380,QLD/NSW,155,90,680,509,1292,2293,10,754,...,70,677,1292,194,1197,1116,435,269,289,689
2638,4383,QLD/NSW,3,3,15,30,112,179,0,88,...,6,57,78,13,104,101,38,10,17,67
2639,4385,QLD/NSW,6,21,61,67,142,350,0,168,...,12,101,195,30,196,171,67,41,49,101


Once the data is merged into a 2016 and a 2021 census DataFrame, begin the comparison.

Bar charts for the DataFrames, comparing the different years together. It is probably also worth comparing the changes in income and education level.

Scatter plots showing income by fuel type? Maybe three plots, so that we can see the relationships between income and diesel, income and EV, etc.