In [1]:
import numpy as np
import random
import pandas as pd

In [60]:
# load the data
pop = pd.read_csv('../../data/curated/vic_population_density_sa2.csv', index_col = 0)

pop = pop.drop(columns=["S/T name"])
pop = pop.drop(columns=["SA2 name"])
for y in range(2001, 2022):
    col_name = "population_density_of_"+str(y)
    pop[str(y)] = pop[col_name]
    pop = pop.drop(columns=[col_name])

pop = pop.melt(id_vars=["SA2 code"], 
        var_name="year", 
        value_name="population_rate")
pop.head()
# pop = pop.loc[pop['SA2 code']==201011001]
pop = pd.get_dummies(pop, columns=['SA2 code'])
pop
# pop.dtypes

Unnamed: 0,year,population_rate,SA2 code_201011001,SA2 code_201011002,SA2 code_201011005,SA2 code_201011006,SA2 code_201011007,SA2 code_201011008,SA2 code_201011481,SA2 code_201011482,...,SA2 code_217031471,SA2 code_217031472,SA2 code_217031473,SA2 code_217031474,SA2 code_217031475,SA2 code_217031476,SA2 code_217041477,SA2 code_217041478,SA2 code_217041479,SA2 code_217041480
0,2001,109.222011,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2001,927.177419,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2001,103.100775,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2001,121.461988,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2001,31.680993,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10957,2021,2.641610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10958,2021,2.133525,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10959,2021,4.889096,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10960,2021,359.967949,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# define features and outcome
features = list(pop.columns)
features.remove('population_rate')
outcome = ['population_rate']

# partition data into training and test sets
X = pop[features]
y = pop[outcome]
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)# , random_state = 42)

# train model
reg = LinearRegression()
reg.fit(x_train, y_train)

# bootstrap predictions
accuracy = []
n_iterations = 1000
for i in range(n_iterations):
    x_test, y_test = resample(x_train, y_train, replace=True)
    # evaluate model
    r2_score = reg.score(x_test, y_test)
    accuracy.append(r2_score)

In [31]:
import seaborn as sns
# plot distribution of accuracy
sns.kdeplot(accuracy)
plt.title("Accuracy across 1000 bootstrap samples of the held-out test set")
plt.xlabel("Accuracy")
plt.show()

In [None]:
# get median
median = np.percentile(accuracy, 50)

# get 95% interval
alpha = 100-95
lower_ci = np.percentile(accuracy, alpha/2)
upper_ci = np.percentile(accuracy, 100-alpha/2)

print(f"Model accuracy is reported on the test set. 1000 bootstrapped samples " 
      f"were used to calculate 95% confidence intervals.\n"
      f"Median accuracy is {median:.2f} with a 95% a confidence "
      f"interval of [{lower_ci:.2f},{upper_ci:.2f}].")

sns.kdeplot(accuracy)
plt.title("Accuracy across 1000 bootstrap samples of the held-out test set\n"
          "showing median with 95\\% confidence intervals")
plt.xlabel("Accuracy")
plt.axvline(median,0, 14, linestyle="--", color="red")
plt.axvline(lower_ci,0, 14, linestyle="--", color="red")
plt.axvline(upper_ci,0, 14, linestyle="--", color="red")
plt.show()

## predict population per suburb from 2022-2027

In [74]:
raw = pd.read_csv('../../data/curated/vic_population_density_sa2.csv', index_col = 0)
sa2 = raw['SA2 code']

In [75]:
YEAR = [2022, 2023, 2024, 2025, 2026, 2027] # as we are predicting population for 2022-2027

In [76]:
all_year = []
for year in YEAR:
    for i in range(len(sa2)):
        all_year.append(year)
all_sa2 = list(sa2)*6


In [77]:
data = {
    'year':all_year,
    'sa2': all_sa2,
    'sa2_code': all_sa2
}
df = pd.DataFrame(data)
df
df2 = pd.get_dummies(df, columns=['sa2'])



In [78]:
features = list(df2.columns)
features.remove('sa2_code')
outcome = ['population_rate']
features

['year',
 'sa2_201011001',
 'sa2_201011002',
 'sa2_201011005',
 'sa2_201011006',
 'sa2_201011007',
 'sa2_201011008',
 'sa2_201011481',
 'sa2_201011482',
 'sa2_201011483',
 'sa2_201011484',
 'sa2_201021009',
 'sa2_201021010',
 'sa2_201021011',
 'sa2_201021012',
 'sa2_201031013',
 'sa2_201031014',
 'sa2_201031015',
 'sa2_201031016',
 'sa2_201031017',
 'sa2_202011018',
 'sa2_202011019',
 'sa2_202011020',
 'sa2_202011021',
 'sa2_202011022',
 'sa2_202011023',
 'sa2_202011024',
 'sa2_202011025',
 'sa2_202021026',
 'sa2_202021027',
 'sa2_202021028',
 'sa2_202021029',
 'sa2_202021030',
 'sa2_202021031',
 'sa2_202031032',
 'sa2_202031033',
 'sa2_203011034',
 'sa2_203011035',
 'sa2_203011036',
 'sa2_203021037',
 'sa2_203021039',
 'sa2_203021040',
 'sa2_203021042',
 'sa2_203021043',
 'sa2_203021044',
 'sa2_203021045',
 'sa2_203021046',
 'sa2_203021047',
 'sa2_203021485',
 'sa2_203021486',
 'sa2_203021487',
 'sa2_203021488',
 'sa2_203031048',
 'sa2_203031049',
 'sa2_203031051',
 'sa2_203031052',
 

In [86]:

# partition data into training and test sets
X_test = df2[features]
y_pred = reg.predict(X_test)
df['pred'] = y_pred


Feature names unseen at fit time:
- sa2_201011001
- sa2_201011002
- sa2_201011005
- sa2_201011006
- sa2_201011007
- ...
Feature names seen at fit time, yet now missing:
- SA2 code_201011001
- SA2 code_201011002
- SA2 code_201011005
- SA2 code_201011006
- SA2 code_201011007
- ...



Unnamed: 0,year,sa2,sa2_code,pred
0,2022,201011001,201011001,575.422202
1,2022,201011002,201011002,1380.387025
2,2022,201011005,201011005,535.584080
3,2022,201011006,201011006,605.456749
4,2022,201011007,201011007,431.968429
...,...,...,...,...
3127,2027,217031476,217031476,639.454604
3128,2027,217041477,217041477,545.693218
3129,2027,217041478,217041478,562.253630
3130,2027,217041479,217041479,889.385469


In [88]:
df.columns
result = df.drop(columns = 'sa2_code')

In [89]:
import os
path = '../../data/curated/feature_prediction/'
os.makedirs(path)

os.chdir
result.to_csv('../../data/curated/feature_prediction/22_27population.csv')

In [96]:
result.columns

Index(['year', 'sa2', 'pred'], dtype='object')

In [102]:
result

Unnamed: 0,year,sa2,pred
0,2022,201011001,575.422202
1,2022,201011002,1380.387025
2,2022,201011005,535.584080
3,2022,201011006,605.456749
4,2022,201011007,431.968429
...,...,...,...
3127,2027,217031476,639.454604
3128,2027,217041477,545.693218
3129,2027,217041478,562.253630
3130,2027,217041479,889.385469


In [105]:
#int_features = [float(x) for x in request.form.values()] #Convert string inputs to float.
#features = [np.array(int_features)]  #Convert to the form [[a, b]] for input to the model
#prediction = model.predict(features)  # features Must be in the form [[a, b]]
result[(result.year == 2023)&(result.sa2 == 201011001)].pred

522    610.429168
Name: pred, dtype: float64

In [92]:
import pickle
pickle.dump(reg, open('population_model.pkl','wb'))

model = pickle.load(open('population_model.pkl','rb'))

In [None]:
reg(2022)

In [91]:
sa2

651     201011001
652     201011002
653     201011005
654     201011006
655     201011007
          ...    
1168    217031476
1169    217041477
1170    217041478
1171    217041479
1172    217041480
Name: SA2 code, Length: 522, dtype: int64