In [None]:
import json

import geoviews as gv
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as st

from api_keys import apify_key


In [2]:
%%capture --no-display
# Execute the Burlington notebook so the variables it creates can be used in this notebook.
# NOTE(review): presumably supplies burlington_final_df and burlington_amenities_df used below - confirm.
%run Burlington_Data_2021.ipynb

In [3]:
%%capture --no-display
# Execute the Oshawa notebook so the variables it creates can be used in this notebook.
# NOTE(review): presumably supplies Oshawa_df and Oshawa_amenities_df used below - confirm.
%run Oshawa.ipynb

In [None]:
%%capture --no-display
# Execute the Housing notebook so the variables it creates can be used in this notebook.
# NOTE(review): no other %run target is named after Oakville, so this is presumably where
# Oakville_df and Oakville_amenities_df come from - confirm.
%run Housing.ipynb

In [None]:
%%capture --no-display
# Execute the Vaughan notebook so the variables it creates can be used in this notebook.
# NOTE(review): presumably supplies vaughan_df and vaughan_amenities_df used below - confirm.
%run Vaughan.ipynb

In [None]:
%%capture --no-display
# Execute the Milton notebook so the variables it creates can be used in this notebook.
# The file name contains spaces, hence the quotes.
# NOTE(review): presumably supplies cleaned_milton_df and milton_leisure_df used below - confirm.
%run "Milton Analysis Project 1 Bootcamp.ipynb"

In [None]:
# Stack the per-city housing tables into one frame and convert every column to float.
all_cities_df = (
    pd.concat([burlington_final_df, Oakville_df, vaughan_df, cleaned_milton_df, Oshawa_df])
    # Drop the last column by position (relies on the concatenated column order).
    .iloc[:, :-1]
    # Strip "$", "," and "%" from string cells. The dollar pattern is a raw string so that
    # "\$" reaches the regex engine as an escape instead of being an invalid Python string escape.
    .replace({r'\$': '', ',': '', '%': ''}, regex=True)
    .astype(float)
)
all_cities_df

In [None]:
# Combine the per-city amenity tables into one integer-typed frame.
amenity_frames = [
    burlington_amenities_df,
    Oakville_amenities_df,
    vaughan_amenities_df,
    milton_leisure_df,
    Oshawa_amenities_df,
]
all_amenities_df = pd.concat(amenity_frames).astype(int)
all_amenities_df

# Comparison between Estimated House Value vs Average Sale Price (both years on one bar graph)

In [None]:
import matplotlib.pyplot as plt

# Turn the 'Year' index level into an ordinary integer column; the rest of the index is kept.
all_cities_reset_df = all_cities_df.reset_index(level='Year').astype({'Year': int})

# Keep only the year and the estimated house value.
est_house_value_df = all_cities_reset_df.loc[:, ['Year', 'Estimated House Value ($)']]

# One column per year, indexed by the remaining index of the frame.
estimated_house = est_house_value_df.pivot(columns='Year', values='Estimated House Value ($)')

# Grouped bar chart: one bar per year for every index entry.
house_value_ax = estimated_house.plot(kind='bar', color=['red', 'blue'])
house_value_ax.set_title('Estimated House Value for all Cities in 2016 and 2021')
house_value_ax.set_ylabel('Estimated House Value in millions ($)')
plt.xticks(rotation=45, ha='right')
house_value_ax.legend(title='Year')
plt.tight_layout()
plt.show()

In [None]:
# Average sale price, reshaped to one column per year.
avg_sale_price_df = all_cities_reset_df.loc[:, ['Year', 'Average Sale Price ($)']]
average_sale = avg_sale_price_df.pivot(columns='Year', values='Average Sale Price ($)')

# Grouped bar chart: one bar per year for every index entry.
sale_price_ax = average_sale.plot(kind='bar', color=['tab:red', 'tab:blue'])
sale_price_ax.set_title('Average Sale Price for all Cities in 2016 and 2021')
sale_price_ax.set_ylabel('Average Sale Price ($)')
plt.xticks(rotation=45, ha='right')
sale_price_ax.legend(title='Year')
plt.tight_layout()
plt.show()

In [None]:
# Two price measures side by side for every row of all_cities_df.
price_columns = ['Estimated House Value ($)', 'Average Sale Price ($)']
estimated_avg_sale_df = all_cities_df[price_columns]
estimated_avg_sale_df

In [None]:
# Grouped bars comparing the estimated house value with the average sale price for every row.
graph = estimated_avg_sale_df.plot(
    kind='bar',
    title='Estimated House Value ($). vs Avg Sale Price ($)',
    color=["yellow", "green"],
)
graph.set_xlabel("City, Year")
graph.set_ylabel("Price in millions ($)")
# tight_layout() has to run before plt.show(); called afterwards it no longer
# affects the figure that was displayed.
plt.tight_layout()
plt.show()

In [None]:
# Number of new builds, reshaped to one column per year.
new_built_df = all_cities_reset_df.loc[:, ['Year', 'New Built']]
new_builds = new_built_df.pivot(columns='Year', values='New Built')

# Grouped bar chart: one bar per year for every index entry.
new_builds_ax = new_builds.plot(kind='bar', color=['tab:orange', 'tab:purple'])
new_builds_ax.set_title('New Built for all Cities in 2016 and 2021')
new_builds_ax.set_ylabel('No. of New Builts')
plt.xticks(rotation=45, ha='right')
new_builds_ax.legend(title='Year')
plt.tight_layout()
plt.show()

# Unemployment Rate

In [None]:
import matplotlib.pyplot as plt

# Turn the 'Year' index level into an ordinary integer column; the rest of the index is kept.
all_cities_reset_df = all_cities_df.reset_index(level='Year').astype({'Year': int})

# Keep only the year and the unemployment rate.
unemployment_df = all_cities_reset_df.loc[:, ['Year', 'Unemployment Rate (%)']]

# One column per year, indexed by the remaining index of the frame.
unemployment_pivot = unemployment_df.pivot(columns='Year', values='Unemployment Rate (%)')

# Side-by-side (not stacked) bars, one per year.
unemployment_ax = unemployment_pivot.plot(kind='bar', stacked=False, color=['#3f899e', '#858437'])
unemployment_ax.set_title('Unemployment Rate Between Cities in 2016 and 2021')
unemployment_ax.set_ylabel('Unemployment Rate (%)')
unemployment_ax.set_xlabel('')
plt.xticks(rotation=45, ha='right')  # slanted tick labels are easier to read
# Anchor the legend to the right of the axes (x = 1.25 in axes coordinates).
unemployment_ax.legend(title='Year', bbox_to_anchor=(1.25, 1))
plt.show()

# Total number of leisure places per city

In [None]:
import matplotlib.pyplot as plt

# Move the 'City' index level into an ordinary column so it can label the bars.
all_amenities_reset_df = all_amenities_df.reset_index(level='City')

# Keep only the city name and its leisure-space count.
leisure_df = all_amenities_reset_df[['City', '# of Leisure Spaces']]

# Data for the bar graph.
cities = leisure_df['City']
leisure_spaces = leisure_df['# of Leisure Spaces']

# One bar per city.
bars = plt.bar(cities, leisure_spaces, color='#3f899e')
plt.ylabel('# of Leisure Spaces')
plt.title('Number of Leisure Spaces in Each City')
plt.xticks(rotation=45, ha='right')  # slanted tick labels are easier to read

# Write each bar's value just above it, horizontally centred on the bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 2), ha='center', va='bottom')

plt.tight_layout()

plt.show()

In [None]:
years = ['2016', '2021']
cities = ['Burlington', 'Oakville', 'Vaughan', 'Milton', 'Oshawa']
bar_width = 0.35

# The 'Year' index labels are not uniform across the source tables (ints, strings and
# strings with a leading space all occur), so cast them to int once instead of
# hard-coding each city's exact label in a .loc lookup.
population_by_year = (
    all_cities_df['Population']
    .reset_index(level='Year')
    .astype({'Year': int})
    .pivot(columns='Year', values='Population')
    .reindex(cities)  # same city order as the tick labels below
)

y_value_1 = population_by_year[2016].tolist()
y_value_2 = population_by_year[2021].tolist()

# Integer bar positions; each year's bars are shifted half a bar width left/right
# so the two years sit side by side for every city.
x_axis = np.arange(len(cities))

plt.bar(x_axis - bar_width/2, y_value_1, color="b", width=bar_width, align="center", label='2016')
plt.bar(x_axis + bar_width/2, y_value_2, color="r", width=bar_width, align="center", label='2021', alpha=0.7)

plt.xticks(x_axis, cities, rotation="vertical")
plt.xlabel('Cities')
plt.ylabel('Population')
plt.title('Population Comparison (2016 and 2021) for Selected Cities')
plt.legend()
plt.show()

In [None]:
years = ['2016', '2021']
cities = ['Burlington', 'Oakville', 'Vaughan', 'Milton', 'Oshawa']

# The 'Year' index labels are not uniform across the source tables (ints, strings and
# strings with a leading space all occur), so cast them to int once instead of
# hard-coding each city's exact label in a .loc lookup.
yearly_df = all_cities_df.reset_index(level='Year').astype({'Year': int})
income_by_year = yearly_df.pivot(columns='Year', values='Household Income ($)')[[2016, 2021]]
rent_by_year = yearly_df.pivot(columns='Year', values='Average Rent ($)')[[2016, 2021]]

# Each name below holds a (2016 value, 2021 value) pair despite the "_2016" suffix;
# the names are kept because the following cell plots them again.
B_income_2016 = tuple(income_by_year.loc['Burlington'])
Oa_income_2016 = tuple(income_by_year.loc['Oakville'])
V_income_2016 = tuple(income_by_year.loc['Vaughan'])
M_income_2016 = tuple(income_by_year.loc['Milton'])
Os_income_2016 = tuple(income_by_year.loc['Oshawa'])

B_rent_2016 = tuple(rent_by_year.loc['Burlington'])
Oa_rent_2016 = tuple(rent_by_year.loc['Oakville'])
V_rent_2016 = tuple(rent_by_year.loc['Vaughan'])
M_rent_2016 = tuple(rent_by_year.loc['Milton'])
Os_rent_2016 = tuple(rent_by_year.loc['Oshawa'])
x_axis = years

# (values, legend label) for every line, income first and then rent.
line_specs = [
    (B_income_2016, 'Burlington Income'),
    (Oa_income_2016, 'Oakville Income '),
    (V_income_2016, 'Vaughan Income '),
    (M_income_2016, 'Milton Income '),
    (Os_income_2016, 'Oshawa Income '),
    (B_rent_2016, 'Burlington Rent '),
    (Oa_rent_2016, 'Oakville Rent '),
    (V_rent_2016, 'Vaughan Rent '),
    (M_rent_2016, 'Milton Rent '),
    (Os_rent_2016, 'Oshawa Rent '),
]
for values, label in line_specs:
    plt.plot(x_axis, values, label=label, marker='o')

# The x-axis carries the two years, not the cities.
plt.xlabel('Years')
plt.ylabel('Amount in $')
plt.title('Household Income and Average Rent Comparison (2016 and 2021) for Selected Cities')

plt.legend()

plt.show()

In [None]:
x_axis = years

# Two stacked panels of equal height sharing the x-axis: income on top, rent below.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, gridspec_kw={'height_ratios': [1, 1]})

# Top panel: one income line per city.
income_lines = [
    (B_income_2016, 'Burlington Income'),
    (Oa_income_2016, 'Oakville Income '),
    (V_income_2016, 'Vaughan Income '),
    (M_income_2016, 'Milton Income '),
    (Os_income_2016, 'Oshawa Income '),
]
for income_values, income_label in income_lines:
    ax1.plot(x_axis, income_values, label=income_label, marker='o')

ax1.set_ylabel('Income in $')
# Fixed y-ranges for the income and rent panels.
ax1.set_ylim(70000, 150000)
ax2.set_ylim(1000, 3500)
# Hide the top panel's bottom border and move its x ticks/label to the top edge.
ax1.spines['bottom'].set_visible(False)
ax1.xaxis.tick_top()
ax1.xaxis.set_label_position('top')

# Bottom panel: one rent line per city.
rent_lines = [
    (B_rent_2016, 'Burlington Rent '),
    (Oa_rent_2016, 'Oakville Rent '),
    (V_rent_2016, 'Vaughan Rent '),
    (M_rent_2016, 'Milton Rent '),
    (Os_rent_2016, 'Oshawa Rent '),
]
for rent_values, rent_label in rent_lines:
    ax2.plot(x_axis, rent_values, label=rent_label, marker='o')

ax2.set_xlabel('Years')
ax2.set_ylabel('Rent in $')
ax1.set_title('Household Income and Average Rent Comparison (2016 and 2021) for Selected Cities')

for panel in (ax1, ax2):
    panel.legend(loc='best')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

**<font size =5> School Graph </font>**

In [None]:
# Grouped bar chart of the three school-type counts for every row of all_amenities_df.
school_columns = ["# of Public Schools", "# of Catholic Schools", "# of Private Schools"]
all_amenities_df[school_columns].plot(
    kind='bar',
    title='Number of schools by type per cities',
    ylabel='Total Number',
    rot=45,
)

**<font size = 5> Owned vs Rented Houses </font>**

In [None]:
# Owned and rented house counts, taken from all_cities_df.
tenure_columns = ['Owned Houses', "Rented Houses"]
rented_vs_owned = all_cities_df[tenure_columns]
rented_vs_owned

In [None]:
#Version 1
%matplotlib inline
rented_vs_owned.plot(kind = 'bar', title = "Owned Houses vs Rented Houses in 2016 and 2021 per city", rot =45,
                    ylabel = "Number of Houses")

In [None]:
# Rebuild rented_vs_owned with the population column included; this replaces the earlier two-column frame.
rented_vs_owned = all_cities_df.loc[:, ["Population", "Owned Houses", "Rented Houses"]]


In [None]:
# Scatter owned and rented house counts against population and fit a line to each.
x_axis = rented_vs_owned["Population"]
y_axis = rented_vs_owned["Owned Houses"]
y_axis2 = rented_vs_owned["Rented Houses"]

# Scatter plot of both series on one set of axes.
fig, ax = plt.subplots(figsize=(7, 5.5))
ax.scatter(x_axis, y_axis, color='brown', label='Owned Houses')
ax.scatter(x_axis, y_axis2, color='orange', label='Rented Houses')
ax.set_xlabel("Population")
ax.set_ylabel("Number of Houses")
ax.set_title('Owned vs Rented Houses increase by population')
ax.legend()

# Least-squares fits of each house count against population.
output_owned_houses = st.linregress(x_axis, y_axis)
output_rented_houses = st.linregress(x_axis, y_axis2)
slope1, intercept1 = output_owned_houses.slope, output_owned_houses.intercept
slope2, intercept2 = output_rented_houses.slope, output_rented_houses.intercept
regressline1 = slope1 * x_axis + intercept1
regressline2 = slope2 * x_axis + intercept2
# "+.2f" always prints the intercept's sign, so a negative intercept reads "x -12.00"
# rather than "x+-12.0".
line_eq1 = f'y = {slope1:.2f}x {intercept1:+.2f}'
line_eq2 = f'y = {slope2:.2f}x {intercept2:+.2f}'

# Draw the fitted lines and label them at hand-picked data coordinates.
ax.plot(x_axis, regressline1, color='pink')
ax.plot(x_axis, regressline2, color='yellow')
ax.annotate(line_eq1, (155000, 70000), color='brown')
ax.annotate(line_eq2, (250000, 20000), color='orange')

# Correlation coefficients: the off-diagonal entry of each 2x2 correlation matrix.
corr1 = np.corrcoef(x_axis, y_axis)
corr2 = np.corrcoef(x_axis, y_axis2)
print(f'The correlation between the increase of population and owned houses is {corr1[1][0]}')
print(f'The correlation between the increase of population and rented houses is {corr2[1][0]}')

plt.show()


In [None]:
#Pie chart
# Build the flat frame straight from all_cities_df instead of re-assigning
# rented_vs_owned from itself, so re-running this cell gives the same result
# rather than piling up extra index columns.
rented_vs_owned = (
    all_cities_df[["Population", "Owned Houses", "Rented Houses"]]
    .reset_index()            # index levels (including 'City' and 'Year') become columns
    .astype({"Year": int})    # year labels are compared with ints below
)
df_2016 = rented_vs_owned.loc[rented_vs_owned['Year'] == 2016]

In [None]:
# Rows of rented_vs_owned whose 'Year' equals 2021.
is_2021 = rented_vs_owned['Year'] == 2021
df_2021 = rented_vs_owned.loc[is_2021]

In [None]:
# One figure per year, each with an 'Owned Houses' pie and a 'Rented Houses' pie
# showing every city's share.
colors = ['yellow', "lightblue", "brown", "pink", "green"]

for year_label, year_df in (('2016', df_2016), ('2021', df_2021)):
    # Labels come from the same frame as the values so they cannot drift apart.
    names = year_df["City"]
    # Same small offset for every wedge, sized to the number of rows being plotted.
    explosion = [0.05] * len(year_df)

    fig, (owned_ax, rented_ax) = plt.subplots(1, 2)
    for pie_ax, column in ((owned_ax, 'Owned Houses'), (rented_ax, 'Rented Houses')):
        pie_ax.pie(year_df[column], labels=names, colors=colors, autopct='%1.1f%%', explode=explosion)
        # Title both figures' panels so each pie states which measure it shows.
        pie_ax.set_title(column)
    fig.supylabel(year_label)
    fig.tight_layout()

plt.show()

In [None]:
# Geocode each city with the Geoapify search endpoint, attach its population and
# plot the cities on an OSM tile map with marker size scaled by population.
map_cities = ['Burlington', 'Oakville', 'Oshawa', 'Vaughan', 'Milton']
base_url = "https://api.geoapify.com/v1/geocode/search"

geocoded_rows = []
for city_name in map_cities:
    params = {
        "apiKey": apify_key,
        "format": "json",
        "text": f'{city_name} , Ontario, Canada',
    }
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors instead of failing later on a missing key.
    http_response = requests.get(base_url, params=params, timeout=10)
    http_response.raise_for_status()
    response = http_response.json()

    results = response.get('results', [])
    if not results:
        raise ValueError(f'Geoapify returned no geocoding result for {city_name!r}')

    # Use the first match's coordinates.
    geocoded_rows.append({'City': city_name, 'Lat': results[0]['lat'], 'Lon': results[0]['lon']})

# Building the frame from records keeps Lat/Lon numeric rather than object columns
# seeded with empty strings.
map_cities_df = pd.DataFrame(geocoded_rows).set_index('City')

# Left-join the population figures onto the coordinates by index.
# NOTE(review): all_cities_df appears to be indexed by (City, Year), so each city likely
# appears once per year and its markers overlap on the map - confirm this is intended.
merged_map_df = (
    map_cities_df
    .merge(all_cities_df['Population'], left_index=True, right_index=True, how='left')
    .reset_index()
)

merged_map_df['City'] = merged_map_df['City'].astype('category')
min_pop = merged_map_df['Population'].min()
max_pop = merged_map_df['Population'].max()

# Min-max scale population into the 300-900 range used for marker size.
merged_map_df['Scaled_Population'] = (
    ((merged_map_df['Population'] - min_pop) / (max_pop - min_pop)) * (900 - 300) + 300
)

final_map = merged_map_df.hvplot.points(
    'Lon', 'Lat', geo=True, tiles="OSM",
    size='Scaled_Population',
    color='City',
    alpha=0.7,
    hover_cols=['City', 'Population'],
    legend=False)
final_map
