# Exploratory Data Analysis_Group2

## Import Libraries

In [None]:
import  pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from IPython.display import display
import json
import os


## Read data

In [None]:
# Curated SA2-level table; later cells read its 'SA2' and 'LGA' columns and
# every column whose name contains 'median' or 'count'.
data_sa2_all = pd.read_csv('../../data/curated/final_merged_data_sa2.csv')

In [None]:
# SA2 boundary shapefile; filtered to Victoria further down via STE_NAME21.
shp_sa2 = gpd.read_file('../../data/landing/region_data/sa2_dataset/sa2_unzip/SA2_2021_AUST_GDA2020.shp')

In [None]:
# LGA boundary shapefile; joined to the rent tables further down via LGA_NAME22.
shp_LGA = gpd.read_file('../../data/landing/region_data/LGA_dataset/LGA_unzip/LGA_2022_AUST_GDA94.shp')

## Data understanding

In [None]:
# Distinct SA2 names in the curated table; reused later to highlight the
# matching regions on the Victoria map.
sa2_names_csv = data_sa2_all['SA2'].unique()

In [None]:
# Column dtypes and non-null counts.
data_sa2_all.info()

In [None]:
# Preview of the first rows.
data_sa2_all.head()

In [None]:
# Summary statistics of the table.
data_sa2_all.describe()

### Extract the median rent

In [None]:
data_sa2_all_about_rent = data_sa2_all.iloc[:, :202]

In [None]:
data_sa2_rent = data_sa2_all_about_rent.loc[:, ['SA2'] + [col for col in data_sa2_all_about_rent.columns if 'median' in col]]

In [None]:
data_sa2_rent.head()

In [None]:
data_LGA_rent = data_sa2_all_about_rent.loc[:, ['LGA'] + [col for col in data_sa2_all_about_rent.columns if 'median' in col]].drop_duplicates()

In [None]:
data_LGA_rent

### Extract the number of rentals

In [None]:
data_sa2_count = data_sa2_all_about_rent.loc[:, ['SA2'] + [col for col in data_sa2_all_about_rent.columns if 'count' in col]]

In [None]:
data_sa2_count.head()

## Time series of median rent and rental count (all regions)

In [None]:
data_sa2_rent_median = data_sa2_rent.median(numeric_only=True).to_frame(name='rent')
data_sa2_count_median = data_sa2_count.median(numeric_only=True).to_frame(name='count')

In [None]:
data_sa2_rent_median.index = pd.to_datetime(data_sa2_rent_median.index.str.replace('median', '').str.strip(), format='%b %Y', errors='coerce')
data_sa2_count_median.index = pd.to_datetime(data_sa2_count_median.index.str.replace('count', '').str.strip(), format='%b %Y', errors='coerce')

In [None]:
data_sa2_rent_count = pd.merge(data_sa2_rent_median, data_sa2_count_median, left_index=True, right_index=True, suffixes=('rent', 'count'))

In [None]:
data_sa2_rent_count

In [None]:
# Draw both series on one set of axes, sharing the quarterly index on x.
fig, ax = plt.subplots(figsize=(15, 8))
series_styles = (
    ('rent', 'Median Rent', 'b'),
    ('count', 'Rental Count', 'r'),
)
for column, legend_label, line_colour in series_styles:
    ax.plot(data_sa2_rent_count.index, data_sa2_rent_count[column], label=legend_label, color=line_colour)
ax.set_title('Time Series of Median Rent and Rental Count')
ax.set_xlabel('Year')
ax.set_ylabel('Value')
ax.legend(loc='upper left')
ax.grid(True)
plt.show()

## Time series of median rent for all LGA regions

In [None]:
# Index by LGA and transpose so that each LGA becomes one column (one line on
# the chart) and the quarterly column labels become the x values.
data_LGA_rent_timeseries = data_LGA_rent.copy().set_index('LGA')
data_LGA_rent_timeseries_columns = data_LGA_rent_timeseries.columns
transposed_data = data_LGA_rent_timeseries.transpose()

# One semi-transparent line per LGA.
fig, ax = plt.subplots(figsize=(15, 8))
quarter_labels = transposed_data.index
for region in transposed_data.columns:
    ax.plot(quarter_labels, transposed_data[region], alpha=0.6)
ax.set_xlabel('Year')
ax.set_ylabel('Median Rent Price')
ax.set_title('Time-Series Plot for Median Rent Prices in Different LGAs')
# Label every fourth column only, so the tick labels stay readable.
plt.xticks(quarter_labels[::4], rotation=45)
fig.tight_layout()
plt.show()

## Feature Correlation

In [None]:
# Columns used for the 2018 correlation study, assembled group by group.
quarters_2018 = ('Mar', 'Jun', 'Sep', 'Dec')
quarterly_median_columns = [f'{quarter} 2018 median' for quarter in quarters_2018]
quarterly_count_columns = [f'{quarter} 2018 count' for quarter in quarters_2018]
facility_count_columns = [
    'entertainments_count', 'hospital_count', 'park_count', 'psf_count',
    'school2_count', 'school1_count', 'shop_count', 'stop_count',
]
other_feature_columns = [
    'population_density_2018', 'percentage_working_population_2018',
    'nBusiness_2019', 'nHouses_2018', 'nTownhouses_2018', 'nApartments_2018',
    'nDwellings_2018', 'percentage_year_12_2016', 'nRented_2016',
    'nHomeless_2016', 'distance_to_cbd',
    'nJob_2018', 'median_income_2018', 'nEmployed_2016', 'nUnEmployed_2016',
]
selected_columns_2018 = [
    'SA2',
    *quarterly_median_columns,
    *quarterly_count_columns,
    'Median Criminal Count',
    *facility_count_columns,
    *other_feature_columns,
]
# Independent copy so that later column additions do not touch data_sa2_all.
data_18 = data_sa2_all.loc[:, selected_columns_2018].copy()

In [None]:
data_18['average_count_2018'] = data_18[['Mar 2018 count','Jun 2018 count', 'Sep 2018 count', 'Dec 2018 count']].mean(axis=1)
data_18['average_rent_2018'] = data_18[['Mar 2018 median','Jun 2018 median', 'Sep 2018 median', 'Dec 2018 median']].mean(axis=1)
data_18 = data_18.drop(columns=['Jun 2018 count', 'Sep 2018 count', 'Dec 2018 count',
                                'Jun 2018 median', 'Sep 2018 median', 'Dec 2018 median'])


In [None]:
data_18

In [None]:
# Rank the numeric columns by absolute correlation with the yearly average rent.
rent_target = 'average_rent_2018'
correlation_with_rent = data_18.corr(numeric_only=True)[rent_target]
strongest_first = correlation_with_rent.abs().sort_values(ascending=False)
# Take the 13 strongest entries, then remove the target itself from the selection.
top_10_correlated_features = strongest_first.index[:13]
top_10_correlated_features = top_10_correlated_features.drop(rent_target, errors='ignore')

# Pairwise correlations among the selected features, drawn as an annotated heatmap.
correlation_matrix_top_10 = data_18.loc[:, top_10_correlated_features].corr()
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix_top_10,
    ax=ax,
    cmap='coolwarm',
    annot=True,
    linewidths=0.5,
    linecolor='gray',
    cbar=True,
)
plt.yticks(fontsize=14)
ax.set_title('Top Correlated Features with Average Rent 2018', fontsize=16)
plt.show()

## SA2 region & LGA region

### Geo of Australia by SA2

In [None]:
# Quick static plot of every geometry in the SA2 shapefile.
shp_sa2.plot()

### Geo of Victoria by SA2

In [None]:
victoria_sa2 = shp_sa2[shp_sa2['STE_NAME21'] == 'Victoria']

In [None]:
geo_json_data_victoria = victoria_sa2.to_json()
m = folium.Map(location=[-37.4713, 144.7852], zoom_start=7)
folium.GeoJson(geo_json_data_victoria, name="Victoria SA2").add_to(m)
folium.LayerControl().add_to(m)
display(m)

In [None]:
# Plotting the Victoria SA2 regions
# GeoDataFrame.plot() opens a figure of its own unless it is given an Axes, so
# create the 30x30 canvas first and draw onto it. Calling plt.figure() before a
# bare .plot() leaves an empty figure behind and the requested size is ignored.
fig, ax = plt.subplots(figsize=(30, 30))
victoria_sa2.plot(ax=ax)
ax.set_title("SA2 Boundaries in Victoria, Australia")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

### Highlighting regions present in the CSV

In [None]:
# victoria_sa2 was obtained by filtering shp_sa2; take an independent copy
# before adding columns so pandas does not raise SettingWithCopyWarning and the
# assignment is guaranteed to land on this frame.
victoria_sa2 = victoria_sa2.copy()
victoria_sa2['SA2_NAME21'] = victoria_sa2['SA2_NAME21'].str.strip()
# True for regions whose name also appears in the curated CSV.
victoria_sa2['highlight'] = victoria_sa2['SA2_NAME21'].isin(sa2_names_csv)

# Draw both layers on one explicitly created Axes; a bare plt.figure() would
# stay empty because GeoDataFrame.plot() creates its own figure when no Axes
# is passed.
fig, ax = plt.subplots(figsize=(50, 50))
base = victoria_sa2.plot(ax=ax, edgecolor='black', linewidth=0.1, color='lightblue', alpha=0.7)
victoria_sa2[victoria_sa2['highlight']].plot(ax=base, color='red', alpha=0.6)
plt.show()

### Geo of Victoria by LGA

In [None]:
victoria_LGA = shp_LGA[shp_LGA['STE_NAME21'] == 'Victoria']

In [None]:
plt.figure(figsize=(30, 30))
victoria_LGA.plot()
plt.title("LGA Boundaries in Victoria, Australia")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
# Interactive map: the Victorian LGA boundaries as a toggleable GeoJSON layer.
geo_json_data_victoria_LGA = victoria_LGA.to_json()
map_centre = [-37.4713, 144.7852]
m = folium.Map(location=map_centre, zoom_start=7)
lga_layer = folium.GeoJson(geo_json_data_victoria_LGA, name="Victoria LGA")
lga_layer.add_to(m)
layer_switcher = folium.LayerControl()
layer_switcher.add_to(m)
display(m)

## Sort out the data needed

In [None]:
# Wide table: one 'LGA' column plus one column per 'median' label; reshaped below.
data_LGA_rent

### Calculate the average annual rent for each region

In [None]:
data_LGA_rent_long = data_LGA_rent.melt(id_vars='LGA', var_name='Date', value_name='Mean Rent')
data_LGA_rent_long['Date'] = data_LGA_rent_long['Date'].str.replace(' median', '')
data_LGA_rent_long['Date'] = pd.to_datetime(data_LGA_rent_long['Date'])
data_LGA_rent_long['Year'] = data_LGA_rent_long['Date'].dt.year
mean_rent_per_city_year = data_LGA_rent_long.groupby(['LGA', 'Year'])['Mean Rent'].mean().reset_index()

In [None]:
mean_rent_per_city_year

In [None]:
mean_rent_per_city_5year = mean_rent_per_city_year[mean_rent_per_city_year['Year'].astype(str).isin(['1999', '2004', '2009', '2014', '2019', '2024'])]

In [None]:
mean_rent_per_city_5year

### Calculate The Growth Rate

In [None]:
mean_rent_per_city_5year_growth_rate = mean_rent_per_city_5year.copy()
mean_rent_per_city_5year_growth_rate['Growth Rate'] = mean_rent_per_city_5year_growth_rate.groupby('LGA')['Mean Rent'].pct_change(periods=1) * 100

In [None]:
mean_rent_per_city_5year_growth_rate.dropna()

In [None]:
growth_rate_19_24 = mean_rent_per_city_5year_growth_rate[mean_rent_per_city_5year_growth_rate['Year'].astype(str).isin(['2024'])]

In [None]:
growth_rate_19_24

In [None]:
growth_rate = mean_rent_per_city_year.copy()
growth_rate['Growth Rate'] = growth_rate.groupby('LGA')['Mean Rent'].pct_change(periods=1) * 100

In [None]:
growth_rate

In [None]:
mean_growth_rate_per_city = growth_rate.groupby('LGA', as_index=False)['Growth Rate'].mean()
top_10_growth_cities = mean_growth_rate_per_city.nlargest(10, 'Growth Rate')
top_10_growth_rate_data = growth_rate[growth_rate['LGA'].isin(top_10_growth_cities['LGA'])]

In [None]:
top_10_growth_rate_data

### 2024 rent by LGA

In [None]:
mean_rent_per_city_year_2024 = mean_rent_per_city_year[mean_rent_per_city_year['Year'].astype(str).isin(['2024'])]

In [None]:
mean_rent_per_city_year_2024

## Time Series for 'Top 10 Cities with Highest Average Growth Rate Over Time'

In [None]:
# One line per top-10 LGA: yearly growth rate against year.
fig, ax = plt.subplots(figsize=(14, 8))
for city in top_10_growth_cities['LGA']:
    belongs_to_city = top_10_growth_rate_data['LGA'] == city
    city_data = top_10_growth_rate_data[belongs_to_city]
    ax.plot(city_data['Year'], city_data['Growth Rate'], label=city)
ax.set_title('Top 10 Cities with Highest Average Growth Rate Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Growth Rate (%)')
# Legend sits outside the axes, to the right of the plot area.
ax.legend(title='Cities', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

## Mean growth rate per LGA on a map

In [None]:
victoria_gdf_mean_growth_rate = victoria_LGA.merge(mean_growth_rate_per_city, left_on='LGA_NAME22', right_on='LGA')

In [None]:
top_10_growth_cities = victoria_gdf_mean_growth_rate.nlargest(10, 'Growth Rate')
m = folium.Map(location=[-37.81, 144.96], zoom_start=6, tiles="cartodbpositron")
geojson_data = json.loads(victoria_gdf_mean_growth_rate.to_json())

folium.Choropleth(
    geo_data=geojson_data,
    name='choropleth',
    data=victoria_gdf_mean_growth_rate,
    columns=['LGA', 'Growth Rate'],
    key_on='feature.properties.LGA_NAME22',
    fill_color='Blues',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Average Growth Rate (%)'
).add_to(m)

for idx, row in top_10_growth_cities.iterrows():
    lga_name = row['LGA']
    growth_rate = row['Growth Rate']
    city_geometry = victoria_gdf_mean_growth_rate[victoria_gdf_mean_growth_rate['LGA_NAME22'] == lga_name].geometry.centroid.iloc[0]
    
    folium.Marker(
        location=[city_geometry.y, city_geometry.x],
        icon=folium.DivIcon(
            html=f'<div style="font-size: 12px; color: black; padding: 2px;">{lga_name}: {growth_rate:.2f}%</div>'
        )
    ).add_to(m)
    


# Define the file path for saving the plot
output_file_path = '../../plots/geo(html)/Victoria_Average_Growth_Rate_Map_with_Top10.html'

# Ensure the directory exists before saving the file
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Save the plot
m.save(output_file_path)


In [None]:
# Keep only the name and growth-rate columns for display.
top_10_growth_cities = top_10_growth_cities[['LGA', 'Growth Rate']]

In [None]:
top_10_growth_cities

## Rent growth rate over the past five years (2019–2024) on a map

In [None]:
victoria_gdf_growth_rate_19_24 = victoria_LGA.merge(growth_rate_19_24, left_on='LGA_NAME22', right_on='LGA')

In [None]:
m = folium.Map(location=[-37.81, 144.96], zoom_start=6, tiles="cartodbpositron")
geojson_data = json.loads(victoria_gdf_growth_rate_19_24.to_json())
folium.Choropleth(
    geo_data=geojson_data,
    name='choropleth',
    data= victoria_gdf_growth_rate_19_24,
    columns=['LGA', 'Growth Rate'],
    key_on='feature.properties.LGA',
    fill_color='Blues',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Growth Rate of The rent growth rate from 2019 to 2024'
).add_to(m)
folium.LayerControl().add_to(m)

top_10_regions_19_24 = growth_rate_19_24.nlargest(10, 'Growth Rate')

for idx, row in top_10_regions_19_24.iterrows():
    lga_name = row['LGA']
    growth_rate_value = row['Growth Rate']
    lga_geometry = victoria_gdf_growth_rate_19_24[victoria_gdf_growth_rate_19_24['LGA'] == lga_name].geometry.centroid.iloc[0]
    
    folium.map.Marker(
        [lga_geometry.y, lga_geometry.x],
        icon=folium.DivIcon(
            html=f'<div style="font-size: 12px; color: black; padding: 2px;">{lga_name}: {growth_rate_value:.2f}</div>'
        )
    ).add_to(m)

m.save('../../plots/geo(html)/Victoria_Growth_rate_1924_Top10_Map_WithNames.html')

In [None]:
top_10_regions_19_24

## 2024 rent by LGA region on a map

In [None]:
victoria_gdf_rent_2024 = victoria_LGA.merge(mean_rent_per_city_year_2024, left_on='LGA_NAME22', right_on='LGA')

In [None]:
m = folium.Map(location=[-37.81, 144.96], zoom_start=6, tiles="cartodbpositron")
geojson_data = json.loads(victoria_gdf_rent_2024.to_json())
folium.Choropleth(
    geo_data=geojson_data,
    name='choropleth',
    data=mean_rent_per_city_year_2024,
    columns=['LGA', 'Mean Rent'],
    key_on='feature.properties.LGA',
    fill_color='Blues',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Mean Rent in 2024'
).add_to(m)
folium.LayerControl().add_to(m)

top_10_regions_2024 = mean_rent_per_city_year_2024.nlargest(10, 'Mean Rent')

for idx, row in top_10_regions_2024.iterrows():
    lga_name = row['LGA']
    rent_value = row['Mean Rent']
    lga_geometry = victoria_gdf_rent_2024[victoria_gdf_rent_2024['LGA'] == lga_name].geometry.centroid.iloc[0]
    

m.save('../../plots/geo(html)/Victoria_Rent_2024_Top10_Map_WithNames.html')

In [None]:
top_10_regions_2024