**Feature Analysis**
-------

In this notebook, we will count the number to important features per suburb and add the count as features for model training.

In [2]:
import pandas as pd

Loading all the datasets of the features I want to join as df

In [24]:
rental_df = pd.read_csv('../data/curated/rental_merged.csv',encoding='ISO-8859-1')
school_df = pd.read_csv('../data/landing/dv346-schoollocations2023.csv', encoding='ISO-8859-1')
shopping_df = pd.read_csv('../data/curated/shopping_count.csv', encoding='ISO-8859-1')
parks_df = pd.read_csv('../data/curated/parks_count.csv', encoding='ISO-8859-1')
postcode_df = pd.read_csv('../data/curated/australian_postcodes.csv', encoding='ISO-8859-1')
postcode_df = postcode_df[['postcode', 'SA2_CODE_2021']]
hospital_df = pd.read_csv('../data/curated/rental_with_hospital.csv')
hospital_df = hospital_df[['Address','route_distance_hospital']]
hospital_df = hospital_df.rename(columns={'Address':'address'})
hospital_count_df = pd.read_csv('../data/curated/hospital_count.csv')

Joining school count per suburb

In [4]:
independent_count = school_df[school_df['Education_Sector'] == 'Independent'].groupby('Postal_Postcode').size().reset_index(name='Independent_School_Count')

# Count the number of non-independent schools per suburb
non_independent_count = school_df[school_df['Education_Sector'] != 'Independent'].groupby('Postal_Postcode').size().reset_index(name='Non_Independent_School_Count')

# Merge both results to get a full view of independent and non-independent school counts per suburb
school_count_per_suburb = pd.merge(independent_count, non_independent_count, on='Postal_Postcode', how='outer').fillna(0)

# Display the result
print(school_count_per_suburb)

     Postal_Postcode  Independent_School_Count  Non_Independent_School_Count
0               3000                       4.0                           0.0
1               3003                       0.0                           1.0
2               3004                       2.0                           2.0
3               3006                       0.0                           2.0
4               3008                       0.0                           1.0
..               ...                       ...                           ...
575             3987                       0.0                           1.0
576             3988                       0.0                           1.0
577             3992                       0.0                           1.0
578             3995                       1.0                           5.0
579             3996                       0.0                           2.0

[580 rows x 3 columns]


In [5]:
rental_df = pd.merge(rental_df, school_count_per_suburb, left_on='Postcode', right_on='Postal_Postcode', how='left').fillna(0)

rental_df = rental_df.drop(columns=['Postal_Postcode'])


Joining parks count per suburb

In [6]:
# Adding postcode to parks df
parks_postcode_df = pd.merge(parks_df, postcode_df, left_on='SA2_CODE21', right_on='SA2_CODE_2021', how='left')

# Group by 'postcode' and sum 'parks_count' to ensure no duplicates
parks_count_per_suburb = parks_postcode_df.groupby('postcode')['parks_count'].sum().reset_index()

# Joining number of parks to rental
rental_df = pd.merge(rental_df, parks_count_per_suburb, left_on='Postcode', right_on='postcode', how='left').fillna(0)
rental_df = rental_df.drop(columns=['postcode'])


In [None]:
# Adding postcode to parks df
shopping_postcode_df = pd.merge(shopping_df, postcode_df, left_on='SA2_CODE21', right_on='SA2_CODE_2021', how='left')

shops_count_per_suburb = shopping_postcode_df.groupby('postcode')['shopping_count'].sum().reset_index()

# Joining number of parks to rental 
rental_df = pd.merge(rental_df, shops_count_per_suburb, left_on='Postcode', right_on='postcode', how='left').fillna(0)
rental_df = rental_df.drop(columns=['postcode', 'property_index', 'Unnamed: 0'])

In [25]:
#adding postcode to parks df
hospital_count_df = pd.merge(hospital_count_df, postcode_df, left_on='SA2_CODE21', right_on='SA2_CODE_2021', how='left')

hospital_count_per_suburb = hospital_count_df.groupby('postcode')['hospital_count'].sum().reset_index()

#joining number of parks to rental 
rental_df = pd.merge(rental_df, hospital_count_per_suburb, left_on='Postcode', right_on='postcode', how='left').fillna(0)
rental_df.drop(columns=['postcode'])

Unnamed: 0.1,SA2 code,Unnamed: 0,property_index,Cost,Beds,Baths,Cars,Address,Property Type,coordinates,...,Non_Independent_School_Count,shopping_count,parks_count,SA2 name,Net population change,value_2019,avg_yearly_growth_rate,has_train_station,postcode,hospital_count
0,208031193,0,0,560.0,2,1,1.0,"6/121 Mcdonald Street, Mordialloc VIC 3195",Apartment / Unit / Flat,"(-38.0045428, 145.0884301)",...,11.0,5.0,36.0,Mordialloc - Parkdale,-10.0,1073.0,0.022933,1,3195.0,7.0
1,207021157,1,1,550.0,2,1,1.0,"5/3 Carnarvon Street, Doncaster VIC 3108",Apartment / Unit / Flat,"(-37.7863384, 145.1237982)",...,4.0,4.0,58.0,Doncaster,510.0,836.0,0.011487,0,3108.0,1.0
2,212041459,2,2,340.0,1,1,1.0,"4/10 Cole Street, Noble Park VIC 3174",Apartment / Unit / Flat,"(-37.9523213, 145.1736)",...,10.0,6.0,14.0,Noble Park - East,-21.0,694.0,0.019854,1,3174.0,3.0
3,213011328,3,4,460.0,3,1,0.0,"8 Perth Avenue, Albion VIC 3020",House,"(-37.7753959, 144.8154461)",...,17.0,0.0,14.0,Ardeer - Albion,-110.0,669.0,0.019774,1,3020.0,0.0
4,213011336,4,5,270.0,1,1,1.0,"49 Dickson Street, Sunshine VIC 3020",House,"(-37.786819, 144.83411450359245)",...,17.0,2.0,27.0,Sunshine,61.0,712.0,0.021199,1,3020.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,207011148,9103,10012,950.0,4,3,2.0,"2/1106 Burke Road, Balwyn North VIC 3104",Townhouse,"(-37.7845246, 145.063037)",...,4.0,15.0,23.0,Balwyn North,-84.0,1105.0,0.013321,0,3103.0,0.0
9104,206041505,9104,10013,75.0,0,1,1.0,"Car Park/228 La Trobe St, Melbourne VIC 3000",Carspace,"(-37.8114585, 144.9582685)",...,2.0,0.0,3.0,Melbourne CBD - West,1245.0,0.0,0.000000,1,3004.0,0.0
9105,207031161,9105,10014,690.0,3,2,2.0,"4/420 Middleborough Road, Blackburn VIC 3130",Townhouse,"(-37.8209424, 145.1381392)",...,10.0,10.0,38.0,Blackburn,-25.0,1013.0,0.017605,1,3130.0,4.0
9106,207011147,9106,10016,700.0,3,1,1.0,Balwyn VIC 3103,House,"(-37.8091737, 145.0833678)",...,4.0,22.0,25.0,Balwyn,-4.0,1082.0,0.012583,0,3103.0,0.0


Saving new merged dataset

In [26]:
rental_df.to_csv('../data/curated/rental_merged.csv')