This notebook builds an aggregated rent dataset (average rent by property type, suburb, bedroom count, year and month) for use in exploratory data analysis (EDA).

In [1]:
import pandas as pd

# Read the curated dataset (already preprocessed for forecasting) and print
# its schema. The path is relative, so the notebook presumably has to be run
# from the project root -- TODO confirm.
# NOTE(review): 'forcast' looks like a typo of 'forecast', but the string has
# to match the file name on disk, so it is left untouched here.
file_path = 'data/curated/forcast_dataset.csv'
data = pd.read_csv(file_path) # data preprocessed for forecasting
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484558 entries, 0 to 484557
Data columns (total 32 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   suburb                                 484558 non-null  object 
 1   rent                                   484558 non-null  float64
 2   num_bedrooms                           484558 non-null  float64
 3   num_bathrooms                          484558 non-null  float64
 4   parking                                484558 non-null  float64
 5   distance_to_cbd                        484558 non-null  float64
 6   latitude                               484558 non-null  float64
 7   longitude                              484558 non-null  float64
 8   nearst_train_station_dist              484558 non-null  float64
 9   year                                   484558 non-null  int64  
 10  month                                  484558 non-null  

In [2]:
# Map the terse source column names to more descriptive ones.
column_renames = {
    'Rate per 100,000 population': 'crime_rate',
    'v_high_vcaa': 'num_schls_very_high_score',
    'high_vcaa': 'num_schls_high_score',
    'good_vcaa': 'num_schls_good_score',
    'num_primary': 'num_primary_schools',
    'num_secondary_public': 'num_secondary_public_schools',
    'num_secondary_private': 'num_secondary_private_schools',
    'num_secondary_catholic': 'num_secondary_catholic_schools',
    'num_edu_centre': 'num_edu_centres',
}
# rename() leaves the frame untouched for keys that are not present
# (errors='ignore' is the pandas default).
data.rename(columns=column_renames, inplace=True)

# Property-type indicator columns (presumably one-hot encoded) that are
# collapsed into a single 'property_types' label column later in this cell.
property_columns_extended = [
    'property_type_apartment / unit / flat',
    'property_type_duplex',
    'property_type_duplex-semi-detached',
    'property_type_house',
    'property_type_retirement living',
    'property_type_serviced apartment',
    'property_type_studio',
    'property_type_townhouse',
    'property_type_villa',
]

def combine_property_types_extended(row):
    """Collapse a row's property-type indicator flags into one label string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that can be indexed
            by each of the nine ``property_type_*`` column names.

    Returns:
        The labels of all truthy flags joined with ``', '`` in a fixed
        order, or ``'other'`` when no flag is set.
    """
    # (indicator column, label) pairs; their order fixes the label order
    # in the returned string.
    flag_labels = (
        ('property_type_house', 'house'),
        ('property_type_retirement living', 'retirement living'),
        ('property_type_serviced apartment', 'serviced apartment'),
        ('property_type_studio', 'studio'),
        ('property_type_townhouse', 'townhouse'),
        ('property_type_villa', 'villa'),
        ('property_type_apartment / unit / flat', 'apartment/unit/flat'),
        ('property_type_duplex', 'duplex'),
        ('property_type_duplex-semi-detached', 'duplex-semi-detached'),
    )
    labels = [label for column, label in flag_labels if row[column]]
    if not labels:
        return 'other'
    return ', '.join(labels)

# Build the combined 'property_types' label for every row. Only the indicator
# columns are handed to the row-wise apply: the helper reads nothing else, and
# this avoids building a mixed-dtype row Series over every column of the frame
# for each of the rows.
data['property_types'] = data[property_columns_extended].apply(
    combine_property_types_extended, axis=1
)

# The individual indicator columns are redundant once they are combined.
# NOTE: after this drop the cell cannot be re-run on the same `data` object
# (the indicator columns are gone); reload the frame first.
data.drop(columns=property_columns_extended, inplace=True)

In [3]:
# Aggregate rent per (property type, suburb, distance to CBD, bedroom count,
# year, month): the mean rent and the number of rows behind each mean.
# NOTE(review): 'distance_to_cbd' is a grouping key and takes several values
# within one suburb (the displayed output shows abbotsford at 2.71 and 2.80),
# and every displayed group has data_count == 1 -- confirm that this
# granularity is intended.
group_keys = ['property_types', 'suburb', 'distance_to_cbd', 'num_bedrooms', 'year', 'month']
rent_summary = {
    'avg_rent': pd.NamedAgg(column='rent', aggfunc='mean'),
    'data_count': pd.NamedAgg(column='rent', aggfunc='count'),
}
grouped_rent_df = data.groupby(group_keys).agg(**rent_summary).reset_index()

grouped_rent_df

Unnamed: 0,property_types,suburb,distance_to_cbd,num_bedrooms,year,month,avg_rent,data_count
0,apartment/unit/flat,abbotsford,2.71,1.0,2017,10,350.0,1
1,apartment/unit/flat,abbotsford,2.71,1.0,2017,11,400.0,1
2,apartment/unit/flat,abbotsford,2.71,2.0,2017,10,600.0,1
3,apartment/unit/flat,abbotsford,2.71,2.0,2017,11,550.0,1
4,apartment/unit/flat,abbotsford,2.80,2.0,2015,6,500.0,1
...,...,...,...,...,...,...,...,...
421681,villa,yarraville,7.23,2.0,2017,2,330.0,1
421682,villa,yarraville,7.39,3.0,2017,11,480.0,1
421683,villa,yarraville,7.56,2.0,2015,11,320.0,1
421684,villa,yarraville,7.65,2.0,2017,8,440.0,1


In [4]:
# Function to categorize regions based on distance to CBD
def categorize_region(distance, city_limit=7, metro_limit=50):
    """Assign a coarse region label from a distance-to-CBD value.

    Args:
        distance: Distance to the CBD, in the same unit as the two limits.
        city_limit: Exclusive upper bound for ``'city'``. Defaults to 7.
        metro_limit: Exclusive upper bound for ``'greater melbourne'``.
            Defaults to 50.

    Returns:
        ``'city'`` if ``distance < city_limit``, ``'greater melbourne'`` if
        ``distance < metro_limit``, otherwise ``'regional vic'``.
    """
    if distance < city_limit:
        return 'city'
    if distance < metro_limit:
        return 'greater melbourne'
    # Everything else lands here, including NaN (both comparisons above
    # evaluate to False for NaN).
    return 'regional vic'

# Label every aggregated row with the region derived from its distance to the CBD.
region_labels = grouped_rent_df['distance_to_cbd'].map(categorize_region)
grouped_rent_df['region'] = region_labels

In [5]:
# Display the aggregated frame to check the 'region' column added in the previous cell.
grouped_rent_df

Unnamed: 0,property_types,suburb,distance_to_cbd,num_bedrooms,year,month,avg_rent,data_count,region
0,apartment/unit/flat,abbotsford,2.71,1.0,2017,10,350.0,1,city
1,apartment/unit/flat,abbotsford,2.71,1.0,2017,11,400.0,1,city
2,apartment/unit/flat,abbotsford,2.71,2.0,2017,10,600.0,1,city
3,apartment/unit/flat,abbotsford,2.71,2.0,2017,11,550.0,1,city
4,apartment/unit/flat,abbotsford,2.80,2.0,2015,6,500.0,1,city
...,...,...,...,...,...,...,...,...,...
421681,villa,yarraville,7.23,2.0,2017,2,330.0,1,greater melbourne
421682,villa,yarraville,7.39,3.0,2017,11,480.0,1,greater melbourne
421683,villa,yarraville,7.56,2.0,2015,11,320.0,1,greater melbourne
421684,villa,yarraville,7.65,2.0,2017,8,440.0,1,greater melbourne


In [6]:
# Persist the aggregated frame as a CSV for later EDA.
# NOTE(review): to_csv writes the index by default, so the file gets an extra
# leading unnamed column; pass index=False if no downstream reader relies on
# that column -- TODO confirm.
grouped_rent_df.to_csv('data/curated/rent_by_prop_type.csv')