### Import Libraries and Data

In [49]:
import pandas as pd
import os

In [50]:
# Historical rental data: one CSV per property type.
(
    one_bed_flat,
    two_bed_flat,
    three_bed_flat,
    two_bed_house,
    three_bed_house,
    four_bed_house,
    all_properties,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/rental_history/one_bed_flat.csv',
        '../data/raw/rental_history/two_bed_flat.csv',
        '../data/raw/rental_history/three_bed_flat.csv',
        '../data/raw/rental_history/two_bed_house.csv',
        '../data/raw/rental_history/three_bed_house.csv',
        '../data/raw/rental_history/four_bed_house.csv',
        '../data/raw/rental_history/all_properties.csv',
    )
)

# Domain rental data, in the same property-type order as above.
# NOTE(review): the four-bed-house file name has no '_rent' suffix, unlike
# its siblings -- confirm this matches the file on disk.
(
    domain_one_bed_flat,
    domain_two_bed_flat,
    domain_three_bed_flat,
    domain_two_bed_house,
    domain_three_bed_house,
    domain_four_bed_house,
    domain_all_properties,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/curated/domain_one_bed_flat_rent.csv',
        '../data/curated/domain_two_bed_flat_rent.csv',
        '../data/curated/domain_three_bed_flat_rent.csv',
        '../data/curated/domain_two_bed_house_rent.csv',
        '../data/curated/domain_three_bed_house_rent.csv',
        '../data/curated/domain_four_bed_house.csv',
        '../data/curated/domain_all_properties_rent.csv',
    )
)

# Remaining engineered feature sets, shared by every property type.
crimes, demographics, education, urban_landmarks = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/curated/crimes.csv',
        '../data/curated/demographics.csv',
        '../data/curated/education_df.csv',
        '../data/raw/urban_landmarks_features.csv',
    )
)

### Combining All Feature Sets

In [51]:
def _merge_feature_sets(rental_history, domain_rent):
    """Join one property type's rental data with the shared feature sets.

    The tables are merged in a fixed order: rental history, Domain rent,
    crimes, demographics, education, urban landmarks. Every step is an inner
    join on the suburb name, so a suburb missing from any one table is
    absent from the result.

    Args:
        rental_history: Rental history DataFrame with a 'suburb' column.
        domain_rent: Domain rental DataFrame with a 'suburb' column.

    Returns:
        The merged DataFrame. It still carries the right-hand key columns
        'sa2_name' and 'gazetted_locality' contributed by the demographics
        and urban landmarks tables.
    """
    merged = pd.merge(rental_history, domain_rent, on='suburb', how='inner')
    merged = pd.merge(merged, crimes, on='suburb', how='inner')
    # demographics names its suburb key 'sa2_name'
    merged = pd.merge(merged, demographics, left_on='suburb', right_on='sa2_name', how='inner')
    merged = pd.merge(merged, education, on='suburb', how='inner')
    # urban_landmarks names its suburb key 'gazetted_locality'
    merged = pd.merge(merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
    # NOTE(review): if any table has several rows per suburb, each inner join
    # multiplies the row count -- confirm the keys are unique where expected.
    return merged


# Flats
one_bed_merged = _merge_feature_sets(one_bed_flat, domain_one_bed_flat)
two_bed_merged = _merge_feature_sets(two_bed_flat, domain_two_bed_flat)
three_bed_merged = _merge_feature_sets(three_bed_flat, domain_three_bed_flat)

# Houses
two_bed_house_merged = _merge_feature_sets(two_bed_house, domain_two_bed_house)
three_bed_house_merged = _merge_feature_sets(three_bed_house, domain_three_bed_house)
four_bed_house_merged = _merge_feature_sets(four_bed_house, domain_four_bed_house)

# All property types combined
all_properties_merged = _merge_feature_sets(all_properties, domain_all_properties)


In [52]:
# Strip leftover helper columns so 'suburb' is the only suburb identifier
def clean_merged_df(df):
    """Return a copy of ``df`` without leftover index and join-key columns.

    Two kinds of column are removed:
    - any column whose name contains 'Unnamed' (presumably index columns
      picked up when the source CSVs were read -- verify against the inputs);
    - 'sa2_name' and 'gazetted_locality', when present.

    Args:
        df: A merged feature-set DataFrame.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    unnamed_mask = df.columns.str.contains('Unnamed')
    trimmed = df.loc[:, ~unnamed_mask]

    # errors='ignore' drops whichever of these exist and skips the rest
    return trimmed.drop(columns=['sa2_name', 'gazetted_locality'], errors='ignore')

# Run every merged table through clean_merged_df and rebind the same names
(
    one_bed_merged,
    two_bed_merged,
    three_bed_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = map(
    clean_merged_df,
    (
        one_bed_merged,
        two_bed_merged,
        three_bed_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    ),
)

In [54]:
# Sanity check on the combined "all properties" table: list its column names
print(list(all_properties_merged.columns))
# Bare expression: its value is shown as the cell output when run in a notebook
all_properties_merged.shape

['suburb', 'mar_2016_count', 'mar_2016_median', 'jun_2016_count', 'jun_2016_median', 'sep_2016_count', 'sep_2016_median', 'dec_2016_count', 'dec_2016_median', 'mar_2017_count', 'mar_2017_median', 'jun_2017_count', 'jun_2017_median', 'sep_2017_count', 'sep_2017_median', 'dec_2017_count', 'dec_2017_median', 'mar_2018_count', 'mar_2018_median', 'jun_2018_count', 'jun_2018_median', 'sep_2018_count', 'sep_2018_median', 'dec_2018_count', 'dec_2018_median', 'mar_2019_count', 'mar_2019_median', 'jun_2019_count', 'jun_2019_median', 'sep_2019_count', 'sep_2019_median', 'dec_2019_count', 'dec_2019_median', 'mar_2020_count', 'mar_2020_median', 'jun_2020_count', 'jun_2020_median', 'sep_2020_count', 'sep_2020_median', 'dec_2020_count', 'dec_2020_median', 'mar_2021_count', 'mar_2021_median', 'jun_2021_count', 'jun_2021_median', 'sep_2021_count', 'sep_2021_median', 'dec_2021_count', 'dec_2021_median', 'mar_2022_count', 'mar_2022_median', 'jun_2022_count', 'jun_2022_median', 'sep_2022_count', 'sep_2022

(11100, 151)

### Save Combined Feature Set

In [44]:
def save_feature_set(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, creating parent directories first.

    Args:
        df: DataFrame to save. It is written without its index.
        file_path: Destination CSV path. May be a bare file name, in which
            case the file is written to the current working directory.

    Returns:
        None.
    """
    directory = os.path.dirname(file_path)

    # A bare file name has an empty directory part and os.makedirs('') raises,
    # so only create directories when there is one. exist_ok=True avoids the
    # check-then-create race of a separate os.path.exists() test.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Save the dataframe to CSV
    df.to_csv(file_path, index=False)

# Save all combined dataframes in the curated folder.
# save_feature_set returns None, so its result must not be assigned back to
# the dataframe variables -- doing so would replace each dataframe with None
# and make this cell fail when run a second time.
save_feature_set(one_bed_merged, '../data/curated/combined_feature_set/one_bed_merged.csv')
save_feature_set(two_bed_merged, '../data/curated/combined_feature_set/two_bed_merged.csv')
save_feature_set(three_bed_merged, '../data/curated/combined_feature_set/three_bed_merged.csv')

save_feature_set(two_bed_house_merged, '../data/curated/combined_feature_set/two_bed_house_merged.csv')
save_feature_set(three_bed_house_merged, '../data/curated/combined_feature_set/three_bed_house_merged.csv')
save_feature_set(four_bed_house_merged, '../data/curated/combined_feature_set/four_bed_house_merged.csv')

save_feature_set(all_properties_merged, '../data/curated/combined_feature_set/all_properties_merged.csv')

# Report success only once every file has been written
print("All Saved!")

All Saved!
