In [27]:
import pandas as pd
import numpy as np

In [28]:
# Read 'properties_cleaned_df.csv' into a DataFrame and preview its first rows.
df = pd.read_csv('properties_cleaned_df.csv')
df.head()

Unnamed: 0,Price,Property_Type,Bed,Bath,Parking,Location,Link,Postcode,Suburb,priceInt
0,"$2,975 per week",Apartment / Unit / Flat,3,2,2,SYDNEY NSW 2000,https://www.domain.com.au/1301-61-macquarie-st...,2000,SYDNEY,2975.0
1,"$1,250.00 per week furnished (6 Month Lease On...",Apartment / Unit / Flat,2,2,0,SYDNEY NSW 2000,https://www.domain.com.au/807-50-murray-street...,2000,SYDNEY,1250.0
2,"$2,975 per week",Apartment / Unit / Flat,3,2,2,SYDNEY NSW 2000,https://www.domain.com.au/1301-61-macquarie-st...,2000,SYDNEY,2975.0
3,$850 per week,Apartment / Unit / Flat,1,1,0,SYDNEY NSW 2000,https://www.domain.com.au/129-harrington-stree...,2000,SYDNEY,850.0
4,Furnished | $800 per week,Apartment / Unit / Flat,1,1,1,SYDNEY NSW 2000,https://www.domain.com.au/206-132-sussex-st-sy...,2000,SYDNEY,800.0


In [29]:
def preprocess(df):
    """Prepare the cleaned property listings for modelling.

    Steps, in order:
      1. Drop the 'Price', 'Link' and 'Location' columns and rename
         'priceInt' to 'Price' (the numeric target).
      2. Cast 'Postcode' to string.
      3. Remove rows whose 'Price' is above Q3 + 1.5 * IQR.
      4. Keep only rows with 'Bed' <= 4 and 'Bath' <= 3.
      5. Merge postcodes 2007, 2010 and 2011 into '2007_2010_2011'.
      6. Merge apartment, terrace, house and townhouse listings into
         'Apartment/House'.
      7. Relabel 'Apartment/House' rows with 0 bedrooms as 'Studio', and set
         'Bed' to 0 for every 'Studio' row.
      8. Reset the index to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Price', 'Link', 'Location', 'priceInt',
        'Postcode', 'Bed', 'Bath' and 'Property_Type'.

    Returns
    -------
    pandas.DataFrame
        A new, re-indexed frame. The input frame is not modified, so the
        function can be called repeatedly on the same input.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    #drop columns not required for modelling and rename target variable price
    #(non-inplace, so the caller's frame keeps its original columns)
    df = df.drop(columns=['Price', 'Link', 'Location'])
    df = df.rename(columns={'priceInt': 'Price'})

    #change postcode to string type
    df['Postcode'] = df['Postcode'].astype(str)

    #drop price outliers as defined by 1.5 times IQR above 3rd quartile
    # Calculate the first quartile (Q1) and third quartile (Q3)
    Q1 = df['Price'].quantile(0.25)
    Q3 = df['Price'].quantile(0.75)
    # Calculate the interquartile range (IQR)
    IQR = Q3 - Q1
    # Define the upper bound for outliers
    upper_bound = Q3 + 1.5 * IQR

    #filter to non-outlier prices, bedrooms 4 or less, bathrooms 3 or less
    #(the bathroom filter must be '<= 3' and must actually be applied)
    keep = (df['Price'] <= upper_bound) & (df['Bed'] <= 4) & (df['Bath'] <= 3)
    #explicit copy so the .loc assignments below write to an independent
    #frame and do not trigger SettingWithCopyWarning
    df = df.loc[keep].copy()

    #combine postcode categories
    mask = df['Postcode'].isin(['2007', '2010', '2011'])
    df.loc[mask, 'Postcode'] = '2007_2010_2011'

    #combine apartment/house categories
    mask = df['Property_Type'].isin(['Apartment / Unit / Flat', 'Terrace', 'House',
       'Townhouse'])
    df.loc[mask, 'Property_Type'] = 'Apartment/House'

    #Change property type for apartments listed as no bedroom
    mask = ((df['Property_Type']=='Apartment/House') & (df['Bed']==0))
    df.loc[mask, 'Property_Type'] = 'Studio'

    #Change bed number for studios listed as one bedroom
    mask = ((df['Property_Type']=='Studio') & (df['Bed']>0))
    df.loc[mask, 'Bed'] = 0

    #reset index so rows are numbered 0..n-1 after filtering
    return df.reset_index(drop=True)

In [30]:
# Run the preprocessing function on the loaded frame and preview the result.
preprocessed_df = preprocess(df)
preprocessed_df.head()

Unnamed: 0,Property_Type,Bed,Bath,Parking,Postcode,Suburb,Price
0,Apartment/House,2,2,0,2000,SYDNEY,1250.0
1,Apartment/House,1,1,0,2000,SYDNEY,850.0
2,Apartment/House,1,1,1,2000,SYDNEY,800.0
3,Apartment/House,1,1,0,2000,SYDNEY,850.0
4,Apartment/House,1,1,1,2000,SYDNEY,1200.0


In [36]:
#preprocessed_df.to_csv('properties_preprocessed_df.csv', index=False) 