In [None]:
# This notebook pre-processes the listing data scraped from oldlistings.com.au

In [None]:
import pandas as pd

In [None]:
# directory holding the raw scraped inputs
# (no trailing slash: file paths are built as f'{files_dire}/<file name>',
# so a trailing slash here would produce a doubled '//' separator)
files_dire = '../data/raw'

In [None]:
# load the scraped past listings and the geocoded addresses
listings_path = f'{files_dire}/melbourne_past_listings.csv'
geocode_path = f'{files_dire}/geo.csv'
df = pd.read_csv(listings_path)
# the first column of geo.csv is dropped by position
# (presumably a saved index column - TODO confirm)
geo_df = pd.read_csv(geocode_path)
geo_df = geo_df.iloc[:, 1:]
# merge the past listings with the geocode on the address (default inner join),
# so listings without a geocode entry are discarded;
# this should leave only properties that have been listed in 2021-2022
df = pd.merge(df, geo_df, on='address')
df.head()

In [None]:
# review how complete each column is: count() reports the number of non-null
# entries per column, so a low count means many NaN values in that column
df.count()

In [None]:
# the land size column is discarded because only a very small number of rows have a record for it
df = df.drop(columns=['land_raw'])

In [None]:
# the research goal is mainly residential properties (for people to live in),
# hence listings with zero bedrooms or zero bathrooms are filtered out
temp = ['bed', 'bath', 'car']
# a missing count is treated as zero so the columns can be cast to integers
df[temp] = df[temp].fillna(0).astype('int64')
# only bed and bath decide whether a listing is kept: a dwelling without a
# car space is still residential, so 'car' must not take part in this filter
required_rooms = ['bed', 'bath']
df = df[(df[required_rooms] != 0).all(axis=1)]
# print out the number of entries left
len(df)

In [None]:
# the per-column counts reviewed earlier suggest the property types are messy,
# so look at how often each type value occurs
df['type'].value_counts()

In [None]:
# list the distinct type labels (ordered by frequency) so they can be grouped
df['type'].value_counts().index

In [None]:
# labels that do not describe a kind of dwelling
unknown_type = [
    'Rental_residential',
    'AvailableNow',
    'Available',
    'AvailableDate',
    'Other',
]
# labels with a really low count; listings carrying them are removed
remove_type = [
    'Villa',
    'Rural',
    'ForSale',
    'Villa,House',
    'Terrace',
    'Duplex',
    'Acreage/semi-rural',
    'Sales_residential',
]

In [None]:
# discard the listings whose type is one of the rarely used labels
is_rare_type = df['type'].isin(remove_type)
df = df[~is_rare_type]

In [None]:
# the remaining labels are to be combined into four categories:
# house; Apartment / Unit / Flat; townhouse; studio
df['type'].value_counts()

In [None]:
# collapse the apartment-style labels into a single category;
# 'AUF' stands for 'Apartment / Unit / Flat'
auf_labels = ['Unit/apmt', 'Apartment', 'Flat', 'Unit']
property_type = df['type'].replace(auf_labels, 'AUF')
# labels that do not describe a kind of dwelling are grouped under 'unknown'
property_type = property_type.replace(unknown_type, 'unknown')
df['type'] = property_type
df['type'].value_counts()

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# preliminary processing on rent: remove the comma and the dollar sign.
# regex=False makes both patterns literal text; as a regular expression '$' is
# the end-of-string anchor, so without it the result would depend on the
# default of the installed pandas version
df['rent_raw'] = (
    df['rent_raw']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)
df.head()

In [None]:
# split the rent into a column for the number and another for the description
df['rent_raw'] = df['rent_raw'].str.lower()
# raw strings keep \D and \d from being read as (invalid) string escape sequences;
# expand=False returns a Series rather than a one-column DataFrame
# description: the first run of non-digit characters
df['rent_disc'] = df['rent_raw'].str.extract(r'(\D+)', expand=False)
# number: the first run of digits
df['rent_raw'] = df['rent_raw'].str.extract(r'(\d+)', expand=False)

In [None]:
# columns compared when looking for duplicated entries; the rent description is
# not among them because some entries differ only slightly there, such as
# 'per week' and 'pw'
deduplicate_subset = [
    'year',
    'month',
    'bed',
    'bath',
    'car',
    'type',
    'address',
    'suburb',
    'code',
    'rent_raw',
]

In [None]:
# keep one copy of every duplicated listing: keep='first' retains the first
# occurrence, whereas keep=False would discard every copy of a duplicated
# listing and with it the property's record for that month
df = df.drop_duplicates(subset=deduplicate_subset, keep='first')
# after deduplication, only properties listed more than once are retained
listings_per_address = df.groupby('address')['address'].transform('count')
df = df[listings_per_address > 1]

In [None]:
# remove all punctuation and spaces for further convenience
# regex=True is required for the character class to act as a pattern; the
# default of `regex` differs between pandas versions, and when the pattern is
# taken as literal text nothing is removed
df['rent_disc'] = df['rent_disc'].str.replace(r'[^\w\s]+', '', regex=True)
# the single space is a literal, so regex matching is switched off explicitly
df['rent_disc'] = df['rent_disc'].str.replace(' ', '', regex=False)

In [None]:
# inspect which rent descriptions occur and how often
df['rent_disc'].value_counts()

In [None]:
# the rent descriptions are sorted into three payment periods
# descriptions of a weekly rent
week = [
    'perweek', 'pw', 'weekly',
    'wk', 'week', 'perweekgst',
]
# descriptions of a monthly rent
month = [
    'permonth', 'monthly', 'pcm',
    'pm', 'month',
]
# descriptions of a yearly rent
year = [
    'pa', 'perannum', 'annually',
]

In [None]:
# swap every recognised rent description for the number of days its period spans
for period_labels, period_days in ((week, 7), (month, 30), (year, 365)):
    df['rent_disc'] = df['rent_disc'].replace(period_labels, period_days)

In [None]:
# keep only the entries whose rent description was recognised
# (at this point the description column holds a number of days)
allowed_vals = [7, 30, 365]
has_known_period = df['rent_disc'].isin(allowed_vals)
df = df[has_known_period]
df['rent_disc'] = df['rent_disc'].astype('int')
# the rent number must be present and numeric before it is cast to an integer
df = df[df['rent_raw'].notna()]
has_numeric_rent = df['rent_raw'].str.isnumeric()
df = df[has_numeric_rent]
df['rent_raw'] = df['rent_raw'].astype('int')

In [None]:
# express every rent per week for consistency:
# rent per day (rent / days in its period) multiplied by 7
daily_rent = df['rent_raw'] / df['rent_disc']
df['weekly_rent'] = daily_rent * 7
# the two helper columns are no longer needed
df = df.drop(columns=['rent_disc', 'rent_raw'])

In [None]:
# the listings carry no specific day, only a year and a month name,
# so every listing is dated to the first day of its month
month_number = pd.to_datetime(df['month'], format='%B').dt.month
df['month'] = month_number
date_parts = df[['year', 'month']].assign(DAY=1)
df['list_date'] = pd.to_datetime(date_parts)
# year and month are now contained in list_date
df = df.drop(columns=['year', 'month'])

In [None]:
# preview the first rows of the processed listings
df.head()

In [None]:
# apply the filter once more so that no address with fewer than two records remains
records_per_address = df.groupby('address')['address'].transform('count')
df = df[records_per_address > 1]
len(df)

In [None]:
# number of distinct addresses, i.e. unique properties, for future investigation
unique_addresses = set(df['address'])
len(unique_addresses)

In [None]:
# save the processed listings to the curated data folder
# (index=False is not passed, so the DataFrame index is written as the first column)
df.to_csv('../data/curated/processed_listing.csv')

In [None]:
# also save the first 1000 rows as a sample file,
# which a group member who is overseas may need to use
sample_rows = df.head(1000)
sample_rows.to_csv('../data/curated/SAMPLE_processed_listing.csv')