In [29]:
import pandas as pd

In [30]:
files_dire = '../data/raw/'

In [31]:
df = pd.read_csv(f'{files_dire}/melbourne_past_listings.csv')
# merge the past listing with the geocode
# this should only left with properties tshat have been listed in 2021-2022
geo_df = pd.read_csv(f'{files_dire}/geo.csv').iloc[: , 1:]
df = df.merge(geo_df, on='address')
df.head()

Unnamed: 0,year,month,bed,bath,car,land_raw,type,address,suburb,code,rent_raw,url,loc_address,lat,lon
0,2021,January,3.0,1.0,1.0,,Rental_residential,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,$330,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
1,2012,February,3.0,1.0,1.0,,Rental_residential,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,$320 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
2,2012,January,3.0,1.0,1.0,,Rental_residential,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,$340 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
3,2021,January,3.0,1.0,2.0,,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,$395 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494
4,2021,January,3.0,1.0,2.0,,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,$395,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494


In [32]:
len(geo_df)

228119

In [33]:
# now review how many NaN exist in each column
df.count()

year           1670322
month          1670322
bed            1670293
bath           1670284
car            1500422
land_raw         87532
type           1355579
address        1670322
suburb         1670322
code           1670322
rent_raw       1670311
url            1670322
loc_address    1670322
lat            1670322
lon            1670322
dtype: int64

In [34]:
# we also remove the column of the size of the land as only very small number of rows have records
df = df.drop(['land_raw'], axis = 1)

In [35]:
# as the research goal is mainly residential properties (for people to live in)
# hence we filter out all properties that have zero bedroom or bathroom
temp = ['bed', 'bath', 'car']
df[temp] = df[temp].fillna(0)
df[temp] = df[temp].astype('int64')
df = df[~(df[temp] == 0).any(axis=1)]
# print out the number of entries left
len(df)

1500409

In [36]:
# from the previous output of NaN count, the types of property seems broken
df['type'].value_counts()

House                 482146
Unit/apmt             307874
Rental_residential    193945
Townhouse             117726
AvailableNow           70421
Unit                   23672
Apartment              10032
Available               5170
AvailableDate           2635
Villa                   2206
Studio                  2136
Flat                     945
Sales_residential         22
Rural                     10
Other                      9
ForSale                    5
Villa,House                3
Terrace                    3
Duplex                     1
Acreage/semi-rural         1
Name: type, dtype: int64

In [37]:
df['type'].value_counts().index

Index(['House', 'Unit/apmt', 'Rental_residential', 'Townhouse', 'AvailableNow',
       'Unit', 'Apartment', 'Available', 'AvailableDate', 'Villa', 'Studio',
       'Flat', 'Sales_residential', 'Rural', 'Other', 'ForSale', 'Villa,House',
       'Terrace', 'Duplex', 'Acreage/semi-rural'],
      dtype='object')

In [42]:
# some of the types has a really low count hence these are removed
unknown_type = [ 'Rental_residential', 'AvailableNow', 'Available', 'AvailableDate', 'Other']
remove_type = ['Villa', 'Rural', 'ForSale', 'Villa,House', 'Terrace', 'Duplex', 'Acreage/semi-rural', 'Sales_residential']

In [43]:
df = df[~df['type'].isin(remove_type)]

In [44]:
# we want to combine categories in to three: house; Apartment / Unit / Flat; townhouse; studio
df['type'].value_counts()

House        482146
AUF          342523
unknown      272180
Townhouse    117726
Studio         2136
Name: type, dtype: int64

In [45]:
# 'AUF' stands for 'Apartment / Unit / Flat'
df['type'] = df['type'].replace(['Unit/apmt', 'Apartment', 'Flat', 'Unit'], 'AUF')
df['type'] = df['type'].replace(unknown_type, 'unknown')
df['type'].value_counts()

House        482146
AUF          342523
unknown      272180
Townhouse    117726
Studio         2136
Name: type, dtype: int64

In [63]:
df.isna().sum()

bed                0
bath               0
car                0
type           84864
address            0
suburb             0
code               0
url                0
loc_address        0
lat                0
lon                0
weekly_rent        0
list_date          0
dtype: int64

In [46]:
# preliminary processing on rent
# remove the dollar sign as well as comma
df['rent_raw'] = df['rent_raw'].str.replace(',', '')
df['rent_raw'] = df['rent_raw'].str.replace('$', '')
df.head()

  df['rent_raw'] = df['rent_raw'].str.replace('$', '')


Unnamed: 0,year,month,bed,bath,car,type,address,suburb,code,rent_raw,url,loc_address,lat,lon
0,2021,January,3,1,1,unknown,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,330,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
1,2012,February,3,1,1,unknown,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,320 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
2,2012,January,3,1,1,unknown,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,340 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153
3,2021,January,3,1,2,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,395 per week,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494
4,2021,January,3,1,2,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,395,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494


In [47]:
# split the rent to a column of number and another for description
df['rent_raw'] = df['rent_raw'].str.lower()
df['rent_disc'] = df['rent_raw'].str.extract('(\D+)')
df['rent_raw'] = df['rent_raw'].str.extract('(\d+)')

In [48]:
# deduplicate entries again as some entries have slight difference such as 'per week' and 'pw'
deduplicate_subset = ['year', 'month', 'bed', 'bath', 'car', 'type', 'address', 'suburb', 'code', 'rent_raw']

In [49]:
df = df.drop_duplicates(subset=deduplicate_subset, keep=False)
# after deduplication, only properties listed more than once are retained
df = df[df.groupby('address').address.transform('count') > 1]

In [50]:
# remove all punctuation and space for further convinience
df['rent_disc'] = df['rent_disc'].str.replace(r'[^\w\s]+', '')
df['rent_disc'] = df['rent_disc'].str.replace(' ', '')

  df['rent_disc'] = df['rent_disc'].str.replace(r'[^\w\s]+', '')


In [51]:
df['rent_disc'].value_counts()

perweek          244992
pw               152125
weekly            38806
                  32396
wk                 4530
                  ...  
millionweekly         1
ono                   1
justlisted            1
fixed                 1
opentooffers          1
Name: rent_disc, Length: 88, dtype: int64

In [52]:
# sort rent into three types
week = ['perweek', 'pw', 'weekly', 'wk', 'week', 'perweekgst']
month = ['permonth', 'monthly', 'pcm', 'pm', 'month']
year = ['pa', 'perannum', 'annually']

In [53]:
df['rent_disc'] = df['rent_disc'].replace(week, 7)
df['rent_disc'] = df['rent_disc'].replace(month, 30)
df['rent_disc'] = df['rent_disc'].replace(year, 365)

In [54]:
# remove entries without proper rent descriptions
allowed_vals = [7, 30, 365]
df = df[df['rent_disc'].isin(allowed_vals)]
df['rent_disc'] = df['rent_disc'].astype('int')
df = df[~df['rent_raw'].isnull()]
df = df[df['rent_raw'].str.isnumeric()]
df['rent_raw'] = df['rent_raw'].astype('int')

In [55]:
# convert all rent into rent per week for consistency
df['weekly_rent'] =  df['rent_raw'] / df['rent_disc'] * 7
df = df.drop(['rent_disc', 'rent_raw'], axis=1)

In [56]:
# as all listing does not have a specific date
# all dates are assumed to be the first date of each month
# df['day'] = 1
df['month'] = pd.to_datetime(df.month, format='%B').dt.month
df['list_date'] = pd.to_datetime(df[['year', 'month']].assign(DAY=1))
df = df.drop(['year', 'month'], axis=1)

In [62]:
df.head()

Unnamed: 0,bed,bath,car,type,address,suburb,code,url,loc_address,lat,lon,weekly_rent,list_date
1,3,1,1,unknown,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153,320.0,2012-02-01
2,3,1,1,unknown,"1/31 DANDENONG ROAD EAST, FRANKSTON",Frankston,3199,https://www.oldlistings.com.au/real-estate/VIC...,"Dandenong Road East, Frankston, Melbourne, Cit...",-38.12989,145.132153,340.0,2012-01-01
9,3,1,2,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494,350.0,2016-12-01
10,3,1,2,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494,330.0,2016-12-01
11,3,1,2,House,"4 LUCERNE CRESCENT, FRANKSTON",Frankston,3199,https://www.oldlistings.com.au/real-estate/VIC...,"Lucerne Crescent, Frankston, Melbourne, City o...",-38.146375,145.166494,330.0,2016-01-01


In [58]:
# check once again to make sure there is no duplicate and less than 2 record
df = df[df.groupby('address').address.transform('count') > 1]
len(df)

417344

In [59]:
# check the number of unique properties in future investigation
len(list(set(df['address'].tolist())))

113309

In [60]:
df.to_csv('../data/curated/processed_listing.csv')

In [61]:
df.iloc[:1000, :].to_csv('../data/curated/SAMPLE_processed_listing.csv')