In [1]:
# imports libraries
import re
import pandas as pd

In [2]:
# read in the csv file of property listings written by the earlier
# property-feature cleaning step (path is relative to this notebook)
path = "../data/curated/cleaned_property_with_features.csv"
property_data = pd.read_csv(path)

In [3]:
# check the shape (rows, columns) of the property data as loaded
property_data.shape

(14819, 13)

In [5]:
# Convert the free-text "rent" column into an integer weekly rent.
#
# Each listing's price string is classified by the billing period it mentions
# (annual, fortnightly, nightly, monthly, otherwise weekly) and scaled to a
# weekly amount. Listings without any number get 0 so that they can be
# filtered out afterwards.

# amount that follows a dollar sign, e.g. "$450", "$1,200", "$450.50"
DOLLAR_AMOUNT = r"[$](\d+\.?,?\d*)"
# fallback for prices written without a dollar sign
BARE_AMOUNT = r"(\d+\.?,?\d*)"
# weekly marker such as "pw", "p/w", "p w" or "per week"
WEEKLY_MARKER = r"(p([a-z]*|\/*)\s?w)"

ANNUAL_MARKERS = ("p.a.", "annually", "pa.")
MONTHLY_MARKERS = ("pcm", "pm", "month")


def parse_weekly_rent(raw_price):
    """Return the weekly rent described by one raw price string.

    Parameters
    ----------
    raw_price : str or missing value
        Price text of a listing, e.g. "$450 pw", "$1,950 pcm",
        "$400 - $450" or "$120 per night".

    Returns
    -------
    float
        The weekly rent, or 0.0 when the value is missing or contains
        no number at all.
    """
    # a missing rent cannot be lower-cased; treat it like "no price data"
    if pd.isna(raw_price):
        return 0.0
    price = str(raw_price).lower()

    # prefer amounts written with a dollar sign; other digits in the text
    # are only used when no dollar amount exists
    dollar_amounts = re.findall(DOLLAR_AMOUNT, price)
    amounts = dollar_amounts or re.findall(BARE_AMOUNT, price)
    if not amounts:
        return 0.0
    first = float(amounts[0].replace(",", ""))

    # annual rent -> 52 weeks per year
    if any(marker in price for marker in ANNUAL_MARKERS):
        return first / 52

    # fortnightly rent; tested before "night" because "fortnight" contains
    # "night" and would otherwise be treated as a nightly price
    if "fortnight" in price:
        return first / 2

    # nightly rent -> 7 nights per week
    if "night" in price:
        return first * 7

    # monthly rent
    if any(marker in price for marker in MONTHLY_MARKERS):
        # re.search returns None when there is no weekly marker; comparing
        # the list from re.findall with "" was always True, which left the
        # monthly conversion below unreachable
        if re.search(WEEKLY_MARKER, price):
            # the text also quotes a weekly price: use the first amount as is
            return first
        # only a monthly price: approximate a month as 30 days
        return (first / 30) * 7

    # weekly rent given as a range of exactly two dollar amounts
    if len(dollar_amounts) == 2:
        num1 = int(float(dollar_amounts[0].replace(",", "")))
        num2 = int(float(dollar_amounts[1].replace(",", "")))
        # when the two amounts differ in number of digits and the second is
        # more than twice the first, keep the first amount; otherwise use
        # the mean of the two
        if len(str(num1)) != len(str(num2)) and num1 * 2 < num2:
            return float(num1)
        return (num1 + num2) / 2

    # plain weekly rent
    return first


# get the rent price data and clean it to gain the weekly rent; mapping the
# column in one step avoids chained assignment (rent_df_new["rent"][i] = ...),
# which pandas does not guarantee to write back into the frame
rent_df_new = pd.DataFrame()
rent_df_new["rent"] = property_data["rent"].map(parse_weekly_rent)

# transfer the rent data type to integer (fractions of a dollar are truncated)
rent_df_new = rent_df_new.astype(int)

In [6]:
# check the shape of the processed rent frame (it holds the single "rent" column)
rent_df_new.shape

(14819, 1)

In [7]:
# add the processed rent data into the whole data set as the new
# "rent_weekly" column; the original "rent" text column is left untouched
property_data["rent_weekly"] = rent_df_new["rent"]

In [8]:
# check the shape of the property data after adding the rent_weekly column
property_data.shape

(14819, 14)

In [10]:
# drop the listings whose weekly rent is zero
has_rent = property_data["rent_weekly"].ne(0)
drop_zero_rent = property_data.loc[has_rent]

In [11]:
# check the dataframe shape after removing the zero-rent rows
drop_zero_rent.shape

(14458, 14)

In [13]:
# save the cleaned rent dataframe as csv; index=False is the documented way to
# leave the row index out of the file (index=None only worked by being falsy)
drop_zero_rent.to_csv("../data/curated/cleaned_property_with_rent.csv", index=False)