In [2]:
import pandas as pd
import json

In [3]:
# Load the listings JSON. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('../../data/raw/example_1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Path of the historical rent workbook; passed to process_rent_data further down.
file_path = '../../data/raw/rent_history.xlsx'
# NOTE(review): this reads only the workbook's first sheet, and `df` is rebuilt
# from `data` in a later cell before it is ever read - candidate for removal.
df = pd.read_excel(file_path)

# Clean Historical dataset - Processing
- Fix column names
- Zone detection
- Reshape the Data
- Melt
- Extract Year and Month
- Pivot
- Concatenation

In [4]:
# clean historical rent dataset

def _tidy_rent_sheet(df):
    """Reshape one raw rent-history sheet into a tidy table.

    Expected layout of ``df`` (as produced by reading a sheet with ``skiprows=1``):

    * column 0 holds the zone label, filled only on some rows,
    * column 1 holds the location name,
    * the remaining columns come in pairs under one period header (the second
      header of each pair is read by pandas as ``'Unnamed: N'``),
    * row 0 holds the 'Count' / 'Median' sub-header for every period column.

    Returns a DataFrame with columns Location, Year, Month, Zone, Count, Median.
    """
    # Give every 'Unnamed' period column the header of the column to its left,
    # so both columns of a pair carry the same period label. The first two
    # columns (zone, location) always keep their original names.
    columns = []
    for idx, col in enumerate(df.columns):
        if idx >= 2 and 'Unnamed' in str(col):
            columns.append(columns[-1])
        else:
            columns.append(col)
    df = df.copy()
    df.columns = columns

    # A zone label is taken from rows that have both a zone and a location,
    # then carried forward to the rows below it.
    zone_labels = df.iloc[:, 0]
    has_zone_and_location = zone_labels.notna() & df.iloc[:, 1].notna()
    has_zone_and_location.iloc[:1] = False  # row 0 is the Count/Median sub-header
    df['Zone'] = zone_labels.where(has_zone_and_location).ffill()

    # Drop the zone-label column; what remains is location, periods..., Zone.
    df = df.iloc[:, 1:]

    # Split off the Count/Median sub-header row and fold it into the column names.
    type_row = df.iloc[0, 1:-1]
    periods = df.columns[1:-1]
    df = df.iloc[1:].reset_index(drop=True)
    df.columns = (
        ['Location']
        + [f'{period}_{kind}' for period, kind in zip(periods, type_row)]
        + ['Zone']
    )

    # Wide -> long: one row per (location, period, Count/Median).
    df_melted = df.melt(
        id_vars=['Location', 'Zone'],
        var_name='YearMonth_Type',
        value_name='Value',
    )
    labels = df_melted['YearMonth_Type']
    df_melted['Year'] = labels.str.extract(r'(\d{4})', expand=False)
    df_melted['Month'] = labels.str.extract(r'([A-Za-z]+)', expand=False)
    df_melted['Type'] = labels.str.extract(r'_(Count|Median)', expand=False)
    df_melted = df_melted.drop(columns=['YearMonth_Type'])

    # Long -> tidy: Count and Median become separate columns.
    return df_melted.pivot_table(
        index=['Location', 'Year', 'Month', 'Zone'],
        columns='Type',
        values='Value',
        aggfunc='first',
    ).reset_index()


def process_rent_data(file_path):
    """Clean the multi-sheet historical rent workbook into one tidy DataFrame.

    Only sheets whose name is a known property type are processed; every other
    sheet is ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        Columns Location, Year, Month, Zone, Count, Median, Bed, Apartment,
        with the rows of all processed sheets stacked in workbook order.

    Raises
    ------
    ValueError
        If the workbook contains none of the expected sheets.
    """
    # Sheet name -> (number of bedrooms, apartment flag: 1 for flats, 0 for houses)
    sheet_info = {
        '1 bedroom flat': (1, 1),
        '2 bedroom flat': (2, 1),
        '3 bedroom flat': (3, 1),
        '2 bedroom house': (2, 0),
        '3 bedroom house': (3, 0),
        '4 bedroom house': (4, 0),
    }

    all_sheets_data = []

    # Open the workbook once and close it when done; sheets are parsed from the
    # open handle instead of re-opening the file for every sheet.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            if sheet_name not in sheet_info:
                continue

            # The first spreadsheet row is metadata, so it is skipped.
            raw_sheet = xls.parse(sheet_name, skiprows=1)
            df_pivot = _tidy_rent_sheet(raw_sheet)

            bedrooms, is_apartment = sheet_info[sheet_name]
            df_pivot['Bed'] = bedrooms
            df_pivot['Apartment'] = is_apartment

            all_sheets_data.append(df_pivot)

    if not all_sheets_data:
        raise ValueError(
            f"No rent sheets found in {file_path!r}; expected at least one of: "
            f"{', '.join(sheet_info)}"
        )

    return pd.concat(all_sheets_data, ignore_index=True)

# Run the cleaning pipeline on the workbook referenced by `file_path`
processed_data = process_rent_data(file_path)

# Preview the first three rows of the combined result
processed_data.head(3)

Type,Location,Year,Month,Zone,Count,Median,Bed,Apartment
0,Albert Park-Middle Park-West St Kilda,2000,Dec,Inner Melbourne,369,175,1,1
1,Albert Park-Middle Park-West St Kilda,2000,Jun,Inner Melbourne,347,165,1,1
2,Albert Park-Middle Park-West St Kilda,2000,Mar,Inner Melbourne,352,165,1,1


In [5]:
# Keep the rows labelled exactly 'Group Total' aside, then drop every row whose
# location mentions 'Group Total' from the main table.
group_total = processed_data.loc[processed_data['Location'].eq('Group Total')]
processed_data = processed_data.loc[
    ~processed_data['Location'].str.contains('Group Total', na=False)
]

In [6]:
# Discard rows whose median is the '-' placeholder
processed_data = processed_data.loc[processed_data['Median'].ne('-')]

In [7]:
# Output location for the cleaned historical rent data (landing folder).
# A dedicated name is used so this cell does not overwrite `file_path`, which
# holds the Excel input path: re-running the processing cell afterwards would
# otherwise try to read this CSV as a workbook.
history_rent_csv_path = '../../data/landing/history_rent.csv'

# Save as CSV without the positional index
processed_data.to_csv(history_rent_csv_path, index=False)

- This code efficiently handles messy, multi-sheet Excel data by standardizing column names, handling missing data, and reshaping the data for easier analysis.
- The final dataset is in a tidy format, which is suitable for further analysis, such as time series analysis of rental price trends or comparisons between different property types and locations.

# Process Current data
- Transpose the DataFrame
- Extract Number of Bedrooms
- Define Function to Convert Rent to Weekly Rent
- Apply the Conversion Function
- Remove Rows with Missing Weekly Rent
- Remove Unnecessary Columns

In [8]:
# Build a DataFrame from the parsed JSON; as the preview below shows, every
# top-level key (a listing URL) becomes a column and every attribute a row.
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,https://www.domain.com.au/1208-50-albert-street-south-melbourne-vic-3205-17193635,https://www.domain.com.au/64-mills-street-albert-park-vic-3206-16254842,https://www.domain.com.au/11-barnato-st-weir-views-vic-3338-16482891,https://www.domain.com.au/1104-70-southbank-boulevard-southbank-vic-3006-15846616,https://www.domain.com.au/167-charman-road-beaumaris-vic-3193-17193619,https://www.domain.com.au/10-135-ormond-esplanade-elwood-vic-3184-17193581,https://www.domain.com.au/2-107-addison-street-elwood-vic-3184-17193576,https://www.domain.com.au/3-18-joyce-street-elwood-vic-3184-17193573,https://www.domain.com.au/2-435-st-kilda-st-elwood-vic-3184-17171549,https://www.domain.com.au/305-101-bay-street-port-melbourne-vic-3207-17193493,...,https://www.domain.com.au/96-macclesfield-rd-monbulk-vic-3793-16994018,https://www.domain.com.au/44-kalbian-drive-clyde-north-vic-3978-16430829,https://www.domain.com.au/1-wordsworth-street-st-kilda-vic-3182-16956336,https://www.domain.com.au/13-adelaide-st-blairgowrie-vic-3942-15791609,https://www.domain.com.au/160-moore-rd-sunbury-vic-3429-16902107,https://www.domain.com.au/176-scotchmans-rd-drysdale-vic-3222-16884014,https://www.domain.com.au/485-punt-road-south-yarra-vic-3141-16773920,https://www.domain.com.au/4-vital-drive-tarneit-vic-3029-16625114,https://www.domain.com.au/14-sheffield-way-keysborough-vic-3173-15470011,https://www.domain.com.au/4-ocean-park-drive-marengo-vic-3233-13210948
name,"1208/50 Albert Street, South Melbourne VIC 3205","64 Mills Street, Albert Park VIC 3206","11 Barnato St, Weir Views VIC 3338","1104/70 Southbank Boulevard, Southbank VIC 3006","167 Charman Road, Beaumaris VIC 3193","10/135 Ormond Esplanade, Elwood VIC 3184","2/107 Addison Street, Elwood VIC 3184","3/18 Joyce Street, Elwood VIC 3184","2/435 St Kilda St, Elwood VIC 3184","305/101 Bay Street, Port Melbourne VIC 3207",...,"96 Macclesfield Rd, Monbulk VIC 3793","44 Kalbian Drive, Clyde North VIC 3978","1 Wordsworth Street, St Kilda VIC 3182","13 Adelaide St, Blairgowrie VIC 3942","160 Moore rd, Sunbury VIC 3429","176 Scotchmans Rd, Drysdale VIC 3222","485 Punt Road, South Yarra VIC 3141","4 Vital drive, Tarneit VIC 3029","14 Sheffield Way, Keysborough VIC 3173","4 Ocean Park Drive, Marengo VIC 3233"
cost_text,$520 pw,$1495.00,$460 per week ($1999 pcm),$420,$950.00,"$1,200 weekly",$895 weekly,$870 weekly,$675 weekly,$595 weekly,...,$750 pw,$850 Per Week,"$2,500",$850 weekly,$1100 Per Week,"$1,800 weekly","$1,450.00",$550,$625 / wk,from $300 per night
rooms,"[1 Bed, 1 Bath]","[3 Beds, 1 Bath]","[4 Beds, 2 Baths]","[1 Bed, 1 Bath]","[4 Beds, 3 Baths]","[3 Beds, 2 Baths]","[3 Beds, 2 Baths]","[2 Beds, 2 Baths]","[2 Beds, 1 Bath]","[1 Bed, 1 Bath]",...,"[5 Beds, 2 Baths]","[5 Beds, 3 Baths]","[5 Beds, 3 Baths]","[5 Beds, 3 Baths]","[5 Beds, 2 Baths]","[7 Beds, 4 Baths]","[6 Beds, 2 Baths]","[5 Beds, 2 Baths]","[5 Beds, 2 Baths]","[5 Beds, 2 Baths]"


In [9]:
# Swap rows and columns so each listing (URL) is a row and each attribute a column.
# The frame is rebuilt from `data` instead of transposing `df` in place, so
# re-running this cell cannot flip the table back to its original orientation.
df = pd.DataFrame(data).transpose()

In [10]:
# Pull the bedroom count out of the 'rooms' text (the digits in front of 'Bed'/'Beds')
df['Bed'] = df['rooms'].astype(str).str.extract(r'(\d+)\s*Beds?', expand=False)

In [11]:
import re

# convert rent to weekly rent
def convert_to_weekly(cost_text):
    """Convert a free-text rental price into a weekly amount.

    Parameters
    ----------
    cost_text : object
        Price text such as ``'$520 pw'`` or ``'$460 per week ($1999 pcm)'``.
        Non-string values (``None``, ``NaN``, plain numbers) are accepted and
        interpreted through their string form.

    Returns
    -------
    float or None
        The weekly rent, or ``None`` when the text contains no number.

    Rules, in priority order:

    1. a weekly marker is present  -> the first number is the weekly rent;
    2. 'fortnight' is present      -> the first number divided by 2;
    3. a monthly marker is present -> the last number * 12 / 365 * 7;
    4. 'per night' is present      -> the first number * 7;
    5. otherwise                   -> the first number is assumed to be weekly.
    """
    # str() keeps missing values (None / NaN) from raising on .lower(); their
    # string forms contain no digits, so they fall through to the None return.
    text = str(cost_text).lower().replace(',', '')

    # Every number in the text, in order of appearance
    amounts = [float(amount) for amount in re.findall(r'(\d+\.?\d*)', text)]
    if not amounts:
        return None

    # An explicit weekly price wins over any other quoted period
    weekly_markers = ('pw', 'per week', 'weekly', '/ wk', 'p/w')
    if any(marker in text for marker in weekly_markers):
        return amounts[0]

    # Fortnightly price: half of it is the weekly rent
    if 'fortnight' in text:
        return amounts[0] / 2

    # Monthly price: annualise, then scale to a 7-day week
    if 'pcm' in text or 'per month' in text:
        return (amounts[-1] * 12) / 365 * 7

    # Nightly price
    if 'per night' in text:
        return amounts[0] * 7

    # No period given: treat the first number as a weekly rent
    return amounts[0]

# Derive a weekly rent for every listing from its free-text price
df['weekly_rent'] = df['cost_text'].map(convert_to_weekly)

# Keep only the listings for which a weekly rent could be determined
df = df.loc[df['weekly_rent'].notna()]

In [12]:
# Replace the URL index with a plain positional one and remove the columns
# that are no longer needed
df = df.reset_index(drop=True).drop(columns=['cost_text', 'rooms', 'desc', 'parking'])

In [13]:
# Preview the cleaned listings: name, bedroom count and weekly rent
df.head(3)

Unnamed: 0,name,Bed,weekly_rent
0,"1208/50 Albert Street, South Melbourne VIC 3205",1,520.0
1,"64 Mills Street, Albert Park VIC 3206",3,1495.0
2,"11 Barnato St, Weir Views VIC 3338",4,460.0


In [14]:
# Output location for the cleaned current listings (raw data folder).
# A dedicated name is used so this cell does not overwrite `file_path`, which
# holds the Excel input path used by the historical processing cell.
location_rent_csv_path = '../../data/raw/location_rent.csv'

# Save as CSV without the positional index
df.to_csv(location_rent_csv_path, index=False)

- The core functionality of this code is to extract useful information from the raw rental data and convert it into a standardized weekly rental format. It also cleans the dataset and optimizes the structure of the data frame by removing invalid records and redundant columns.