# Preprocessing oldlisting_buy_6

In [1]:
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
import time 
start_time = time.time()

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import re
import ast

In [2]:
# Load your dataset (replace with the correct path to your CSV file)
file_path = "../data/landing/oldlistings_buy/oldlistings_buy_6.csv"
df = pd.read_csv(file_path)

# Helper functions to extract and process data
def expand_rented_prices(row):
    try:
        rent_list = ast.literal_eval(row['rented_prices'])
        rows = []
        for rent in rent_list:
            new_row = row.copy()
            new_row['rented_price'] = rent.get('price', None)
            new_row['date'] = rent.get('date', None)
            rows.append(new_row)
        return pd.DataFrame(rows)
    except (ValueError, SyntaxError):
        return pd.DataFrame([row])

def extract_from_meta_data(meta_data_str, label):
    try:
        meta_list = ast.literal_eval(meta_data_str)
        for item in meta_list:
            if item.get('label') == label:
                return item.get('quantity', None)
    except Exception:
        return None

# Apply the process to expand the first 100 rows (or all rows if needed)
expanded_rows = pd.concat([expand_rented_prices(row) for _, row in df.iterrows()], ignore_index=True)

# Extract meta_data columns for bed, bath, car, land, type
expanded_rows['bed'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bed'))
expanded_rows['bath'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bath'))
expanded_rows['car'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'car'))
expanded_rows['land'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'land'))
expanded_rows['type'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'type'))

# Keep only relevant columns
final_expanded_df = expanded_rows[['lat', 'lng', 'address', 'bed', 'bath', 'car', 'land', 'type', 'rented_price', 'date']]

# Optionally, print or view the dataframe
# print(final_expanded_df)

In [3]:
# Function to clean rented_prices
final_expanded_df['property_price_cleaned'] = (
    final_expanded_df['rented_price']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = (


In [4]:
# Replace "O" with "0" in rented_price_cleaned
final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace('O', '0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace('O', '0')


In [5]:
# Replace all non-numeric characters with NaN from rented_price_cleaned except for commas and "$" signs and "-" signs
final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)


In [6]:
# Function to clean and handle range prices by calculating the average
def clean_price(price):
    if pd.isna(price):
        return price  # Return NaN as is
    # Handle price ranges like "$425,000-$455,000"
    range_match = re.match(r"\$(\d+,\d+)-\$(\d+,\d+)", price)
    if range_match:
        low_price = int(range_match.group(1).replace(',', ''))
        high_price = int(range_match.group(2).replace(',', ''))
        return (low_price + high_price) / 2  # Return the average of the range
    # Handle normal prices
    price_cleaned = re.sub(r'[^\d]', '', price)
    return int(price_cleaned) if price_cleaned.isdigit() else None

In [7]:
final_expanded_df['property_price_cleaned'] = [clean_price(price) for price in final_expanded_df['property_price_cleaned']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = [clean_price(price) for price in final_expanded_df['property_price_cleaned']]


In [8]:
final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')

  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')


In [9]:
# Function to clean "bed"
final_expanded_df['bed_cleaned'] = (
    final_expanded_df['bed']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['bed_cleaned'] = (


In [10]:
# Function to clean "bath"
final_expanded_df['bath_cleaned'] = (
    final_expanded_df['bath']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [11]:
# Function to clean "car"
final_expanded_df['car_cleaned'] = (
    final_expanded_df['car']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [12]:
# Function to clean "address"
final_expanded_df['address_cleaned'] = (
    final_expanded_df['address']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    .str.replace('𝙻', 'L')
    .str.replace('𝙺', 'K')
    .str.replace('𝙹', 'J')
    .str.replace('𝙸', 'I')
    .str.replace('𝙷', 'H')
    .str.replace('𝙶', 'G')
    .str.replace('𝙵', 'F')
    .str.replace('𝙴', 'E')
    .str.replace('𝙳', 'D')
    .str.replace('𝙲', 'C')
    .str.replace('𝙱', 'B')
    .str.replace('𝙰', 'A')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘿', 'D')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘻', 'z')
    .str.replace('𝘺', 'y')
    .str.replace('𝘹', 'x')
    .str.replace('𝘸', 'w')
    .str.replace('𝘷', 'v')
    .str.replace('𝘶', 'u')
    .str.replace('𝘵', 't')
    .str.replace('𝘴', 's')

)

In [13]:
# Function to clean "address"
final_expanded_df['address_cleaned'] = (
    final_expanded_df['address_cleaned']
    .str.replace('𝘳', 'r')
    .str.replace('𝘲', 'q')
    .str.replace('𝘱', 'p')
    .str.replace('𝘰', 'o')
    .str.replace('𝘯', 'n')
    .str.replace('𝘭', 'l')
    .str.replace('𝘬', 'k')
    .str.replace('𝘫', 'j')
    .str.replace('𝘪', 'i')
    .str.replace('𝘩', 'h')
    .str.replace('𝘨', 'g')
    .str.replace('𝘧', 'f')
    .str.replace('𝘦', 'e')
    .str.replace('𝘥', 'd')
    .str.replace('𝘣', 'b')
    .str.replace('𝘢', 'a')
    .str.replace('𝘡', 'Z')
    .str.replace('𝘠', 'Y')
    .str.replace('𝘟', 'X')
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    

)

In [14]:
# drop some columns
final_expanded_df = final_expanded_df.drop(columns=['rented_price', 'bed', 'bath', 'car', 'land', 'address'])
 
# Extract year from date
final_expanded_df['year'] = final_expanded_df['date'].dt.year

# Extract suburb from address_cleaned, which is all text after the last comma
final_expanded_df['suburb'] = final_expanded_df['address_cleaned'].str.rsplit(',').str[-1]

# Remove all type that are not 'House' or 'Unit/ampt'
final_expanded_df = final_expanded_df[final_expanded_df['type'].isin(['House', 'Unit/apmt'])]

# Remove all rows with NaN in 'property_price_cleaned'
final_expanded_df = final_expanded_df[final_expanded_df['property_price_cleaned'].notna()]

In [15]:
final_expanded_df['bed_cleaned'] = pd.to_numeric(final_expanded_df['bed_cleaned'], errors='coerce')
final_expanded_df['bath_cleaned'] = pd.to_numeric(final_expanded_df['bath_cleaned'], errors='coerce')
final_expanded_df['car_cleaned'] = pd.to_numeric(final_expanded_df['car_cleaned'], errors='coerce')

In [16]:
# In the column 'property_price_cleaned', remove the rows that digits are more than 10
final_expanded_df = final_expanded_df[final_expanded_df['property_price_cleaned'].astype(str).str.len() <= 10]
final_expanded_df

Unnamed: 0,lat,lng,type,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,address_cleaned,year,suburb
1,,,House,2022-10-01,1525000.0,4.0,1.0,1.0,"211 BARKLY STREET, BRUNSWIC​K",2022.0,BRUNSWIC​K
3,,,House,2011-04-01,989000.0,4.0,1.0,1.0,"211 BARKLY STREET, BRUNSWIC​K",2011.0,BRUNSWIC​K
4,,,House,2011-04-01,999000.0,4.0,1.0,1.0,"211 BARKLY STREET, BRUNSWIC​K",2011.0,BRUNSWIC​K
5,,,Unit/apmt,2022-07-01,525000.0,2.0,2.0,2.0,"616/601 SYDNEY ROAD, BRUNSWICK",2022.0,BRUNSWICK
6,,,Unit/apmt,2018-02-01,610000.0,2.0,2.0,2.0,"616/601 SYDNEY ROAD, BRUNSWICK",2018.0,BRUNSWICK
...,...,...,...,...,...,...,...,...,...,...,...
325408,,,House,2018-01-01,590000.0,4.0,2.0,2.0,"58 SPΕСTΑСLΕ СRΕSСΕNT, PОINT СООK",2018.0,PОINT СООK
325409,,,House,2018-01-01,550000.0,4.0,2.0,2.0,"​12 YOSEMI​TE STRΕΕT , PОINT СООK",2018.0,PОINT СООK
325411,,,House,2017-04-01,270000.0,4.0,2.0,2.0,"​12 YOSEMI​TE STRΕΕT , PОINT СООK",2017.0,PОINT СООK
325412,,,House,2018-01-01,645000.0,4.0,2.0,2.0,"5/32 SANDLEWOOD LANE, POINT COOK",2018.0,POINT COOK


In [17]:
create_dir("../data/raw/oldlistings_buy/")
final_expanded_df.to_csv("../data/raw/oldlistings_buy/oldlistings_buy_6.csv", index=False)

Directory already exists: ../data/raw/oldlistings_buy/



In [18]:
# Calculate average property price, average bed amount, average bath amount, average car amount by year, suburb
final_expanded_df_avg = final_expanded_df.groupby(['year', 'suburb']).agg(
    avg_property_price=('property_price_cleaned', 'mean'),
    avg_bed=('bed_cleaned', 'mean'),
    avg_bath=('bath_cleaned', 'mean'),
    avg_car=('car_cleaned', 'mean')
).reset_index()
final_expanded_df_avg

Unnamed: 0,year,suburb,avg_property_price,avg_bed,avg_bath,avg_car
0,2006.0,ALPHINGTON,680000.0,3.0,2.0,2.0
1,2006.0,ALTONA MEADOWS,251250.0,3.0,1.5,1.0
2,2006.0,ARDEER,220000.0,3.0,1.0,6.0
3,2006.0,ARMADALE,315000.0,3.0,1.0,1.0
4,2006.0,ASCOT VALE,500000.0,,2.0,2.0
...,...,...,...,...,...,...
30470,2024.0,​SOUTHBANK,383000.0,1.0,1.0,
30471,2024.0,​WERRIBEE,432000.0,4.0,,2.0
30472,2024.0,​WEST MELBOURNE​,577500.0,2.0,1.0,1.0
30473,2024.0,​WODONGA,600000.0,3.0,2.0,2.0


In [19]:
create_dir("../data/raw/oldlistings_buy/")
final_expanded_df_avg.to_csv("../data/raw/oldlistings_buy/oldlistings_buy_6_avg.csv", index=False)

Directory already exists: ../data/raw/oldlistings_buy/

