# Preprocessing oldlisting_buy_6

In [1]:
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
import time 
start_time = time.time()

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import re
import ast

In [2]:
# Load your dataset (replace with the correct path to your CSV file)
file_path = "../data/landing/oldlistings_buy/oldlistings_buy_6.csv"
df = pd.read_csv(file_path)

# Helper functions to extract and process data
def expand_rented_prices(row):
    try:
        rent_list = ast.literal_eval(row['rented_prices'])
        rows = []
        for rent in rent_list:
            new_row = row.copy()
            new_row['rented_price'] = rent.get('price', None)
            new_row['date'] = rent.get('date', None)
            rows.append(new_row)
        return pd.DataFrame(rows)
    except (ValueError, SyntaxError):
        return pd.DataFrame([row])

def extract_from_meta_data(meta_data_str, label):
    try:
        meta_list = ast.literal_eval(meta_data_str)
        for item in meta_list:
            if item.get('label') == label:
                return item.get('quantity', None)
    except Exception:
        return None

# Apply the process to expand the first 100 rows (or all rows if needed)
expanded_rows = pd.concat([expand_rented_prices(row) for _, row in df.iterrows()], ignore_index=True)

# Extract meta_data columns for bed, bath, car, land, type
expanded_rows['bed'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bed'))
expanded_rows['bath'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bath'))
expanded_rows['car'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'car'))
expanded_rows['land'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'land'))
expanded_rows['type'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'type'))

# Keep only relevant columns
final_expanded_df = expanded_rows[['lat', 'lng', 'address', 'bed', 'bath', 'car', 'land', 'type', 'rented_price', 'date']]

# Optionally, print or view the dataframe
# print(final_expanded_df)

In [3]:
# Function to clean rented_prices
final_expanded_df['rented_price_cleaned'] = (
    final_expanded_df['rented_price']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['rented_price_cleaned'] = (


In [4]:
# Replace "O" with "0" in rented_price_cleaned
final_expanded_df['rented_price_cleaned'] = final_expanded_df['rented_price_cleaned'].str.replace('O', '0')
final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['rented_price_cleaned'] = final_expanded_df['rented_price_cleaned'].str.replace('O', '0')


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,,
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",October 2022,"$1,525,000"
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,February 2022,Auction
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",April 2011,"$989,000"
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",April 2011,"$999,000"
...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,December 2017,Under Contract
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",April 2012,"$425,000 - $455,000"
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",January 2012,"$425,000 - $455,000"
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,,


In [5]:
# Replace all non-numeric characters with NaN from rented_price_cleaned except for commas and "$" signs and "-" signs
final_expanded_df['rented_price_cleaned'] = final_expanded_df['rented_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)
final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['rented_price_cleaned'] = final_expanded_df['rented_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,,
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",October 2022,"$1,525,000"
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,February 2022,
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",April 2011,"$989,000"
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",April 2011,"$999,000"
...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,December 2017,
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",April 2012,"$425,000-$455,000"
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",January 2012,"$425,000-$455,000"
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,,


In [6]:
# Function to clean and handle range prices by calculating the average
def clean_price(price):
    if pd.isna(price):
        return price  # Return NaN as is
    # Handle price ranges like "$425,000-$455,000"
    range_match = re.match(r"\$(\d+,\d+)-\$(\d+,\d+)", price)
    if range_match:
        low_price = int(range_match.group(1).replace(',', ''))
        high_price = int(range_match.group(2).replace(',', ''))
        return (low_price + high_price) / 2  # Return the average of the range
    # Handle normal prices
    price_cleaned = re.sub(r'[^\d]', '', price)
    return int(price_cleaned) if price_cleaned.isdigit() else None

In [7]:
final_expanded_df['rented_price_cleaned'] = [clean_price(price) for price in final_expanded_df['rented_price_cleaned']]

final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['rented_price_cleaned'] = [clean_price(price) for price in final_expanded_df['rented_price_cleaned']]


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,,
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",October 2022,1525000.0
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,February 2022,
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",April 2011,989000.0
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",April 2011,999000.0
...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,December 2017,
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",April 2012,440000.0
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",January 2012,440000.0
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,,


In [8]:
final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')
final_expanded_df

  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,NaT,
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",2022-10-01,1525000.0
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,2022-02-01,
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",2011-04-01,989000.0
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",2011-04-01,999000.0
...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,2017-12-01,
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",2012-04-01,440000.0
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",2012-01-01,440000.0
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,NaT,


In [9]:
# Function to clean "bed"
final_expanded_df['bed_cleaned'] = (
    final_expanded_df['bed']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['bed_cleaned'] = (


In [10]:
# Function to clean "bath"
final_expanded_df['bath_cleaned'] = (
    final_expanded_df['bath']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [11]:
# Function to clean "car"
final_expanded_df['car_cleaned'] = (
    final_expanded_df['car']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [12]:
# Function to clean "land"
final_expanded_df['land_cleaned'] = (
    final_expanded_df['land']
    .str.replace(r'm2', '', regex=False)           # Remove "m2" unit
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [13]:
final_expanded_df

Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,NaT,,2,1,1,
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",2022-10-01,1525000.0,4,1,1,
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,2022-02-01,,4,1,1,
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",2011-04-01,989000.0,4,1,1,
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",2011-04-01,999000.0,4,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,2017-12-01,,4,2,2,
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",2012-04-01,440000.0,4,2,2,
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",2012-01-01,440000.0,4,2,2,
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,NaT,,,,,1521


In [14]:
# Function to clean "address"
final_expanded_df['address_cleaned'] = (
    final_expanded_df['address']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('𝙻', 'L')
    .str.replace('𝙺', 'K')
    .str.replace('𝙹', 'J')
    .str.replace('𝙸', 'I')
    .str.replace('𝙷', 'H')
    .str.replace('𝙶', 'G')
    .str.replace('𝙵', 'F')
    .str.replace('𝙴', 'E')
    .str.replace('𝙳', 'D')
    .str.replace('𝙲', 'C')
    .str.replace('𝙱', 'B')
    .str.replace('𝙰', 'A')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘿', 'D')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘻', 'z')
    .str.replace('𝘺', 'y')
    .str.replace('𝘹', 'x')
    .str.replace('𝘸', 'w')
    .str.replace('𝘷', 'v')
    .str.replace('𝘶', 'u')
    .str.replace('𝘵', 't')
    .str.replace('𝘴', 's')
    .str.replace('𝘳', 'r')
    .str.replace('𝘲', 'q')
    .str.replace('𝘱', 'p')
    .str.replace('𝘰', 'o')
    .str.replace('𝘯', 'n')
    .str.replace('𝘭', 'l')
    .str.replace('𝘬', 'k')
    .str.replace('𝘫', 'j')
    .str.replace('𝘪', 'i')
    .str.replace('𝘩', 'h')
    .str.replace('𝘨', 'g')
    .str.replace('𝘧', 'f')
    .str.replace('𝘦', 'e')
    .str.replace('𝘥', 'd')
    .str.replace('𝘣', 'b')
    .str.replace('𝘢', 'a')
    .str.replace('𝘡', 'Z')
    .str.replace('𝘠', 'Y')
    .str.replace('𝘟', 'X')
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [15]:
final_expanded_df

Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,rented_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned
0,,,"𝟸 / 𝟪４４-𝟪４𝟨 SYDNEY​ RОΑD, BRUNSWICK",2,\u200b1,𝟷,,,,NaT,,2,1,1,,"2 / 844-846 SYDNEY​ RОΑD, BRUNSWICK"
1,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$1,525,000",2022-10-01,1525000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
2,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,Auction,2022-02-01,,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
3,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$𝟫𝟪𝟫,OOO",2011-04-01,989000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
4,,,"211 BARKLY STREET, BRUNSWIC​K",4\u200b,1\u200b,𝟷,,House,"$99\u200b9,000",2011-04-01,999000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325432,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,Under Contract,2017-12-01,,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325433,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO \u200b- $４𝟻𝟻,OOO",2012-04-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325434,,,"2​6 PΑ𝙻ΑСΕ RОΑD, POINT​ СООK",4,2\u200b,𝟸,,,"$４𝟸𝟻,OOO - $４𝟻𝟻,OOO",2012-01-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325435,,,"𝟸𝟽𝟻 PОINT COO​K ROAD​, POINT COOK",,,,1521\xa0m2,,,NaT,,,,,1521,"275 PОINT COO​K ROAD​, POINT COOK"


In [16]:
# drop some columns
final_expanded_df = final_expanded_df.drop(columns=['rented_price', 'bed', 'bath', 'car', 'land', 'address'])
final_expanded_df

Unnamed: 0,lat,lng,type,date,rented_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned
0,,,,NaT,,2,1,1,,"2 / 844-846 SYDNEY​ RОΑD, BRUNSWICK"
1,,,House,2022-10-01,1525000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
2,,,House,2022-02-01,,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
3,,,House,2011-04-01,989000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
4,,,House,2011-04-01,999000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K"
...,...,...,...,...,...,...,...,...,...,...
325432,,,,2017-12-01,,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325433,,,,2012-04-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325434,,,,2012-01-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK"
325435,,,,NaT,,,,,1521,"275 PОINT COO​K ROAD​, POINT COOK"


In [17]:
# Extract year from date
final_expanded_df['year'] = final_expanded_df['date'].dt.year
final_expanded_df

Unnamed: 0,lat,lng,type,date,rented_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned,year
0,,,,NaT,,2,1,1,,"2 / 844-846 SYDNEY​ RОΑD, BRUNSWICK",
1,,,House,2022-10-01,1525000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2022.0
2,,,House,2022-02-01,,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2022.0
3,,,House,2011-04-01,989000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2011.0
4,,,House,2011-04-01,999000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2011.0
...,...,...,...,...,...,...,...,...,...,...,...
325432,,,,2017-12-01,,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2017.0
325433,,,,2012-04-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2012.0
325434,,,,2012-01-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2012.0
325435,,,,NaT,,,,,1521,"275 PОINT COO​K ROAD​, POINT COOK",


In [18]:
# Extract suburb from address_cleaned, which is all text after the last comma
final_expanded_df['suburb'] = final_expanded_df['address_cleaned'].str.rsplit(',').str[-1]
final_expanded_df

Unnamed: 0,lat,lng,type,date,rented_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned,year,suburb
0,,,,NaT,,2,1,1,,"2 / 844-846 SYDNEY​ RОΑD, BRUNSWICK",,BRUNSWICK
1,,,House,2022-10-01,1525000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2022.0,BRUNSWIC​K
2,,,House,2022-02-01,,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2022.0,BRUNSWIC​K
3,,,House,2011-04-01,989000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2011.0,BRUNSWIC​K
4,,,House,2011-04-01,999000.0,4,1,1,,"211 BARKLY STREET, BRUNSWIC​K",2011.0,BRUNSWIC​K
...,...,...,...,...,...,...,...,...,...,...,...,...
325432,,,,2017-12-01,,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2017.0,POINT​ СООK
325433,,,,2012-04-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2012.0,POINT​ СООK
325434,,,,2012-01-01,440000.0,4,2,2,,"2​6 PΑLΑСΕ RОΑD, POINT​ СООK",2012.0,POINT​ СООK
325435,,,,NaT,,,,,1521,"275 PОINT COO​K ROAD​, POINT COOK",,POINT COOK


In [19]:
create_dir("../data/raw/oldlistings_buy/")
final_expanded_df.to_csv("../data/raw/oldlistings_buy/oldlistings_buy_6.csv", index=False)

Directory already exists: ../data/raw/oldlistings_buy/

