# Preprocessing oldlisting_buy_0

In [1]:
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
import time 
start_time = time.time()

import pandas as pd
import re
import ast

In [2]:
# Load your dataset (replace with the correct path to your CSV file)
file_path = "../data/landing/oldlistings_buy/oldlistings_buy_0.csv"
df = pd.read_csv(file_path)

# Helper functions to extract and process data
def expand_rented_prices(row):
    try:
        rent_list = ast.literal_eval(row['rented_prices'])
        rows = []
        for rent in rent_list:
            new_row = row.copy()
            new_row['rented_price'] = rent.get('price', None)
            new_row['date'] = rent.get('date', None)
            rows.append(new_row)
        return pd.DataFrame(rows)
    except (ValueError, SyntaxError):
        return pd.DataFrame([row])

def extract_from_meta_data(meta_data_str, label):
    try:
        meta_list = ast.literal_eval(meta_data_str)
        for item in meta_list:
            if item.get('label') == label:
                return item.get('quantity', None)
    except Exception:
        return None

# Apply the process to expand the first 100 rows (or all rows if needed)
expanded_rows = pd.concat([expand_rented_prices(row) for _, row in df.iterrows()], ignore_index=True)

# Extract meta_data columns for bed, bath, car, land, type
expanded_rows['bed'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bed'))
expanded_rows['bath'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'bath'))
expanded_rows['car'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'car'))
expanded_rows['land'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'land'))
expanded_rows['type'] = expanded_rows['meta_data'].apply(lambda x: extract_from_meta_data(x, 'type'))

# Keep only relevant columns
final_expanded_df = expanded_rows[['lat', 'lng', 'address', 'bed', 'bath', 'car', 'land', 'type', 'rented_price', 'date']]

# Optionally, print or view the dataframe
# print(final_expanded_df)

In [3]:
# Function to clean rented_prices
final_expanded_df['property_price_cleaned'] = (
    final_expanded_df['rented_price']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = (


In [4]:
# Replace "O" with "0" in rented_price_cleaned
final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace('O', '0')
final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace('O', '0')


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",April 2013,"$340,00​0"
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",November 2011,"$340,000"
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",April 2013,"$499,000"
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",November 2011,"$​499,000"
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",April 2013,"$65,000"
...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",July 2012,"$329,000"
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",July 2012,"$369,000"
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",July 2012,"$115,000 - $125,000"
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",July 2012,"$359,000"


In [5]:
# Replace all non-numeric characters with NaN from rented_price_cleaned except for commas and "$" signs and "-" signs
final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)
final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = final_expanded_df['property_price_cleaned'].str.replace(r'[^0-9,$-]', '', regex=True)


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",April 2013,"$340,000"
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",November 2011,"$340,000"
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",April 2013,"$499,000"
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",November 2011,"$499,000"
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",April 2013,"$65,000"
...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",July 2012,"$329,000"
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",July 2012,"$369,000"
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",July 2012,"$115,000-$125,000"
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",July 2012,"$359,000"


In [6]:
# Function to clean and handle range prices by calculating the average
def clean_price(price):
    if pd.isna(price):
        return price  # Return NaN as is
    # Handle price ranges like "$425,000-$455,000"
    range_match = re.match(r"\$(\d+,\d+)-\$(\d+,\d+)", price)
    if range_match:
        low_price = int(range_match.group(1).replace(',', ''))
        high_price = int(range_match.group(2).replace(',', ''))
        return (low_price + high_price) / 2  # Return the average of the range
    # Handle normal prices
    price_cleaned = re.sub(r'[^\d]', '', price)
    return int(price_cleaned) if price_cleaned.isdigit() else None

In [7]:
final_expanded_df['property_price_cleaned'] = [clean_price(price) for price in final_expanded_df['property_price_cleaned']]

final_expanded_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['property_price_cleaned'] = [clean_price(price) for price in final_expanded_df['property_price_cleaned']]


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",April 2013,340000.0
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",November 2011,340000.0
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",April 2013,499000.0
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",November 2011,499000.0
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",April 2013,65000.0
...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",July 2012,329000.0
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",July 2012,369000.0
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",July 2012,120000.0
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",July 2012,359000.0


In [8]:
final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')
final_expanded_df

  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_expanded_df['date'] = pd.to_datetime(final_expanded_df['date'], errors='coerce')


Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",2013-04-01,340000.0
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",2011-11-01,340000.0
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",2013-04-01,499000.0
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",2011-11-01,499000.0
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",2013-04-01,65000.0
...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",2012-07-01,329000.0
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",2012-07-01,369000.0
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",2012-07-01,120000.0
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",2012-07-01,359000.0


In [13]:
# Function to clean "bed"
final_expanded_df['bed_cleaned'] = (
    final_expanded_df['bed']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [14]:
# Function to clean "bath"
final_expanded_df['bath_cleaned'] = (
    final_expanded_df['bath']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [15]:
# Function to clean "car"
final_expanded_df['car_cleaned'] = (
    final_expanded_df['car']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [16]:
# Function to clean "land"
final_expanded_df['land_cleaned'] = (
    final_expanded_df['land']
    .str.replace(r'm2', '', regex=False)           # Remove "m2" unit
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    #.str.translate(translation_table)                 # Convert full-width digits to ASCII digits
    # Convert full-width digits to ASCII digits
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    .str.replace('𝟌', '4')
    .str.replace('𝟋', '3')
    .str.replace('𝟊', '2')
    .str.replace('𝟉', '1')
    .str.replace('𝟈', '0')
    .str.replace('𝟇', '9')
    .str.replace('𝟆', '8')
    .str.replace('𝟅', '7')
    .str.replace('𝟄', '6')
    .str.replace('𝟃', '5')
    .str.replace('𝟂', '4')
    .str.replace('𝟁', '3')
    .str.replace('𝟀', '2')
    .str.replace('𝞿', '1')
    .str.replace('𝞾', '0')
)

In [17]:
final_expanded_df

Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",2013-04-01,340000.0,2,1,,
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",2011-11-01,340000.0,2,1,,
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",2013-04-01,499000.0,3,2,4,
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",2011-11-01,499000.0,3,2,4,
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",2013-04-01,65000.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",2012-07-01,329000.0,​3,,4,
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",2012-07-01,369000.0,,2,1,
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",2012-07-01,120000.0,,,,
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",2012-07-01,359000.0,3,2,1​,


In [18]:
# Function to clean "address"
final_expanded_df['address_cleaned'] = (
    final_expanded_df['address']
    .str.replace(r'\u200b', '', regex=False)           # Remove zero-width space
    .str.replace(r'\xa0', '', regex=False)             # Remove non-breaking space
    .str.replace(r'<span>', '', regex=False)           # Remove <span> tag
    .str.replace(r'</span>', '', regex=False)          # Remove </span> tag
    .str.replace('𝙻', 'L')
    .str.replace('𝙺', 'K')
    .str.replace('𝙹', 'J')
    .str.replace('𝙸', 'I')
    .str.replace('𝙷', 'H')
    .str.replace('𝙶', 'G')
    .str.replace('𝙵', 'F')
    .str.replace('𝙴', 'E')
    .str.replace('𝙳', 'D')
    .str.replace('𝙲', 'C')
    .str.replace('𝙱', 'B')
    .str.replace('𝙰', 'A')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘿', 'D')
    .str.replace('𝘾', 'C')
    .str.replace('𝘽', 'B')
    .str.replace('𝘼', 'A')
    .str.replace('𝘻', 'z')
    .str.replace('𝘺', 'y')
    .str.replace('𝘹', 'x')
    .str.replace('𝘸', 'w')
    .str.replace('𝘷', 'v')
    .str.replace('𝘶', 'u')
    .str.replace('𝘵', 't')
    .str.replace('𝘴', 's')

)

In [19]:
# Function to clean "address"
final_expanded_df['address_cleaned'] = (
    final_expanded_df['address_cleaned']
    .str.replace('𝘳', 'r')
    .str.replace('𝘲', 'q')
    .str.replace('𝘱', 'p')
    .str.replace('𝘰', 'o')
    .str.replace('𝘯', 'n')
    .str.replace('𝘭', 'l')
    .str.replace('𝘬', 'k')
    .str.replace('𝘫', 'j')
    .str.replace('𝘪', 'i')
    .str.replace('𝘩', 'h')
    .str.replace('𝘨', 'g')
    .str.replace('𝘧', 'f')
    .str.replace('𝘦', 'e')
    .str.replace('𝘥', 'd')
    .str.replace('𝘣', 'b')
    .str.replace('𝘢', 'a')
    .str.replace('𝘡', 'Z')
    .str.replace('𝘠', 'Y')
    .str.replace('𝘟', 'X')
    .str.replace('０', '0')
    .str.replace('１', '1')
    .str.replace('２', '2')
    .str.replace('３', '3')
    .str.replace('４', '4')
    .str.replace('５', '5')
    .str.replace('６', '6')
    .str.replace('７', '7')
    .str.replace('８', '8')
    .str.replace('９', '9')
    .str.replace('4', '4')
    .str.replace('𝟻', '5')
    .str.replace('𝟼', '6')
    .str.replace('𝟽', '7')
    .str.replace('𝟾', '8')
    .str.replace('𝟿', '9')
    .str.replace('𝟶', '0')
    .str.replace('𝟹', '3')
    .str.replace('𝟺', '4')
    .str.replace('𝟸', '2')
    .str.replace('𝟷', '1')
    .str.replace('𝟫', '9')
    .str.replace('𝟪', '8')
    .str.replace('𝟩', '7')
    .str.replace('𝟨', '6')
    .str.replace('𝟧', '5')
    .str.replace('𝟦', '4')
    .str.replace('𝟥', '3')
    .str.replace('𝟤', '2')
    .str.replace('𝟣', '1')
    .str.replace('𝟢', '0')
    .str.replace('𝟡', '9')
    .str.replace('𝟠', '8')
    .str.replace('𝟟', '7')
    .str.replace('𝟞', '6')
    .str.replace('𝟝', '5')
    .str.replace('𝟜', '4')
    .str.replace('𝟛', '3')
    .str.replace('𝟚', '2')
    .str.replace('𝟙', '1')
    .str.replace('𝟘', '0')
    .str.replace('𝟗', '9')
    .str.replace('𝟖', '8')
    .str.replace('𝟕', '7')
    .str.replace('𝟔', '6')
    .str.replace('𝟓', '5')
    .str.replace('𝟒', '4')
    .str.replace('𝟑', '3')
    .str.replace('𝟐', '2')
    .str.replace('𝟏', '1')
    .str.replace('𝟎', '0')
    

)

In [20]:
final_expanded_df

Unnamed: 0,lat,lng,address,bed,bath,car,land,type,rented_price,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned
0,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,00​0",2013-04-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​"
1,,,"𝟸𝟻-𝟸𝟽 𝙻ОՍIS RОΑD ​- 𝟷ST ΕSTΑTΕ, VΕNՍS BAY​",2,𝟷,,,House,"$340,000",2011-11-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​"
2,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$４𝟫𝟫,OOO",2013-04-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY"
3,,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",3,2,4,,House,"$​499,000",2011-11-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY"
4,,,"𝟨-𝟪 В𝙻ΑСK AVEN​UE, VENUS BAY",,,,,Land,"<span>$𝟨𝟻,OOO</span>",2013-04-01,65000.0,,,,,"6-8 ВLΑСK AVEN​UE, VENUS BAY"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512633,,,"𝟷𝟷𝟷 JUS​TICE, COW​ES",​3,,４,,,"$329,000",2012-07-01,329000.0,​3,,4,,"111 JUS​TICE, COW​ES"
512634,,,"𝟽 RΕDWООD, СОWΕS",,2,1,,,"$369,000",2012-07-01,369000.0,,2,1,,"7 RΕDWООD, СОWΕS"
512635,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",,,,,,"$115,000 - $125,000",2012-07-01,120000.0,,,,,"LOT 2/ 59 MCHAFFIE, СОWΕS"
512636,,,"𝟨Α ВRΑD𝙻ΕY, COWES",3,2,1​,,Townhouse,"$359,000",2012-07-01,359000.0,3,2,1​,,"6Α ВRΑDLΕY, COWES"


In [21]:
# drop some columns
final_expanded_df = final_expanded_df.drop(columns=['rented_price', 'bed', 'bath', 'car', 'land', 'address'])
final_expanded_df

Unnamed: 0,lat,lng,type,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned
0,,,House,2013-04-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​"
1,,,House,2011-11-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​"
2,,,House,2013-04-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY"
3,,,House,2011-11-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY"
4,,,Land,2013-04-01,65000.0,,,,,"6-8 ВLΑСK AVEN​UE, VENUS BAY"
...,...,...,...,...,...,...,...,...,...,...
512633,,,,2012-07-01,329000.0,​3,,4,,"111 JUS​TICE, COW​ES"
512634,,,,2012-07-01,369000.0,,2,1,,"7 RΕDWООD, СОWΕS"
512635,,,,2012-07-01,120000.0,,,,,"LOT 2/ 59 MCHAFFIE, СОWΕS"
512636,,,Townhouse,2012-07-01,359000.0,3,2,1​,,"6Α ВRΑDLΕY, COWES"


In [22]:
# Extract year from date
final_expanded_df['year'] = final_expanded_df['date'].dt.year
final_expanded_df

Unnamed: 0,lat,lng,type,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned,year
0,,,House,2013-04-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​",2013.0
1,,,House,2011-11-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​",2011.0
2,,,House,2013-04-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",2013.0
3,,,House,2011-11-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",2011.0
4,,,Land,2013-04-01,65000.0,,,,,"6-8 ВLΑСK AVEN​UE, VENUS BAY",2013.0
...,...,...,...,...,...,...,...,...,...,...,...
512633,,,,2012-07-01,329000.0,​3,,4,,"111 JUS​TICE, COW​ES",2012.0
512634,,,,2012-07-01,369000.0,,2,1,,"7 RΕDWООD, СОWΕS",2012.0
512635,,,,2012-07-01,120000.0,,,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",2012.0
512636,,,Townhouse,2012-07-01,359000.0,3,2,1​,,"6Α ВRΑDLΕY, COWES",2012.0


In [23]:
# Extract suburb from address_cleaned, which is all text after the last comma
final_expanded_df['suburb'] = final_expanded_df['address_cleaned'].str.rsplit(',').str[-1]
final_expanded_df

Unnamed: 0,lat,lng,type,date,property_price_cleaned,bed_cleaned,bath_cleaned,car_cleaned,land_cleaned,address_cleaned,year,suburb
0,,,House,2013-04-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​",2013.0,VΕNՍS BAY​
1,,,House,2011-11-01,340000.0,2,1,,,"25-27 LОՍIS RОΑD ​- 1ST ΕSTΑTΕ, VΕNՍS BAY​",2011.0,VΕNՍS BAY​
2,,,House,2013-04-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",2013.0,VENUS BAY
3,,,House,2011-11-01,499000.0,3,2,4,,"85 PANDORA AVENUE - 1ST ESTATE, VENUS BAY",2011.0,VENUS BAY
4,,,Land,2013-04-01,65000.0,,,,,"6-8 ВLΑСK AVEN​UE, VENUS BAY",2013.0,VENUS BAY
...,...,...,...,...,...,...,...,...,...,...,...,...
512633,,,,2012-07-01,329000.0,​3,,4,,"111 JUS​TICE, COW​ES",2012.0,COW​ES
512634,,,,2012-07-01,369000.0,,2,1,,"7 RΕDWООD, СОWΕS",2012.0,СОWΕS
512635,,,,2012-07-01,120000.0,,,,,"LOT 2/ 59 MCHAFFIE, СОWΕS",2012.0,СОWΕS
512636,,,Townhouse,2012-07-01,359000.0,3,2,1​,,"6Α ВRΑDLΕY, COWES",2012.0,COWES


In [24]:
create_dir("../data/raw/oldlistings_buy/")
final_expanded_df.to_csv("../data/raw/oldlistings_buy/oldlistings_buy_0.csv", index=False)

Directory already exists: ../data/raw/oldlistings_buy/

