# Code for data cleaning and best buffer selection for other amenities
 

## Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os

## Import data

In [None]:
# Load the raw Toronto listings. read_csv already returns a DataFrame, so it is used
# directly instead of being wrapped in a second pd.DataFrame(...) call.
# NOTE(review): this absolute, user-specific path only resolves on the original author's
# machine — consider reading from a configurable data directory.
housing_tor = pd.read_csv("C:\\Users\\mahagam3\\Documents\\CME Course\\Regression_model\\Regression_House_Pricewiese\\Codes_new\\Toronto_all_data_To_use.csv")
housing_tor.head()

In [None]:
# Load the raw Vancouver listings. read_csv already returns a DataFrame, so it is used
# directly instead of being wrapped in a second pd.DataFrame(...) call.
# NOTE(review): this absolute, user-specific path only resolves on the original author's
# machine — consider reading from a configurable data directory.
housing_van = pd.read_csv("C:\\Users\\mahagam3\\Documents\\CME Course\\Regression_model\\Regression_House_Pricewiese\\Codes_new\\Vancouver_all_data_To_use.csv")
housing_van.head()

## Data arranging

In [None]:
""" Vancouver """

# Clean Price: strip '$', thousands separators and the letter 'C'
# (presumably from a 'C$' currency prefix — confirm against the raw data), then convert.
# Regex patterns are raw strings so that \$, \d and \s reach the regex engine instead of
# being parsed as (invalid) Python string escape sequences.
housing_van['Price'] = housing_van['Price'].replace({r'\$': '', ',': '', 'C': ''}, regex=True)
housing_van['Price'] = pd.to_numeric(housing_van['Price'], errors='coerce')  # Unparseable values become NaN

# Clean Bedrooms: map "studio" to 0, then keep only the first run of digits (drops text such as "bds")
housing_van['Bedrooms'] = housing_van['Bedrooms'].str.lower()  # Normalise case before matching "studio"
housing_van['Bedrooms'] = housing_van['Bedrooms'].replace({'studio': '0'}, regex=True)
# expand=False returns a Series (not a one-column DataFrame), matching the Toronto cleaning cell
housing_van['Bedrooms'] = housing_van['Bedrooms'].str.extract(r'(\d+)', expand=False)
housing_van['Bedrooms'] = pd.to_numeric(housing_van['Bedrooms'], errors='coerce')  # Convert to numeric

# Clean Bathrooms: remove every "ba" (with any surrounding whitespace) and any remaining whitespace.
# A separate plain 'ba' pattern is unnecessary because r'\s*ba\s*' already matches it.
housing_van['Bathrooms'] = housing_van['Bathrooms'].str.lower()  # Normalise case before matching "ba"
housing_van['Bathrooms'] = housing_van['Bathrooms'].replace({r'\s*ba\s*': '', r'\s+': ''}, regex=True)
housing_van['Bathrooms'] = pd.to_numeric(housing_van['Bathrooms'], errors='coerce')  # Convert to numeric

# Clean Square Footage: remove '$', commas and the 'sqft' suffix, then convert
housing_van['Square Footage'] = housing_van['Square Footage'].replace({r'\$': '', ',': '', 'sqft': ''}, regex=True)
housing_van['Square Footage'] = pd.to_numeric(housing_van['Square Footage'], errors='coerce')  # Convert to numeric

# Lowercase the values of every object-dtype (text) column; other columns are left untouched
housing_van = housing_van.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)

# Lowercase the column headers so later cells can refer to e.g. 'bathrooms' and 'square footage'
housing_van.columns = housing_van.columns.str.lower()
housing_van.head()  # Display the head of the cleaned DataFrame


In [None]:
""" Toronto """

# Clean Price: strip '$', thousands separators and the letter 'C'
# (presumably from a 'C$' currency prefix — confirm against the raw data), then convert.
# Regex patterns are raw strings so that \$, \d and \s reach the regex engine instead of
# being parsed as (invalid) Python string escape sequences.
housing_tor['Price'] = housing_tor['Price'].replace({r'\$': '', ',': '', 'C': ''}, regex=True)
housing_tor['Price'] = pd.to_numeric(housing_tor['Price'], errors='coerce')  # Unparseable values become NaN

# Clean Bedrooms: map "studio" to 0, then keep only the first run of digits (drops text such as " bds")
housing_tor['Bedrooms'] = housing_tor['Bedrooms'].str.lower()  # Normalise case before matching "studio"
housing_tor['Bedrooms'] = housing_tor['Bedrooms'].replace({'studio': '0'}, regex=True)
housing_tor['Bedrooms'] = housing_tor['Bedrooms'].str.extract(r'(\d+)', expand=False)  # Extract the number part as a Series
housing_tor['Bedrooms'] = pd.to_numeric(housing_tor['Bedrooms'], errors='coerce')  # Convert to numeric

# Clean Bathrooms: remove every "ba" (with any surrounding whitespace) and any remaining whitespace.
# The pattern matches the Vancouver cell: the previous r'\sba' required exactly one whitespace
# character before "ba", so a value such as "2ba" was left intact and then coerced to NaN.
housing_tor['Bathrooms'] = housing_tor['Bathrooms'].str.lower()  # Normalise case before matching "ba"
housing_tor['Bathrooms'] = housing_tor['Bathrooms'].replace({r'\s*ba\s*': '', r'\s+': ''}, regex=True)
housing_tor['Bathrooms'] = pd.to_numeric(housing_tor['Bathrooms'], errors='coerce')  # Convert to numeric

# Clean Square Footage: remove '$', commas and the 'sqft' suffix, then convert
housing_tor['Square Footage'] = housing_tor['Square Footage'].replace({r'\$': '', ',': '', 'sqft': ''}, regex=True)
housing_tor['Square Footage'] = pd.to_numeric(housing_tor['Square Footage'], errors='coerce')  # Convert to numeric

# Lowercase the values of every object-dtype (text) column; other columns are left untouched
housing_tor = housing_tor.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)

# Lowercase the column headers so later cells can refer to e.g. 'bathrooms' and 'square footage'
housing_tor.columns = housing_tor.columns.str.lower()
housing_tor.head()  # Display the head of the cleaned DataFrame

# Inspect the data - Vancouver

In [None]:
# (rows, columns) of the cleaned Vancouver DataFrame
housing_van.shape

In [None]:
# Column dtypes and non-null counts — used to spot columns with missing values
housing_van.info()

The bathrooms and square footage columns have fewer non-null values than the other columns. Square footage is dominated by missing values, so that column is dropped. The missing bathroom values, on the other hand, are filled using median imputation.

In [None]:
# Fill missing bathroom counts with the column median.
# The filled column is assigned back to the DataFrame: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated in pandas and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
median_bathrooms_van = housing_van['bathrooms'].median()
housing_van['bathrooms'] = housing_van['bathrooms'].fillna(median_bathrooms_van)

# Drop the square footage column (reassignment instead of in-place mutation)
housing_van = housing_van.drop(columns=['square footage'])
housing_van.info()

In [None]:
# Summary statistics of the numeric Vancouver columns
housing_van.describe()

# Inspect the data - Toronto

In [None]:
# (rows, columns) of the cleaned Toronto DataFrame
housing_tor.shape

In [None]:
# Column dtypes and non-null counts — used to spot columns with missing values
housing_tor.info()

A similar pattern in the bathrooms and square footage columns can be observed for Toronto. Therefore, median imputation is applied to bathrooms and the square footage column is dropped.

In [None]:
# Fill missing bathroom counts with the column median.
# The filled column is assigned back to the DataFrame: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated in pandas and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
median_bathrooms_tor = housing_tor['bathrooms'].median()
housing_tor['bathrooms'] = housing_tor['bathrooms'].fillna(median_bathrooms_tor)

# Drop the square footage column (reassignment instead of in-place mutation)
housing_tor = housing_tor.drop(columns=['square footage'])
housing_tor.info()

# Find best buffers for other amenities
 ## *Determine the Best Buffer for Each Category*
-------------------------------------------
*This step selects the best buffer size (e.g., area_0.5, area_1.0, area_1.5) for each category (e.g., Hospital, School, Bus Stop, etc.), based on the highest count of facilities in each buffer range.*

 *1. For each category (e.g., Hospital), it compares the counts across the three buffer columns.*
 
 *2. It selects the buffer column with the highest count as the "best" buffer for that category.*

In [None]:
# Amenity categories, spelled as they appear in the csv headers
# (lowercased later when the buffer column names are built)
categories = [
    'Hospital',
    'Railway Station',
    'School',
    'Supermarket',
    'Bus Stop',
    'Pub',
    'Restaurant',
    'Cafe',
    'Dentist',
    'Fast Food',
    'Kindergartens',
    'Malls',
    'Play Ground',
    'Park',
    'Post Office',
]

# Buffer-size suffixes used in the per-category column names
buffer_sizes = [
    'area_0.5',
    'area_1.0',
    'area_1.5',
]

In [None]:
""" Vancouver """

# Best buffer (e.g. 'area_1.5') selected for each category
best_buffers_van = {}
# Categories skipped because at least one of their buffer columns is missing
skipped_categories_van = []

for category in categories:
    # Buffer columns follow the pattern "<category> in <buffer>", e.g. "hospital in area_0.5"
    category_columns = [f'{category.lower()} in {buffer}' for buffer in buffer_sizes]

    # Record (rather than silently ignore) categories whose buffer columns are not all present
    if not all(col in housing_van.columns for col in category_columns):
        skipped_categories_van.append(category)
        continue

    # Compare the buffers on their total facility count over ALL listings.
    # The previous idxmax(axis=1).iloc[0] looked only at the first row, so the choice depended
    # on row order and broke when that row had no data. On ties idxmax keeps the first
    # (smallest) buffer.
    # NOTE(review): summing is one reading of "highest count" — confirm the intended aggregate.
    buffer_totals = housing_van[category_columns].sum()
    best_buffer_column = buffer_totals.idxmax()

    # The buffer size is the last whitespace-separated token of the column name
    best_buffer = best_buffer_column.split()[-1]
    best_buffers_van[category] = best_buffer

    # Copy the winning column under a descriptive name,
    # e.g. "restaurant_best_buffer_area_1.5_values"
    new_column_name = f'{category.lower()}_best_buffer_{best_buffer}_values'
    housing_van[new_column_name] = housing_van[best_buffer_column]

if skipped_categories_van:
    print(f"Vancouver: skipped categories with missing buffer columns: {skipped_categories_van}")

In [None]:
""" Toronto """

# Best buffer (e.g. 'area_1.5') selected for each category
best_buffers_tor = {}
# Categories skipped because at least one of their buffer columns is missing
skipped_categories_tor = []

for category in categories:
    # Buffer columns follow the pattern "<category> in <buffer>", e.g. "hospital in area_0.5"
    category_columns = [f'{category.lower()} in {buffer}' for buffer in buffer_sizes]

    # Record (rather than silently ignore) categories whose buffer columns are not all present
    if not all(col in housing_tor.columns for col in category_columns):
        skipped_categories_tor.append(category)
        continue

    # Compare the buffers on their total facility count over ALL listings.
    # The previous idxmax(axis=1).iloc[0] looked only at the first row, so the choice depended
    # on row order and broke when that row had no data. On ties idxmax keeps the first
    # (smallest) buffer.
    # NOTE(review): summing is one reading of "highest count" — confirm the intended aggregate.
    buffer_totals = housing_tor[category_columns].sum()
    best_buffer_column = buffer_totals.idxmax()

    # The buffer size is the last whitespace-separated token of the column name
    best_buffer = best_buffer_column.split()[-1]
    best_buffers_tor[category] = best_buffer

    # Copy the winning column under a descriptive name,
    # e.g. "post office_best_buffer_area_1.5_values"
    new_column_name = f'{category.lower()}_best_buffer_{best_buffer}_values'
    housing_tor[new_column_name] = housing_tor[best_buffer_column]

if skipped_categories_tor:
    print(f"Toronto: skipped categories with missing buffer columns: {skipped_categories_tor}")

## Remove the existing other-amenity buffer columns from the dataset, keeping only the best-buffer columns

In [None]:
""" Vancouver """

# Collect every column whose name contains 'in area' (the per-buffer amenity columns)
columns_to_remove_van = list(filter(lambda name: 'in area' in name, housing_van.columns))

# Drop them, then show the remaining schema
housing_van = housing_van.drop(columns=columns_to_remove_van)
housing_van.info()

In [None]:
# Percentage of missing values in each Vancouver column
housing_van.isna().sum().mul(100).div(housing_van.shape[0])

In [None]:
""" Toronto """

# Collect every column whose name contains 'in area' (the per-buffer amenity columns)
columns_to_remove_tor = list(filter(lambda name: 'in area' in name, housing_tor.columns))

# Drop them, then show the remaining schema
housing_tor = housing_tor.drop(columns=columns_to_remove_tor)
housing_tor.info()

In [None]:
# Percentage of missing values in each Toronto column
housing_tor.isna().sum().mul(100).div(housing_tor.shape[0])

## Save data cleaned with selected buffers

In [None]:
# Write each cleaned frame to its CSV destination (Vancouver first, then Toronto),
# leaving the DataFrame index out of the file
cleaned_outputs = (
    # Vancouver
    (housing_van, "C:\\Users\\mahagam3\\Documents\\CME Course\\Regression_model\\Regression_House_Pricewiese\\Codes_new\\housing_van_clean_with_best_buffers_v2.csv"),
    # Toronto
    (housing_tor, "C:\\Users\\mahagam3\\Documents\\CME Course\\Regression_model\\Regression_House_Pricewiese\\Codes_new\\housing_tor_clean_with_best_buffers_v2.csv"),
)
for cleaned_frame, csv_destination in cleaned_outputs:
    cleaned_frame.to_csv(csv_destination, index=False)

               ###################### End of the code for data cleaning and buffer selection #############################