# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read CSV

In [2]:
# Path of the source CSV that holds the listings
raw_listings_path = 'og_data/listings2024.csv'
listings = pd.read_csv(raw_listings_path)

# Cleaning

## bathrooms_text -> bathrooms & bathroom_type

In [3]:
def parse_bathrooms(bathrooms_text):
    """Split a free-text bathroom description into a count and a type.

    Parameters
    ----------
    bathrooms_text : str or missing value
        Description such as "1.5 shared baths", "2 baths" or "Half-bath".

    Returns
    -------
    pd.Series
        Two elements: the bathroom count as a float and the bathroom type,
        one of "shared", "private" or "unknown".
    """
    if pd.isnull(bathrooms_text):
        return pd.Series([0.0, "unknown"])  # Missing value: count 0.0, type unknown

    text = str(bathrooms_text).lower()  # Case-insensitive keyword matching
    # split() without an argument tolerates leading and repeated whitespace,
    # whereas split(" ") would yield an empty first token for " 1 bath".
    tokens = text.split()
    if not tokens:
        return pd.Series([0.0, "unknown"])  # Blank text is treated like a missing value

    bathroom_type = "shared" if "shared" in text else "private"
    if "half" in text:
        return pd.Series([0.5, bathroom_type])  # Half bathrooms carry no leading number

    try:
        count = float(tokens[0])  # The count is expected to be the first token
    except ValueError:
        # No leading number: fall back to the same 0.0 used for missing values
        # instead of aborting the whole .apply() on a single odd row.
        count = 0.0
    return pd.Series([count, bathroom_type])

# Parse every bathrooms_text entry, then store the two resulting columns
parsed_bathrooms = listings['bathrooms_text'].apply(parse_bathrooms)
listings[['bathrooms', 'bathroom_type']] = parsed_bathrooms

## Data type: Convert to "datetime"

In [4]:
# Parse the date columns; values that cannot be parsed are coerced to NaT
date_columns = [
    'last_scraped',
    'host_since',
    'first_review',
    'last_review',
    'calendar_last_scraped',
]
for date_col in date_columns:
    raw_dates = listings[date_col]
    listings[date_col] = pd.to_datetime(raw_dates, errors='coerce')

## Data type: Convert percentages to float

In [5]:
# Turn percentage strings into fractions: drop the '%' sign, cast, divide by 100
percentage_columns = ['host_response_rate', 'host_acceptance_rate']
for pct_col in percentage_columns:
    without_symbol = listings[pct_col].str.replace('%', '', regex=False)
    listings[pct_col] = without_symbol.astype(float) / 100

## Data type: Convert 't'/'f' flags to proper booleans

In [6]:
# Translate the 't'/'f' flags into real booleans
boolean_columns = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
    'has_availability',
]
flag_to_bool = {'t': True, 'f': False}  # Any other value maps to NaN
for flag_col in boolean_columns:
    listings[flag_col] = listings[flag_col].map(flag_to_bool)

## Data type: Convert to categoricals

In [7]:
# Cast the low-cardinality text columns to the pandas category dtype
categorical_columns = [
    'source',
    'property_type',
    'room_type',
    'bathroom_type',
    'neighbourhood_cleansed',
]
listings = listings.astype({name: 'category' for name in categorical_columns})

## Fix the price column

In [8]:
# Prices are formatted like "$1,865.00": strip "$" and "," and then cast to float
price_text = listings['price'].str.replace("[$,]", "", regex=True)
listings['price'] = price_text.astype(float)

In [9]:
# Coerce the host listing counts to numbers; unparseable values become NaN
numeric_columns = ['host_listings_count', 'host_total_listings_count']
numeric_values = {
    name: pd.to_numeric(listings[name], errors='coerce')
    for name in numeric_columns
}
for name, values in numeric_values.items():
    listings[name] = values

## Drop empty columns

In [10]:
# Remove the columns that are treated as empty in this dataset
empty_columns = [
    'license',
    'calendar_updated',
    'neighbourhood_group_cleansed',
]
listings = listings.drop(empty_columns, axis='columns')

## Data type: Convert floats to integers

In [11]:
# Whole-number columns stored as floats: replace NaN with 0, then cast to int
integer_like_float_columns = [
    'host_listings_count',
    'host_total_listings_count',
    'bedrooms',
    'beds',
]
for int_col in integer_like_float_columns:
    filled = listings[int_col].fillna(0)
    listings[int_col] = filled.astype(int)

# Export

In [12]:
# Write the cleaned table to disk, leaving out the DataFrame index
output_path = 'cleaned_listings.csv'
listings.to_csv(output_path, index=False)