# Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import re

## Column Cleaning

### Price

In [None]:
# Strip the pound sign and the thousands separators from the raw price strings.
# The vectorised .str accessor propagates missing values as NaN, whereas calling
# str.replace inside a Python lambda raises AttributeError on a float NaN.
# regex=False keeps both patterns literal regardless of the pandas version.
df_clean['price'] = (
    df_raw['price']
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
)
df_clean['price']

In [None]:
# Parse the cleaned price strings as numbers; entries that cannot be parsed
# are coerced to NaN rather than raising.
df_clean['price'] = df_clean['price'].pipe(pd.to_numeric, errors='coerce')
df_clean['price'].describe()

### Address

In [None]:
# Remove the postcode from the address.
# Rows where either field is not a string (e.g. a missing value stored as NaN)
# keep their address unchanged instead of raising inside a row-wise lambda.
# Iterating the two columns in lockstep also avoids building a Series per row
# and works on an empty frame.
df_clean['address'] = [
    address.replace(postcode, '')
    if isinstance(address, str) and isinstance(postcode, str)
    else address
    for address, postcode in zip(df_clean['address'], df_clean['postcode'])
]

### House Type

In [None]:
# Drop the literal word 'bed' and every run of digits, then trim the
# surrounding whitespace.
# The vectorised .str accessor passes missing values through as NaN; the
# previous per-element lambda raised AttributeError on them.
df_clean['house_type'] = (
    df_raw['house_type']
    .str.replace('bed', '', regex=False)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)
df_clean['house_type'].unique()

In [None]:
# Show summary statistics for the house_type column of df_clean
df_clean['house_type'].describe()

### Number of Bedrooms

In [None]:
# Collapse every run of non-digit characters into a single space and trim the
# ends, then parse the result as a number. Values that still contain more
# than one number (e.g. "2 3") or nothing at all are coerced to NaN.
df_clean['number_of_bedrooms'] = (
    df_raw['number_of_bedrooms']
    .str.replace(r'\D+', ' ', regex=True)
    .str.strip()
    .pipe(pd.to_numeric, errors='coerce')
)

df_clean['number_of_bedrooms']

### Number of Bathrooms

In [None]:
# Collapse every run of non-digit characters into a single space and trim the
# ends, then parse the result as a number. Values that still contain more
# than one number (e.g. "2 3") or nothing at all are coerced to NaN.
df_clean['number_of_bathrooms'] = (
    df_raw['number_of_bathrooms']
    .str.replace(r'\D+', ' ', regex=True)
    .str.strip()
    .pipe(pd.to_numeric, errors='coerce')
)

df_clean['number_of_bathrooms']

### Number of Receptions

In [None]:
# Collapse every run of non-digit characters into a single space and trim the
# ends, then parse the result as a number. Values that still contain more
# than one number (e.g. "2 3") or nothing at all are coerced to NaN.
df_clean['number_of_receptions'] = (
    df_raw['number_of_receptions']
    .str.replace(r'\D+', ' ', regex=True)
    .str.strip()
    .pipe(pd.to_numeric, errors='coerce')
)

df_clean['number_of_receptions']

### Other Features

In [None]:
# Parse the 'sq. ft' figures: drop the literal unit text and the thousands
# separators, trim whitespace, then convert to a number.
# regex=False is explicit because the '.' in 'sq. ft' is a wildcard whenever
# the pattern is interpreted as a regular expression, and the default for
# `regex` differs between pandas versions.
# Empty or otherwise unparseable strings are coerced to NaN by to_numeric, so
# no separate empty-string replacement step is required.
df_clean['other_features'] = pd.to_numeric(
    df_raw['other_features']
    .str.replace('sq. ft', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip(),
    errors='coerce',
)
df_clean['other_features'].describe()

### Tenure

In [None]:
# Normalise the tenure labels: trim surrounding whitespace, then lowercase
df_clean['tenure'] = (
    df_raw['tenure']
    .str.strip()
    .str.lower()
)
df_clean['tenure'].unique()

### Lease Time

In [None]:
# Replace every run of non-digit characters (such as 'years') with a space,
# trim the ends, and convert the column to a number. The conversion step was
# missing, which left the column as strings and made describe() report string
# statistics. Values that do not reduce to a single number become NaN.
df_clean['lease_time'] = pd.to_numeric(
    df_raw['lease_time'].str.replace(r'\D+', ' ', regex=True).str.strip(),
    errors='coerce',
)
df_clean['lease_time'].describe()

### Service Charge

In [None]:
# Remove '£', 'per year' and ',' from the string and trim the result.
# All three patterns are literal text, so regex=False is passed explicitly:
# the result then does not depend on the pandas version's default for `regex`.
df_clean['service_charge'] = (
    df_raw['service_charge']
    .str.replace('£', '', regex=False)
    .str.replace('per year', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df_clean['service_charge'].unique()

### Tax Band

In [None]:
# List the distinct values found in the raw tax_band column
df_raw['tax_band'].unique()

### Ground Rent

In [None]:
# Remove '£', ',' and 'per month' from the string and trim the result.
# The trailing strip drops the whitespace left behind once 'per month' is
# removed, matching how service_charge is cleaned.
# All three patterns are literal text, so regex=False is passed explicitly:
# the result then does not depend on the pandas version's default for `regex`.
df_clean['ground_rent'] = (
    df_raw['ground_rent']
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('per month', '', regex=False)
    .str.strip()
)
df_clean['ground_rent'].describe()

### Commonhold Details

In [None]:
# List the distinct values found in the raw commonhold_details column
df_raw['commonhold_details'].unique()

### Points of Interest

In [None]:
# Display the raw points_of_interest column for inspection
df_raw['points_of_interest']