# Data Cleaning

In [74]:
import os
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine

## Connection to local SQL DB

In [9]:
# Open a PyMySQL connection to the property database.
# Connection settings are read from environment variables so that real
# credentials do not have to be stored in the notebook. The fallbacks are the
# local development values this notebook originally hard-coded.
# NOTE(review): remove the password fallback once PROPERTY_DB_PASSWORD is set
# everywhere this notebook runs.
conn = pymysql.connect(
    host=os.environ.get("PROPERTY_DB_HOST", "localhost"),
    user=os.environ.get("PROPERTY_DB_USER", "root"),
    password=os.environ.get("PROPERTY_DB_PASSWORD", "admin"),
    database=os.environ.get("PROPERTY_DB_NAME", "property_info_db"),
)

## Importing Data

In [23]:
# Define the SQL query: every column of the listings table, in a stable order.
query = """
SELECT * FROM property_info_db.london_properties
ORDER BY property_id;
"""

# Read connection settings from the environment instead of embedding them in
# the URL; the fallbacks reproduce the original local development URL exactly.
db_host = os.environ.get("PROPERTY_DB_HOST", "localhost")
db_user = os.environ.get("PROPERTY_DB_USER", "root")
db_password = os.environ.get("PROPERTY_DB_PASSWORD", "admin")
db_name = os.environ.get("PROPERTY_DB_NAME", "property_info_db")

# quote_plus escapes characters such as '@' or '/' that would otherwise be
# parsed as URL delimiters if they appear in the user name or password.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}/{db_name}"
)

# Load the full query result into a DataFrame.
df_raw = pd.read_sql(query, engine)

In [24]:
# Preview the first five rows of the raw table
df_raw.head(5)

Unnamed: 0,property_id,price,address,house_type,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,tenure,lease_time,service_charge,tax_band,ground_rent,commonhold_details,points_of_interest,listing_features,description_text,property_link,postcode
0,1,"£300,000","Ringers Road, Bromley BR1",1 bed flat,1 bed,1 bath,1 reception,650 sq. ft,Leasehold,118 years,Not available,C,£640 per month,,St Mark's Church of England Primary School\n0....,Leasehold\nAllocated parking\nModern finish\nL...,"** Guide price £300,000 - £325,000 **\n\nThis ...",https://www.zoopla.co.uk/for-sale/details/6275...,BR1
1,2,"£125,000","Downham Way, Bromley, Kent BR1",studio,,1 bath,1 reception,,Leasehold,82 years,£816 per year,B,£250,,"Launcelot Primary School\n0.1 miles,Haberdashe...",Leasehold\n* Perfect residential or investment...,Perfect residential or investment property wit...,https://www.zoopla.co.uk/for-sale/details/6521...,BR1
2,3,"£595,000","Rolvenden Gardens, Bromley BR1",3 bed terraced house,3 beds,2 baths,1 reception,,Freehold,,,E,,,"Breaside Preparatory School\n0.3 miles,Scotts ...",Freehold\nSpacious Reception Room\nLarge Kitch...,An immaculately presented three bedroom mid te...,https://www.zoopla.co.uk/for-sale/details/6520...,BR1
3,4,"£1,475,000","Upper Park Road, Bromley BR1",7 bed detached house,7 beds,6 baths,3 receptions,,Freehold,,,G,,,"The Tutorial Foundation (SEN)\n0.1 miles,St Jo...",Freehold\n7 bedrooms\n6 bathrooms (4 ensuite)\...,Ref DT0182. A rare to market and substantial V...,https://www.zoopla.co.uk/for-sale/details/6420...,BR1
4,5,"£1,499,950","Garden Lane, Bromley BR1",5 bed detached house,5 beds,1 bath,1 reception,,Freehold,,,G,,,St Joseph's Catholic Primary School\n0.4 miles...,Freehold\nLandscaped Southerly Facing Garden\n...,An exceptional five bedroom detached cottage s...,https://www.zoopla.co.uk/for-sale/details/6520...,BR1


In [25]:
# Dimensions of the raw table as (rows, columns)
df_raw.shape

(51880, 19)

In [26]:
# Show every field of the first record, one field per line
df_raw.iloc[0]

property_id                                                             1
price                                                            £300,000
address                                         Ringers Road, Bromley BR1
house_type                                                     1 bed flat
number_of_bedrooms                                                  1 bed
number_of_bathrooms                                                1 bath
number_of_receptions                                          1 reception
other_features                                                 650 sq. ft
tenure                                                          Leasehold
lease_time                                                      118 years
service_charge                                              Not available
tax_band                                                                C
ground_rent                                                £640 per month
commonhold_details                    

## Column Cleaning

In [56]:
# Work on an independent copy: a plain `df_clean = df_raw` only creates a second
# name for the same object, so every cleaning step would also modify df_raw.
df_clean = df_raw.copy()

# replace empty rows (rows in which every column is null) with np.nan
mask = df_clean.isnull().all(axis=1)
df_clean[mask] = np.nan

### Price

In [57]:
# remove £ sign and comma
# The vectorized .str methods leave missing values as NaN, whereas calling
# x.replace inside a Python lambda raises AttributeError on any non-string
# (e.g. NULL) entry.
df_clean['price'] = (
    df_clean['price']
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
)
df_clean['price']

0         300000
1         125000
2         595000
3        1475000
4        1499950
          ...   
51875    2000000
51876    1495000
51877     599000
51878    1125000
51879     425000
Name: price, Length: 51880, dtype: object

In [62]:
# List the distinct price strings that remain after stripping symbols
pd.unique(df_clean['price'])

array(['300000', '125000', '595000', ..., '1395300', '8150000', '2170000'],
      dtype=object)

In [67]:
# convert price column to float

def convert_price(price):
    """Convert one cleaned price value to a float, mapping non-prices to NaN.

    Parameters
    ----------
    price : str, number, or missing value
        A price with currency symbols and thousands separators already
        removed (e.g. ``'300000'``), a listing placeholder such as ``'POA'``,
        an empty string, or a missing value (``None`` / NaN).

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is missing, empty, or
        one of the known placeholders.

    Raises
    ------
    ValueError
        If ``price`` is a non-empty string that is neither a known placeholder
        nor parseable as a number, so unexpected values are surfaced instead of
        being silently dropped.
    """
    # Placeholders used instead of a figure; compared case-insensitively.
    placeholders = ('poa', 'sale by tender', 'coming soon')

    # None / NaN carry no price; float(None) would raise TypeError.
    if pd.isna(price):
        return np.nan

    if isinstance(price, str):
        text = price.strip()
        if text == '' or text.casefold() in placeholders:
            return np.nan
        return float(text)

    # Already numeric (e.g. the cell is re-run on converted data).
    return float(price)

# Run every price through convert_price, store the numeric result back on the
# frame, then summarise the converted column.
price_as_float = df_clean['price'].map(convert_price)
df_clean['price'] = price_as_float
df_clean['price'].describe()

count    4.903200e+04
mean     9.031544e+05
std      1.646269e+06
min      4.000000e+03
25%      3.850000e+05
50%      5.500000e+05
75%      8.500000e+05
max      7.500000e+07
Name: price, dtype: float64

In [68]:
# Report the dtype and non-null count of the converted price column
df_clean['price'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 51880 entries, 0 to 51879
Series name: price
Non-Null Count  Dtype  
--------------  -----  
49032 non-null  float64
dtypes: float64(1)
memory usage: 405.4 KB


### house_type

In [75]:
# replaces numbers and 'bed' with empty string, then trims surrounding spaces
# Same operations in the same order as before ('bed' removal, digit removal,
# strip), but through the vectorized .str accessor so missing values pass
# through as NaN instead of raising AttributeError inside a lambda.
df_clean['house_type'] = (
    df_clean['house_type']
    .str.replace('bed', '', regex=False)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)
df_clean['house_type']

0                  flat
1                studio
2        terraced house
3        detached house
4        detached house
              ...      
51875    terraced house
51876            studio
51877    terraced house
51878              flat
51879              flat
Name: house_type, Length: 51880, dtype: object

In [77]:
# Summarise the cleaned house_type values (count, unique, top, freq)
df_clean['house_type'].describe()

count     51880
unique       30
top        flat
freq      27703
Name: house_type, dtype: object

In [78]:
# Preview the first five rows after the price and house_type cleaning steps
df_clean.head(5)

Unnamed: 0,property_id,price,address,house_type,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,tenure,lease_time,service_charge,tax_band,ground_rent,commonhold_details,points_of_interest,listing_features,description_text,property_link,postcode
0,1.0,300000.0,"Ringers Road, Bromley BR1",flat,1 bed,1 bath,1 reception,650 sq. ft,Leasehold,118 years,Not available,C,£640 per month,,St Mark's Church of England Primary School\n0....,Leasehold\nAllocated parking\nModern finish\nL...,"** Guide price £300,000 - £325,000 **\n\nThis ...",https://www.zoopla.co.uk/for-sale/details/6275...,BR1
1,2.0,125000.0,"Downham Way, Bromley, Kent BR1",studio,,1 bath,1 reception,,Leasehold,82 years,£816 per year,B,£250,,"Launcelot Primary School\n0.1 miles,Haberdashe...",Leasehold\n* Perfect residential or investment...,Perfect residential or investment property wit...,https://www.zoopla.co.uk/for-sale/details/6521...,BR1
2,3.0,595000.0,"Rolvenden Gardens, Bromley BR1",terraced house,3 beds,2 baths,1 reception,,Freehold,,,E,,,"Breaside Preparatory School\n0.3 miles,Scotts ...",Freehold\nSpacious Reception Room\nLarge Kitch...,An immaculately presented three bedroom mid te...,https://www.zoopla.co.uk/for-sale/details/6520...,BR1
3,4.0,1475000.0,"Upper Park Road, Bromley BR1",detached house,7 beds,6 baths,3 receptions,,Freehold,,,G,,,"The Tutorial Foundation (SEN)\n0.1 miles,St Jo...",Freehold\n7 bedrooms\n6 bathrooms (4 ensuite)\...,Ref DT0182. A rare to market and substantial V...,https://www.zoopla.co.uk/for-sale/details/6420...,BR1
4,5.0,1499950.0,"Garden Lane, Bromley BR1",detached house,5 beds,1 bath,1 reception,,Freehold,,,G,,,St Joseph's Catholic Primary School\n0.4 miles...,Freehold\nLandscaped Southerly Facing Garden\n...,An exceptional five bedroom detached cottage s...,https://www.zoopla.co.uk/for-sale/details/6520...,BR1
