# Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import re

## Column Cleaning

In [2]:
df_raw = pd.read_csv('./Data/brighton_data_raw.csv')
df_clean = df_raw.copy()

In [3]:
df_raw.head()

Unnamed: 0,property_id,price,address,house_type,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,tenure,lease_time,...,tax_band,ground_rent,commonhold_details,points_of_interest,listing_features,description_text,property_link,postcode,latitude,longitude
0,1,"£300,000","Clyde Road, Brighton BN1",2 bed flat,2 beds,1 bath,1 reception,,Share of freehold,Not available,...,A,Not available,Not available,"London Road (Brighton)\n0.1 miles,Bellerbys Co...",Share of freehold\nTwo-Bedroom First Floor Apa...,Description\n\nA wonderful split-level apartme...,https://www.zoopla.co.uk/for-sale/details/6064...,BN1,50.834835,-0.136658
1,2,"£295,000","North Street, Brighton BN1",1 bed flat,1 bed,,,,Not available,,...,Not available,,,"Middle Street Primary School\n0.1 miles,St Pau...",No Forward Chain\nPassenger Lift\nLocated Next...,Perfect investment property or first time buy!...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.828022,-0.136807
2,3,"£450,000","Crabtree Avenue, Brighton BN1",3 bed terraced house,3 beds,1 bath,2 receptions,,Freehold,,...,C,,,"Carden Nursery and Primary School\n0.3 miles,T...",Freehold\nAttractive 3 Bedroom Family Home Wit...,Summary of accommodation\n\nGround Floor Entra...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.860744,-0.131786
3,4,,,,,,,,,,...,,,,,,,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,,
4,5,"£425,000","Warleigh Road, Brighton BN1",2 bed maisonette,2 beds,1 bath,1 reception,,Leasehold,120 years,...,Not available,Not available,,"London Road (Brighton)\n0.1 miles,Downs Infant...",Leasehold\n2 Double Bedrooms\nNo Onward Chain\...,Avard Estate Agents are pleased to offer for s...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.834766,-0.135156


In [4]:
df_raw.columns

Index(['property_id', 'price', 'address', 'house_type', 'number_of_bedrooms',
       'number_of_bathrooms', 'number_of_receptions', 'other_features',
       'tenure', 'lease_time', 'service_charge', 'tax_band', 'ground_rent',
       'commonhold_details', 'points_of_interest', 'listing_features',
       'description_text', 'property_link', 'postcode', 'latitude',
       'longitude'],
      dtype='object')

### Price

In [5]:
df_raw['price']

0       £300,000
1       £295,000
2       £450,000
3            NaN
4       £425,000
          ...   
5184    £999,950
5185    £192,500
5186    £150,000
5187    £450,000
5188    £390,000
Name: price, Length: 5189, dtype: object

In [6]:
# remove '£', ',', and convert to float
def convert_price(price):
    # Ensure the price is a string, if not, return the original value or some default
    if not isinstance(price, str):
        return np.nan # or return price if you want to keep the original value

    # Use regex to extract numbers, periods, and commas from the string
    extracted = ''.join(re.findall(r'[0-9.,]', price))
    
    # Remove commas
    cleaned = extracted.replace(',', '')
    
    # Try converting the cleaned string to a float
    try:
        return float(cleaned)
    except ValueError:
        return np.nan


df_clean['price'] = df_raw['price'].apply(convert_price)
df_clean.head()

Unnamed: 0,property_id,price,address,house_type,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,tenure,lease_time,...,tax_band,ground_rent,commonhold_details,points_of_interest,listing_features,description_text,property_link,postcode,latitude,longitude
0,1,300000.0,"Clyde Road, Brighton BN1",2 bed flat,2 beds,1 bath,1 reception,,Share of freehold,Not available,...,A,Not available,Not available,"London Road (Brighton)\n0.1 miles,Bellerbys Co...",Share of freehold\nTwo-Bedroom First Floor Apa...,Description\n\nA wonderful split-level apartme...,https://www.zoopla.co.uk/for-sale/details/6064...,BN1,50.834835,-0.136658
1,2,295000.0,"North Street, Brighton BN1",1 bed flat,1 bed,,,,Not available,,...,Not available,,,"Middle Street Primary School\n0.1 miles,St Pau...",No Forward Chain\nPassenger Lift\nLocated Next...,Perfect investment property or first time buy!...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.828022,-0.136807
2,3,450000.0,"Crabtree Avenue, Brighton BN1",3 bed terraced house,3 beds,1 bath,2 receptions,,Freehold,,...,C,,,"Carden Nursery and Primary School\n0.3 miles,T...",Freehold\nAttractive 3 Bedroom Family Home Wit...,Summary of accommodation\n\nGround Floor Entra...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.860744,-0.131786
3,4,,,,,,,,,,...,,,,,,,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,,
4,5,425000.0,"Warleigh Road, Brighton BN1",2 bed maisonette,2 beds,1 bath,1 reception,,Leasehold,120 years,...,Not available,Not available,,"London Road (Brighton)\n0.1 miles,Downs Infant...",Leasehold\n2 Double Bedrooms\nNo Onward Chain\...,Avard Estate Agents are pleased to offer for s...,https://www.zoopla.co.uk/for-sale/details/6516...,BN1,50.834766,-0.135156


In [7]:
df_clean['price'].describe()

count    5.092000e+03
mean     5.068797e+05
std      4.594099e+05
min      5.000000e+03
25%      2.900000e+05
50%      4.187475e+05
75%      6.000000e+05
max      1.500000e+07
Name: price, dtype: float64

### House Type

In [8]:
df_raw['house_type']

0                 2 bed flat
1                 1 bed flat
2       3 bed terraced house
3                        NaN
4           2 bed maisonette
                ...         
5184          2 bed property
5185              1 bed flat
5186          2 bed property
5187                    land
5188              2 bed flat
Name: house_type, Length: 5189, dtype: object

In [9]:
# replaces numbers and 'bed' with empty string
def clean_house_type(x):
    if isinstance(x, str):
        # replaces numbers and 'bed' with empty string
        return re.sub(r'\d+', '', x.replace('bed', '')).strip()
    else:
        return np.nan

df_clean['house_type'] = df_raw['house_type'].apply(clean_house_type)
df_clean['house_type']

0                 flat
1                 flat
2       terraced house
3                  NaN
4           maisonette
             ...      
5184          property
5185              flat
5186          property
5187              land
5188              flat
Name: house_type, Length: 5189, dtype: object

In [10]:
df_clean['house_type'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: house_type
Non-Null Count  Dtype 
--------------  ----- 
5092 non-null   object
dtypes: object(1)
memory usage: 40.7+ KB


In [11]:
df_clean['house_type'].unique()

array(['flat', 'terraced house', nan, 'maisonette', 'semi-detached house',
       'lodge', 'studio', 'property', 'town house', 'bungalow',
       'end terrace house', 'detached house', 'semi-detached bungalow',
       'parking/garage', 'cottage', 'detached bungalow', 'land',
       'block of flats', 'mews house', 'mobile/park home',
       'barn conversion', 'link-detached house', 'farmhouse', 'houseboat',
       'shared accommodation', 'terraced bungalow', 'penthouse',
       'country house', 'equestrian property', 'room'], dtype=object)

### Number of Bedrooms

In [12]:
# Remove all non-digit characters but keep space between numbers
df_clean['number_of_bedrooms'] = df_raw['number_of_bedrooms'].str.replace(r'\D+', ' ', regex=True).str.strip()

# Convert to numbers and handle errors by converting them to NaN
df_clean['number_of_bedrooms'] = pd.to_numeric(df_clean['number_of_bedrooms'], errors='coerce')

df_clean['number_of_bedrooms'].info()
df_clean['number_of_bedrooms'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: number_of_bedrooms
Non-Null Count  Dtype  
--------------  -----  
4920 non-null   float64
dtypes: float64(1)
memory usage: 40.7 KB


array([ 2.,  1.,  3., nan,  4.,  6.,  5.,  9.,  8.,  7., 14., 12., 30.,
       11., 50., 10., 15.])

### Number of Bathrooms

In [13]:
# Remove all non-digit characters but keep space between numbers
df_clean['number_of_bathrooms'] = df_raw['number_of_bathrooms'].str.replace(r'\D+', ' ', regex=True).str.strip()

# Convert to numbers and handle errors by converting them to NaN
df_clean['number_of_bathrooms'] = pd.to_numeric(df_clean['number_of_bathrooms'], errors='coerce')

df_clean['number_of_bathrooms'].info()
df_clean['number_of_bathrooms'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: number_of_bathrooms
Non-Null Count  Dtype  
--------------  -----  
4791 non-null   float64
dtypes: float64(1)
memory usage: 40.7 KB


array([ 1., nan,  2.,  3.,  4.,  9.,  7.,  6., 14.,  5., 12., 24.,  8.])

### Number of Receptions

In [14]:
# Remove all non-digit characters but keep space between numbers
df_clean['number_of_receptions'] = df_raw['number_of_receptions'].str.replace(r'\D+', ' ', regex=True).str.strip()

# Convert to numbers and handle errors by converting them to NaN
df_clean['number_of_receptions'] = pd.to_numeric(df_clean['number_of_receptions'], errors='coerce')

df_clean['number_of_receptions'].info()
df_clean['number_of_receptions'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: number_of_receptions
Non-Null Count  Dtype  
--------------  -----  
4630 non-null   float64
dtypes: float64(1)
memory usage: 40.7 KB


array([ 1., nan,  2.,  3.,  4.,  5.,  6., 23.,  8.])

### Other Features

In [15]:
# Strip whitespace and replace empty strings with NaN
df_clean['other_features'] = df_raw['other_features'].str.strip().where(lambda x : x != '', np.nan)

df_clean['other_features'] = pd.to_numeric(df_clean['other_features'].str.replace('sq. ft', '').str.replace(',', ''), 
                                           errors='coerce')
df_clean['other_features'].info()
df_clean['other_features'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: other_features
Non-Null Count  Dtype  
--------------  -----  
303 non-null    float64
dtypes: float64(1)
memory usage: 40.7 KB


  df_clean['other_features'] = pd.to_numeric(df_clean['other_features'].str.replace('sq. ft', '').str.replace(',', ''),


array([       nan, 1.2050e+03, 6.2400e+02, 1.2250e+03, 7.5300e+02,
       1.0700e+03, 7.4800e+02, 9.1200e+02, 5.4900e+02, 8.1800e+02,
       5.4000e+02, 7.3500e+02, 1.7860e+03, 8.3900e+02, 1.3340e+03,
       1.5170e+03, 1.4230e+03, 6.7400e+02, 8.0700e+02, 1.0110e+03,
       5.9200e+02, 4.9500e+02, 6.4500e+02, 9.0400e+02, 1.4570e+03,
       9.5000e+02, 6.1400e+02, 2.5400e+03, 9.3000e+02, 1.2180e+03,
       1.1950e+03, 1.0770e+03, 1.0740e+03, 1.1310e+03, 6.6300e+02,
       8.4800e+02, 1.2960e+03, 1.0980e+03, 6.7800e+02, 1.5430e+03,
       4.6200e+02, 1.0960e+03, 1.0550e+03, 7.4300e+02, 9.1500e+02,
       2.1550e+03, 9.9500e+02, 8.0000e+02, 3.2470e+03, 1.3500e+03,
       1.7830e+03, 1.3250e+03, 4.6000e+02, 3.9200e+02, 4.3800e+02,
       6.6000e+02, 4.6900e+02, 3.9600e+02, 1.3780e+03, 1.2060e+03,
       5.0600e+02, 9.6900e+02, 1.7330e+03, 1.1840e+03, 8.5000e+02,
       1.3240e+03, 1.2810e+03, 2.1740e+03, 9.3600e+02, 9.9000e+02,
       1.1410e+03, 1.3030e+03, 4.1700e+02, 1.5310e+03, 1.1520e

### Tenure

In [16]:
# Removes whitespace and converts string to lowercase
df_clean['tenure'] = df_raw['tenure'].str.strip().str.lower()
df_clean['tenure'].info()
df_clean['tenure'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: tenure
Non-Null Count  Dtype 
--------------  ----- 
5092 non-null   object
dtypes: object(1)
memory usage: 40.7+ KB


array(['share of freehold', 'not available', 'freehold', nan, 'leasehold',
       'leasehold shared own', 'commonhold'], dtype=object)

### Lease Time

In [17]:
# Removes whitespace and 'years' and converts column to number
df_clean['lease_time'] = pd.to_numeric(df_raw['lease_time'].str.replace(r'\D+', ' ', regex=True).str.strip(), errors='coerce')
df_clean['lease_time'].info()
df_clean['lease_time'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: lease_time
Non-Null Count  Dtype  
--------------  -----  
758 non-null    float64
dtypes: float64(1)
memory usage: 40.7 KB


array([ nan, 120., 117., 110., 100., 118., 963., 945.,  99., 988., 979.,
       942.,  97., 109., 992., 948., 113., 124.,  88., 171., 154., 114.,
       996., 111., 104., 160., 995.,  64., 105., 969.,  90.,  89.,  86.,
       974., 107., 180., 164., 103., 131., 125., 997.,  92., 965.,  65.,
       138., 137., 119., 190., 982., 108., 116., 250., 132.,  63., 983.,
        79., 121., 140.,  66., 112., 991., 102.,  55., 155.,  95., 106.,
       958., 999.,  70.,  85.,   4.,  98., 142., 123., 150.,  62., 122.,
       927.,  94.,  61., 950.,  91., 115.,  49., 157.,  57., 961., 170.,
       136.,  87., 939., 985., 181., 940., 986.,  96., 169., 168.,  45.,
       994., 135.,   2., 952.,  58., 156., 993.,  81.,  77., 145., 161.,
        74., 964.,   1.,  82.,   5.,  60., 998., 162., 966., 149., 141.,
       899., 935., 957., 127., 989., 126., 943., 167., 230.,  52.,  10.,
        47., 147., 151., 936.,  83., 176., 165., 248., 143., 249.,  71.,
       981.,  53., 984.,  93., 139., 133., 130.,  6

### Service Charge

In [18]:
# removes '£', ',' and 'per year' from string 
df_clean['service_charge'] = (df_raw['service_charge'].str.replace('£', '').str.replace('per year', '')
                              .str.replace(',', '').str.replace('per month', '').str.strip())
df_clean['service_charge'].info()
df_clean['service_charge'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: service_charge
Non-Null Count  Dtype 
--------------  ----- 
1614 non-null   object
dtypes: object(1)
memory usage: 40.7+ KB


array(['1100', nan, 'Not available', '254', '1412', '1200', '1254',
       '3560', '600', '1320', '3792', '1424', '1034', '1000', '1274',
       '2560', '1135', '2608', '720', '1375', '2775', '2220', '250',
       '300', '2600', '651', '1564', '1330', '3540', '2974', '840',
       '1781', '1080', '4333', '1126', '2326', '1788', '2080', '1210',
       '1404', '3952', '2240', '1668', '1017', '480', '1778', '1716',
       '543', '1360', '1280', '4167', '900', '1500', '3800', '3387',
       '1600', '260', '3332', '1700', '284', '3252', '2686', '764',
       '2500', '943', '2097', '8130', '1300', '867', '3161', '1640',
       '918', '2204', '50', '1132', '110', '9988', '1514', '84',
       '1 years', '1398', '1062', '1750', '115', '999', '1896', '1685',
       '758', '425', '2200', '850', '1800', '1095', '500', '725', '4038',
       '1400', '1442', '8126', '5000', '2392', '3999', '2800', '2519',
       '4382', '2620', '2000', '1969', '4376', '2260', '7044', '140',
       '3600', '2300', '37

### Tax Band

In [19]:
df_clean['tax_band'] = df_raw['tax_band'].str.strip()
df_clean['tax_band'].info()
df_clean['tax_band'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: tax_band
Non-Null Count  Dtype 
--------------  ----- 
5092 non-null   object
dtypes: object(1)
memory usage: 40.7+ KB


array(['A', 'Not available', 'C', nan, 'D',
       'A band has not yet been confirmed.', 'F', 'B', 'E', 'G',
       'New build', 'H', 'A band has not yet been assigned',
       'The property is not currently residential so does not pay council tax. Once the build is complete, the property will need to be assessed for rates.',
       'Leisure Mooring', 'a band has not been assigned yet',
       'A band has not been assigned', 'caravan', 'Holiday Homes'],
      dtype=object)

### Ground Rent

In [20]:
# removes '£', ',' and 'per month' from string
df_clean['ground_rent'] = (df_raw['ground_rent'].str.replace('£' , '').str.replace(',' , '')
                           .str.replace('per month', '').str.strip())
df_clean['ground_rent'].info()
df_clean['ground_rent'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: ground_rent
Non-Null Count  Dtype 
--------------  ----- 
1606 non-null   object
dtypes: object(1)
memory usage: 40.7+ KB


array(['Not available', nan, '175', '48', '200', '250', '150', '400',
       '100', '195', '10', '80', '280', '125', '350', '75', '25', '1',
       '120', '330', '60', '657', '169', '0', '50', '300', '820', '145',
       '385', '455', '434', '274', '206', '307', '93', '435', '13', '410',
       '360', '165', '7', '220', '510', '15', '20', '218', '365', '160',
       '140', '550', '950', '816', '439', '225', '440', '27', '418',
       '1848', '331', '30', '63', '179', '189', '95', '180', '472', '425',
       '500', '1107', '240', '389', '213', '450', '70', '582', '236',
       '285', '9745', '9695', '7595', '57', '190', '465', '38', '40',
       '45', '462', '580', '90', '513', '2472', '520'], dtype=object)

### Commonhold Details

In [21]:
df_raw['commonhold_details'].info()
df_raw['commonhold_details'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 5189 entries, 0 to 5188
Series name: commonhold_details
Non-Null Count  Dtype 
--------------  ----- 
426 non-null    object
dtypes: object(1)
memory usage: 40.7+ KB


array(['Not available', nan], dtype=object)

### Points of Interest

In [22]:
df_raw['points_of_interest'][0]

'London Road (Brighton)\n0.1 miles,Bellerbys College Brighton\n0.2 miles,Downs Infant School\n0.3 miles,Brighton\n0.4 miles'

In [23]:
df_raw['points_of_interest'].describe()

count                                                  5092
unique                                                 3267
top       St Andrew's CofE (Aided) Primary School\n0.5 m...
freq                                                     33
Name: points_of_interest, dtype: object

In [24]:
df_info = df_raw[['property_id','points_of_interest']].copy()
df_info['points_of_interest'] = df_info['points_of_interest'].str.split(',')
df_info

Unnamed: 0,property_id,points_of_interest
0,1,"[London Road (Brighton)\n0.1 miles, Bellerbys ..."
1,2,"[Middle Street Primary School\n0.1 miles, St P..."
2,3,"[Carden Nursery and Primary School\n0.3 miles,..."
3,4,
4,5,"[London Road (Brighton)\n0.1 miles, Downs Infa..."
...,...,...
5184,5185,"[Lewes Old Grammar School\n0.3 miles, Lewes\n0..."
5185,5186,[South Malling CofE Primary and Nursery School...
5186,5187,"[Priory School\n0.4 miles, Lewes\n0.4 miles, S..."
5187,5188,"[Priory School\n0.3 miles, Lewes\n0.4 miles, W..."


In [25]:
def get_length(array):
    """
    Gets the length of each array in dataframe column.
    Returns int
    """
    if isinstance(array, list):
        return len(array)
    else:
        return 0

df_info['length'] = df_info['points_of_interest'].apply(get_length)
df_info.tail(30)

Unnamed: 0,property_id,points_of_interest,length
5159,5160,"[Lewes\n0.1 miles, Western Road Community Prim...",4
5160,5161,"[Lewes\n0.2 miles, Priory School\n0.3 miles, W...",4
5161,5162,"[Wallands Community Primary School\n0.2 miles,...",4
5162,5163,"[Lewes Old Grammar School\n0.2 miles, Western ...",4
5163,5164,"[Priory School\n0.3 miles, Lewes\n0.4 miles, S...",4
5164,5165,"[Wallands Community Primary School\n0.2 miles,...",4
5165,5166,"[Lewes\n0.1 miles, Priory School\n0.3 miles, W...",4
5166,5167,[South Malling CofE Primary and Nursery School...,4
5167,5168,"[Hamsey Community Primary School\n1.3 miles, C...",4
5168,5169,[Western Road Community Primary School\n0 mile...,4


In [26]:
extracted_points = []

def get_info(row):
    # Initialize dictionary with np.nan values
    data = {
        'property_id': row.property_id,
        'school_1_name': np.nan,
        'school_1_distance': np.nan,
        'school_2_name': np.nan,
        'school_2_distance': np.nan,
        'train_1_name': np.nan,
        'train_1_distance': np.nan,
        'train_2_name': np.nan,
        'train_2_distance': np.nan
    }

    if row.length == 4:
        schools_found = 0
        trains_found = 0
        school_words = ['school', 'boys', 'girls', 'primary', 'tutorial', 'catholic', 'academy', 'college']

        for info in row.points_of_interest:
            name, distance = info.split('\n')
            name =  name.lower()
            distance = float(re.sub(r'[^0-9.]', '', distance))
            if any(sub in name for sub in school_words):
                if schools_found == 0:
                    data['school_1_name'] = name
                    data['school_1_distance'] = distance
                    schools_found += 1
                elif schools_found == 1:
                    data['school_2_name'] = name
                    data['school_2_distance'] = distance
            else:
                if trains_found == 0:
                    data['train_1_name'] = name
                    data['train_1_distance'] = distance
                    trains_found += 1
                elif trains_found == 1:
                    data['train_2_name'] = name
                    data['train_2_distance'] = distance
    
    return data
#     print(data)

# Sample test
# test = df_info.loc[0, :]
# get_info(test)
# extracted_points

for row in df_info.itertuples():
    value = get_info(row)
    extracted_points.append(value)

len(extracted_points)

5189

In [27]:
df_info_2 = pd.DataFrame(extracted_points)
df_info_2.head()

Unnamed: 0,property_id,school_1_name,school_1_distance,school_2_name,school_2_distance,train_1_name,train_1_distance,train_2_name,train_2_distance
0,1,bellerbys college brighton,0.2,downs infant school,0.3,london road (brighton),0.1,brighton,0.4
1,2,middle street primary school,0.1,st paul's cofe primary school and nursery,0.3,brighton,0.5,london road (brighton),1.0
2,3,carden nursery and primary school,0.3,the dharma primary school,0.4,moulsecoomb,1.1,preston park,1.3
3,4,,,,,,,,
4,5,downs infant school,0.2,downs junior school,0.3,london road (brighton),0.1,brighton,0.5


In [28]:
# merge dataframes to original dataframe
df_clean_2 = pd.merge(df_clean, df_info_2, on='property_id')
df_clean_2.head()

Unnamed: 0,property_id,price,address,house_type,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,tenure,lease_time,...,latitude,longitude,school_1_name,school_1_distance,school_2_name,school_2_distance,train_1_name,train_1_distance,train_2_name,train_2_distance
0,1,300000.0,"Clyde Road, Brighton BN1",flat,2.0,1.0,1.0,,share of freehold,,...,50.834835,-0.136658,bellerbys college brighton,0.2,downs infant school,0.3,london road (brighton),0.1,brighton,0.4
1,2,295000.0,"North Street, Brighton BN1",flat,1.0,,,,not available,,...,50.828022,-0.136807,middle street primary school,0.1,st paul's cofe primary school and nursery,0.3,brighton,0.5,london road (brighton),1.0
2,3,450000.0,"Crabtree Avenue, Brighton BN1",terraced house,3.0,1.0,2.0,,freehold,,...,50.860744,-0.131786,carden nursery and primary school,0.3,the dharma primary school,0.4,moulsecoomb,1.1,preston park,1.3
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,425000.0,"Warleigh Road, Brighton BN1",maisonette,2.0,1.0,1.0,,leasehold,120.0,...,50.834766,-0.135156,downs infant school,0.2,downs junior school,0.3,london road (brighton),0.1,brighton,0.5


In [29]:
df_clean_2.columns

Index(['property_id', 'price', 'address', 'house_type', 'number_of_bedrooms',
       'number_of_bathrooms', 'number_of_receptions', 'other_features',
       'tenure', 'lease_time', 'service_charge', 'tax_band', 'ground_rent',
       'commonhold_details', 'points_of_interest', 'listing_features',
       'description_text', 'property_link', 'postcode', 'latitude',
       'longitude', 'school_1_name', 'school_1_distance', 'school_2_name',
       'school_2_distance', 'train_1_name', 'train_1_distance', 'train_2_name',
       'train_2_distance'],
      dtype='object')

### Description Text

In [30]:
df_clean_2['description_text'] = df_clean_2['description_text'].str.strip().str.replace('*', '').str.lower()

  df_clean_2['description_text'] = df_clean_2['description_text'].str.strip().str.replace('*', '').str.lower()


In [31]:
df_clean_2['description_text']

0       description\n\na wonderful split-level apartme...
1       perfect investment property or first time buy!...
2       summary of accommodation\n\nground floor entra...
3                                                     NaN
4       avard estate agents are pleased to offer for s...
                              ...                        
5184    a unique grade ii listed property spanning ove...
5185    final one bedroom apartment remaining; plot 4a...
5186    summary\nnestled in the heart lewes is this 2 ...
5187    rarely available building plot with planning p...
5188    a modernised, first floor retirement apartment...
Name: description_text, Length: 5189, dtype: object

### Latitude

In [32]:
df_clean_2['latitude']

0       50.834835
1       50.828022
2       50.860744
3             NaN
4       50.834766
          ...    
5184    50.874866
5185          NaN
5186    50.874283
5187    50.874283
5188          NaN
Name: latitude, Length: 5189, dtype: float64

### Longitude

In [33]:
df_clean_2['longitude']

0      -0.136658
1      -0.136807
2      -0.131786
3            NaN
4      -0.135156
          ...   
5184    0.011649
5185         NaN
5186    0.019230
5187    0.019230
5188         NaN
Name: longitude, Length: 5189, dtype: float64

### Postcode

In [34]:
df_clean_2['postcode']

0       BN1
1       BN1
2       BN1
3       BN1
4       BN1
       ... 
5184    BN7
5185    BN7
5186    BN7
5187    BN7
5188    BN7
Name: postcode, Length: 5189, dtype: object

### School distance

In [35]:
df_clean_2['school_1_distance'] 

0       0.2
1       0.1
2       0.3
3       NaN
4       0.2
       ... 
5184    0.3
5185    0.2
5186    0.4
5187    0.3
5188    0.3
Name: school_1_distance, Length: 5189, dtype: float64

In [36]:
df_clean_2.describe()

Unnamed: 0,property_id,price,number_of_bedrooms,number_of_bathrooms,number_of_receptions,other_features,lease_time,latitude,longitude,school_1_distance,school_2_distance,train_1_distance,train_2_distance
count,5189.0,5092.0,4920.0,4791.0,4630.0,303.0,758.0,2554.0,2554.0,4645.0,4151.0,4645.0,4645.0
mean,2595.0,506879.7,2.809959,1.551242,1.464147,15734.33,314.478892,50.130247,0.551715,0.339419,0.565936,1.094682,1.684952
std,1498.079604,459409.9,1.513866,0.916238,0.794987,250174.0,369.050449,7.516191,14.425469,0.297453,0.491676,1.114011,1.170459
min,1.0,5000.0,1.0,1.0,1.0,36.0,1.0,-45.948929,-104.771187,0.0,0.0,0.0,0.1
25%,1298.0,290000.0,2.0,1.0,1.0,648.5,95.0,50.810425,-0.22039,0.2,0.3,0.4,0.9
50%,2595.0,418747.5,3.0,1.0,1.0,915.0,117.0,50.827091,-0.13614,0.3,0.4,0.7,1.4
75%,3892.0,600000.0,4.0,2.0,2.0,1286.0,249.75,50.840504,0.227991,0.4,0.6,1.3,2.0
max,5189.0,15000000.0,50.0,24.0,23.0,4356000.0,999.0,56.557793,170.3293,4.5,5.9,7.4,7.9


In [37]:
df_clean_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5189 entries, 0 to 5188
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   property_id           5189 non-null   int64  
 1   price                 5092 non-null   float64
 2   address               5092 non-null   object 
 3   house_type            5092 non-null   object 
 4   number_of_bedrooms    4920 non-null   float64
 5   number_of_bathrooms   4791 non-null   float64
 6   number_of_receptions  4630 non-null   float64
 7   other_features        303 non-null    float64
 8   tenure                5092 non-null   object 
 9   lease_time            758 non-null    float64
 10  service_charge        1614 non-null   object 
 11  tax_band              5092 non-null   object 
 12  ground_rent           1606 non-null   object 
 13  commonhold_details    426 non-null    object 
 14  points_of_interest    5092 non-null   object 
 15  listing_features     

## Save cleaned dataset

In [39]:
df_clean_2.to_csv('./Data/brighton_cleaned_data.csv', index=False)