In [345]:
import pandas as pd
feature_columns = ['price', 'bedrooms', 'address', 'latitude', 'longitude', 'propertyType','summary']
rm = pd.read_csv('right_move_data.csv', usecols=feature_columns)

In [346]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20169 entries, 0 to 20168
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         20169 non-null  int64  
 1   bedrooms      19696 non-null  float64
 2   address       20169 non-null  object 
 3   latitude      20169 non-null  float64
 4   longitude     20169 non-null  float64
 5   propertyType  20169 non-null  object 
 6   summary       20169 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 1.1+ MB


In [347]:
rm.shape

(20169, 7)

In [348]:
# - Looks like there are a number of duplicate property types in the list which is being caused by capital letters - 
# - Remove all caps 
rm.propertyType.value_counts()

apartment                   6168
flat                        5520
terraced house              2163
semi-detached house         1451
detached house              1005
house                        799
end of terrace house         618
maisonette                   558
penthouse                    340
Studio flat                  217
retirement property          175
ground floor flat            144
bungalow                     125
town house                   116
Studio apartment             104
duplex                        97
detached bungalow             80
property                      76
mews house                    76
Land                          62
ground maisonette             43
semi-detached bungalow        34
cottage                       32
block of apartments           21
link detached house           18
Property                      16
park home                     15
Detached house                14
Terraced house                13
Plot                          13
House     

In [349]:
rm = rm.apply(lambda x: x.str.lower()if x.dtype =='object' else x)

In [350]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20169 entries, 0 to 20168
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         20169 non-null  int64  
 1   bedrooms      19696 non-null  float64
 2   address       20169 non-null  object 
 3   latitude      20169 non-null  float64
 4   longitude     20169 non-null  float64
 5   propertyType  20169 non-null  object 
 6   summary       20169 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 1.1+ MB


In [351]:
#There are 473 property listings which do not have details re the number of bedrooms
rm.isnull().sum()

price             0
bedrooms        473
address           0
latitude          0
longitude         0
propertyType      0
summary           0
dtype: int64

In [352]:
# update studio flat and Ground floor studio flat  = studio apartment as they are the same 
# 321 of the 473 rows where bedroom = NaN are studio apartments so I can update these records to have 0 bedrooms
rm.loc[rm['propertyType'].isin(['studio flat', 'ground floor studio flat']), 'propertyType'] = 'studio apartment'
rm.loc[(rm['bedrooms'].isnull()) & (rm['propertyType'] == 'studio apartment'), 'bedrooms'] = 0

In [353]:
# I want to see the remaining bedrooms = nan and their corresponding property types

rm[rm.bedrooms.isnull()].groupby(rm['propertyType']).count()

Unnamed: 0_level_0,price,bedrooms,address,latitude,longitude,propertyType,summary
propertyType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
block of apartments,6,0,6,6,6,6,6
bungalow,1,0,1,1,1,1,1
detached house,14,0,14,14,14,14,14
end of terrace house,3,0,3,3,3,3,3
hotel room,2,0,2,2,2,2,2
house,8,0,8,8,8,8,8
land,62,0,62,62,62,62,62
parking,7,0,7,7,7,7,7
penthouse,2,0,2,2,2,2,2
plot,13,0,13,13,13,13,13


In [354]:
# remove the following property types from the dataset: Land, Parking, Plot, Block of apartments, hotel rooms
rm.drop(rm.loc[rm['propertyType'].isin(['land', 'parking', 'plot','block of apartments','garages', 'hotel room', 'house boat', 'park home'])].index, inplace=True)


In [355]:
# the remaining property types that do have bedrooms = NaN are as follows:

rm[rm.bedrooms.isnull()].groupby(rm['propertyType']).count()

Unnamed: 0_level_0,price,bedrooms,address,latitude,longitude,propertyType,summary
propertyType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bungalow,1,0,1,1,1,1,1
detached house,14,0,14,14,14,14,14
end of terrace house,3,0,3,3,3,3,3
house,8,0,8,8,8,8,8
penthouse,2,0,2,2,2,2,2
property,16,0,16,16,16,16,16
semi-detached house,4,0,4,4,4,4,4
terraced house,13,0,13,13,13,13,13


In [356]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20035 entries, 0 to 20168
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         20035 non-null  int64  
 1   bedrooms      19974 non-null  float64
 2   address       20035 non-null  object 
 3   latitude      20035 non-null  float64
 4   longitude     20035 non-null  float64
 5   propertyType  20035 non-null  object 
 6   summary       20035 non-null  object 
dtypes: float64(3), int64(1), object(3)
memory usage: 1.7+ MB


In [357]:
# Remove the 61 rows where bedroom = na
# because the bedroom column is the only one with na, we can simply use the dropna function

rm.dropna(inplace=True)

In [358]:
rm.isnull().sum()

price           0
bedrooms        0
address         0
latitude        0
longitude       0
propertyType    0
summary         0
dtype: int64

In [359]:
#See of garden is mentioned in the summary 
rm['garden'] = rm['summary'].str.contains('garden')
rm.garden.sum()

4277

In [360]:
rm.to_csv(r'C:\Users\jwpow\data_science\Final Project\Rightmove data\rm_clean.csv')

In [361]:
# - Looks like there are a number of duplicate property types in the list which is being caused by capital letters - 
# - Remove all caps 
rm.propertyType.value_counts()

apartment                 6168
flat                      5520
terraced house            2163
semi-detached house       1451
detached house            1005
house                      799
end of terrace house       618
maisonette                 558
penthouse                  340
studio apartment           322
retirement property        175
ground floor flat          144
bungalow                   125
town house                 116
duplex                      97
detached bungalow           80
mews house                  76
property                    76
ground maisonette           43
semi-detached bungalow      34
cottage                     32
link detached house         18
serviced apartment           4
chalet                       2
lodge                        2
country house                1
terraced bungalow            1
cluster house                1
equestrian facility          1
triplex                      1
manor house                  1
Name: propertyType, dtype: int64

In [362]:
#There are too many property types in the list so I have decided to combine the types that are
#similar / the same:
    # apartments, flats, serviced apartment, ground floor flat
    # terraced house, semi-detacthed house, cluster house 
    # detacthed house, house, town house, link detached house
    # duplex,  ground maisonette, triplex 
    # detached bungalow,semi-detached bungalow, 
    # manor house equestrian facility chalet lodge
    
rm.loc[rm.propertyType.isin(['flat','serviced apartment','ground floor flat']), 'propertyType'] = 'apartment' 
rm.loc[rm.propertyType.isin(['terraced house','semi-detacthed house', 'cluster house']), 'propertyType'] = 'terraced house'  
rm.loc[rm.propertyType.isin(['detacthed house','town house', 'house', 'link detached house']), 'propertyType'] = 'detached house'  
rm.loc[rm.propertyType.isin(['duplex','ground maisonette', 'triplex']), 'propertyType'] = 'maisonette'  
rm.loc[rm.propertyType.isin(['detached bungalow','semi-detached bungalow','terraced bungalow']), 'propertyType'] = 'bungalow'  
rm.loc[rm.propertyType.isin(['manor house','equestrian facility', 'chalet', 'lodge']), 'propertyType'] = 'country house'  

rm.propertyType.value_counts()

apartment               11836
terraced house           2164
detached house           1938
semi-detached house      1451
maisonette                699
end of terrace house      618
penthouse                 340
studio apartment          322
bungalow                  240
retirement property       175
mews house                 76
property                   76
cottage                    32
country house               7
Name: propertyType, dtype: int64

In [363]:
# property type = 'property' appears to be a mix of houses and apartments
rm[rm['propertyType'] == 'property']

Unnamed: 0,price,bedrooms,address,latitude,longitude,propertyType,summary,garden
161,1800000,3.0,"arkley lane, barnet",51.653180,-0.230322,property,a rare opportunity to acquire a bespoke detach...,False
539,950000,3.0,"ravenscroft avenue, golders green",51.576897,-0.199075,property,exclusive marketing and sales agents for this ...,True
808,700000,3.0,"crescent road, new barnet",51.645714,-0.163772,property,a large semi-detached home offering family siz...,True
1351,475000,4.0,"geary court, geary drive, brentwood",51.626095,0.298930,property,**open house saturday 19th 9:30-11:30** call y...,False
1750,3500000,2.0,"belmont street, london, nw1",51.543900,-0.150989,property,a simply stunning loft style apartment set on ...,False
...,...,...,...,...,...,...,...,...
19944,3590000,1.0,"marylebone square, moxon street",51.519130,-0.152810,property,"a third floor, east facing one bedroom apartme...",False
20007,3400000,3.0,"millbank quarter, westminster, sw1p",51.494760,-0.125313,property,"off plan | completion q1 2022 | 1,485 sq ft | ...",False
20084,3173000,1.0,"marylebone lane, marylebone, w1u",51.518060,-0.151280,property,an amazing 1 bedroom apartment situated on the...,False
20103,3173000,1.0,"marylebone square, moxon street",51.519130,-0.152810,property,we are honoured to present marylebone square. ...,False


In [375]:
mask = rm['propertyType'] == 'property'
rm.drop(rm[mask].index, inplace=True)

In [376]:
#pull the postcodes from the address column, they appear to be the last part of the address string, separated by a comma
rm['postcode'] = rm['address'].str.rsplit(',').str[-1]

In [377]:
#reviewing the postcodes column coloser, it appears that not all postcodes are provided for every address so,
#drop the postcodes column - I will need to find another way of finding these postcodes
#rm.to_csv(r'..\Data Dumps JSON\Examine.csv') 

rm.drop('postcode', axis=1, inplace=True)

In [379]:
rm.to_csv('right_move_data_wip.csv')