In [17]:
import pandas as pd

ORIGINAL_FILE = r'../Data/rm_working_files/right_move_data_eda.csv'
UPDATED_FILE = r'../Data/rm_working_files/right_move_data_eda_update.csv'

feature_columns = ['price', 'bedrooms', 'address', 'latitude', 'longitude', 'propertyType','summary', 'bourough']
rm = pd.read_csv(ORIGINAL_FILE, usecols=feature_columns)

In [2]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20169 entries, 0 to 20168
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   identifier    20169 non-null  int64  
 1   price         20169 non-null  int64  
 2   bedrooms      19696 non-null  float64
 3   address       20169 non-null  object 
 4   latitude      20169 non-null  float64
 5   longitude     20169 non-null  float64
 6   propertyType  20169 non-null  object 
 7   summary       20169 non-null  object 
 8   bourough      20169 non-null  object 
dtypes: float64(3), int64(2), object(4)
memory usage: 1.4+ MB


In [99]:
rm.shape

(20169, 8)

In [100]:
# - Looks like there are a number of duplicate property types in the list which is being caused by capital letters - 
# - Remove all caps 
rm.propertyType.value_counts()

apartment                   6168
flat                        5520
terraced house              2163
semi-detached house         1451
detached house              1005
house                        799
end of terrace house         618
maisonette                   558
penthouse                    340
Studio flat                  217
retirement property          175
ground floor flat            144
bungalow                     125
town house                   116
Studio apartment             104
duplex                        97
detached bungalow             80
mews house                    76
property                      76
Land                          62
ground maisonette             43
semi-detached bungalow        34
cottage                       32
block of apartments           21
link detached house           18
Property                      16
park home                     15
Detached house                14
Terraced house                13
Plot                          13
House     

In [3]:
rm = rm.apply(lambda x: x.str.lower()if x.dtype =='object' else x)

In [4]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20169 entries, 0 to 20168
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   identifier    20169 non-null  int64  
 1   price         20169 non-null  int64  
 2   bedrooms      19696 non-null  float64
 3   address       20169 non-null  object 
 4   latitude      20169 non-null  float64
 5   longitude     20169 non-null  float64
 6   propertyType  20169 non-null  object 
 7   summary       20169 non-null  object 
 8   bourough      20169 non-null  object 
dtypes: float64(3), int64(2), object(4)
memory usage: 1.4+ MB


In [5]:
#There are 473 property listings which do not have details re the number of bedrooms
rm.isnull().sum()

identifier        0
price             0
bedrooms        473
address           0
latitude          0
longitude         0
propertyType      0
summary           0
bourough          0
dtype: int64

In [6]:
# update studio flat and Ground floor studio flat  = studio apartment as they are the same 
# 321 of the 473 rows where bedroom = NaN are studio apartments so I can update these records to have 0 bedrooms
rm.loc[rm['propertyType'].isin(['studio flat', 'ground floor studio flat']), 'propertyType'] = 'studio apartment'
rm.loc[(rm['bedrooms'].isnull()) & (rm['propertyType'] == 'studio apartment'), 'bedrooms'] = 0

In [7]:
# I want to see the remaining bedrooms = nan and their corresponding property types

rm[rm.bedrooms.isnull()].groupby(rm['propertyType']).count()

Unnamed: 0_level_0,identifier,price,bedrooms,address,latitude,longitude,propertyType,summary,bourough
propertyType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
block of apartments,6,6,0,6,6,6,6,6,6
bungalow,1,1,0,1,1,1,1,1,1
detached house,14,14,0,14,14,14,14,14,14
end of terrace house,3,3,0,3,3,3,3,3,3
hotel room,2,2,0,2,2,2,2,2,2
house,8,8,0,8,8,8,8,8,8
land,62,62,0,62,62,62,62,62,62
parking,7,7,0,7,7,7,7,7,7
penthouse,2,2,0,2,2,2,2,2,2
plot,13,13,0,13,13,13,13,13,13


In [8]:
# remove the following property types from the dataset: Land, Parking, Plot, Block of apartments, hotel rooms
rm.drop(rm.loc[rm['propertyType'].isin(['land', 'parking', 'plot','block of apartments','garages', 'hotel room', 'house boat', 'park home'])].index, inplace=True)


In [9]:
# the remaining property types that do have bedrooms = NaN are as follows:

rm[rm.bedrooms.isnull()].groupby(rm['propertyType']).count()

Unnamed: 0_level_0,identifier,price,bedrooms,address,latitude,longitude,propertyType,summary,bourough
propertyType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bungalow,1,1,0,1,1,1,1,1,1
detached house,14,14,0,14,14,14,14,14,14
end of terrace house,3,3,0,3,3,3,3,3,3
house,8,8,0,8,8,8,8,8,8
penthouse,2,2,0,2,2,2,2,2,2
property,16,16,0,16,16,16,16,16,16
semi-detached house,4,4,0,4,4,4,4,4,4
terraced house,13,13,0,13,13,13,13,13,13


In [10]:
rm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20035 entries, 0 to 20168
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   identifier    20035 non-null  int64  
 1   price         20035 non-null  int64  
 2   bedrooms      19974 non-null  float64
 3   address       20035 non-null  object 
 4   latitude      20035 non-null  float64
 5   longitude     20035 non-null  float64
 6   propertyType  20035 non-null  object 
 7   summary       20035 non-null  object 
 8   bourough      20035 non-null  object 
dtypes: float64(3), int64(2), object(4)
memory usage: 2.0+ MB


In [11]:
# Remove the 61 rows where bedroom = na
# because the bedroom column is the only one with na, we can simply use the dropna function

rm.dropna(inplace=True)

In [12]:
rm.isnull().sum()

identifier      0
price           0
bedrooms        0
address         0
latitude        0
longitude       0
propertyType    0
summary         0
bourough        0
dtype: int64

In [13]:
#See of garden is mentioned in the summary 
rm['outdoor space'] = rm['summary'].str.contains('balcony|garden|terrace|yard|acre',regex=True)

In [14]:
#remove .json from the end of each bourough 
# rm.bourough.str.replace('.json', '')
# df['prod_type'] = df['prod_type'].replace('respon' ,'responsvie')
rm.bourough.replace('.json', '', inplace=True, regex=True)

In [15]:
rm.to_csv(r'../Data wrangling/ids.csv')

In [113]:
# - Looks like there are a number of duplicate property types in the list which is being caused by capital letters - 
# - Remove all caps 
rm.propertyType.value_counts()

apartment                 6168
flat                      5520
terraced house            2163
semi-detached house       1451
detached house            1005
house                      799
end of terrace house       618
maisonette                 558
penthouse                  340
studio apartment           322
retirement property        175
ground floor flat          144
bungalow                   125
town house                 116
duplex                      97
detached bungalow           80
property                    76
mews house                  76
ground maisonette           43
semi-detached bungalow      34
cottage                     32
link detached house         18
serviced apartment           4
chalet                       2
lodge                        2
terraced bungalow            1
manor house                  1
triplex                      1
country house                1
cluster house                1
equestrian facility          1
Name: propertyType, dtype: int64

In [114]:
#There are too many property types in the list so I have decided to combine the types that are
#similar / the same:
    # apartments, flats, serviced apartment, ground floor flat
    # terraced house, semi-detacthed house, cluster house 
    # detacthed house, house, town house, link detached house
    # duplex,  ground maisonette, triplex 
    # detached bungalow,semi-detached bungalow, 
    # manor house equestrian facility chalet lodge
    
rm.loc[rm.propertyType.isin(['flat','serviced apartment','ground floor flat']), 'propertyType'] = 'apartment' 
rm.loc[rm.propertyType.isin(['terraced house','semi-detacthed house', 'cluster house']), 'propertyType'] = 'terraced house'  
rm.loc[rm.propertyType.isin(['detacthed house','town house', 'house', 'link detached house']), 'propertyType'] = 'detached house'  
rm.loc[rm.propertyType.isin(['duplex','ground maisonette', 'triplex']), 'propertyType'] = 'maisonette'  
rm.loc[rm.propertyType.isin(['detached bungalow','semi-detached bungalow','terraced bungalow']), 'propertyType'] = 'bungalow'  
rm.loc[rm.propertyType.isin(['manor house','equestrian facility', 'chalet', 'lodge']), 'propertyType'] = 'country house'  

rm.propertyType.value_counts()

apartment               11836
terraced house           2164
detached house           1938
semi-detached house      1451
maisonette                699
end of terrace house      618
penthouse                 340
studio apartment          322
bungalow                  240
retirement property       175
property                   76
mews house                 76
cottage                    32
country house               7
Name: propertyType, dtype: int64

In [115]:
# property type = 'property' appears to be a mix of houses and apartments
rm[rm['propertyType'] == 'property']

Unnamed: 0,price,bedrooms,address,latitude,longitude,propertyType,summary,bourough,outdoor space
161,1800000,3.0,"arkley lane, barnet",51.653180,-0.230322,property,a rare opportunity to acquire a bespoke detach...,barnet,True
539,950000,3.0,"ravenscroft avenue, golders green",51.576897,-0.199075,property,exclusive marketing and sales agents for this ...,barnet,True
808,700000,3.0,"crescent road, new barnet",51.645714,-0.163772,property,a large semi-detached home offering family siz...,barnet,True
1351,475000,4.0,"geary court, geary drive, brentwood",51.626095,0.298930,property,**open house saturday 19th 9:30-11:30** call y...,brent,False
1750,3500000,2.0,"belmont street, london, nw1",51.543900,-0.150989,property,a simply stunning loft style apartment set on ...,camden,False
...,...,...,...,...,...,...,...,...,...
19944,3590000,1.0,"marylebone square, moxon street",51.519130,-0.152810,property,"a third floor, east facing one bedroom apartme...",westminster,False
20007,3400000,3.0,"millbank quarter, westminster, sw1p",51.494760,-0.125313,property,"off plan | completion q1 2022 | 1,485 sq ft | ...",westminster,True
20084,3173000,1.0,"marylebone lane, marylebone, w1u",51.518060,-0.151280,property,an amazing 1 bedroom apartment situated on the...,westminster,True
20103,3173000,1.0,"marylebone square, moxon street",51.519130,-0.152810,property,we are honoured to present marylebone square. ...,westminster,False


In [116]:
mask = rm['propertyType'] == 'property'
rm.drop(rm[mask].index, inplace=True)

In [117]:
rm.propertyType.value_counts()

apartment               11836
terraced house           2164
detached house           1938
semi-detached house      1451
maisonette                699
end of terrace house      618
penthouse                 340
studio apartment          322
bungalow                  240
retirement property       175
mews house                 76
cottage                    32
country house               7
Name: propertyType, dtype: int64

In [118]:
#pull the postcodes from the address column, they appear to be the last part of the address string, separated by a comma
rm['postcode'] = rm['address'].str.rsplit(',').str[-1]

In [119]:
#reviewing the postcodes column coloser, it appears that not all postcodes are provided for every address so,
#drop the postcodes column - I will need to find another way of finding these postcodes
#rm.to_csv(r'..\Data Dumps JSON\Examine.csv') 

rm.drop('postcode', axis=1, inplace=True)

In [120]:
rm.head(7)

Unnamed: 0,price,bedrooms,address,latitude,longitude,propertyType,summary,bourough,outdoor space
0,40000000,10.0,"merton lane, london, n6",51.5677,-0.155048,detached house,exceptional contemporary mansion in highgate l...,barnet,True
1,40000000,10.0,"merton lane, london, n6",51.567127,-0.154896,detached house,located on a sought-after road in the heart of...,barnet,True
2,19995000,8.0,"totteridge green, totteridge, london, n20",51.628384,-0.195597,detached house,this immaculate home is set in approximately 1...,barnet,True
3,19995000,11.0,"grovelands, totteridge green, n20",51.628685,-0.196011,detached house,set amid 11.5 acres of luscious greenery this ...,barnet,True
4,19500000,6.0,"guildens, courtenay avenue, n6",51.57134,-0.167616,detached house,a rare opportunity to purchase a magnificent f...,barnet,True
5,19500000,5.0,"cannon lane, hampstead, nw3",51.5602,-0.17438,detached house,"cannon lane, in a quiet conservation area in t...",barnet,False
6,17950000,8.0,"the bishops avenue, london, n2",51.5787,-0.16968,detached house,an opulent seven bedroom house with outstandin...,barnet,False


In [121]:
rm.to_csv(UPDATED_FILE)