<h1>Data cleaning</h1>
<p>In this module we will:</p>
<ul>
    <li>Fix structural errors</li>
    <li>Remove unwanted observations</li>
    <li>Label missing categorical data</li>
    <li>Drop unwanted outliers</li>
    <li>Flag and fill missing numerical data</li>
</ul>

In [1]:
# import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 
import seaborn as sns

In [2]:
# Read the Kenya property listings CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="project_files/kenya_listings.csv")

<h2>Flag and fill missing numerical data</h2>
<p>Checking null/missing values in the dataset</p>

In [3]:
# Per-column missing-data summary: absolute NaN count and its share of all rows (in %).
count = df.isnull().sum()
percent = count.div(df.shape[0]).mul(100)
# Passing a dict to concat uses its keys as the resulting column labels.
null = pd.concat(
    {'Missing values': count, '% Missing values': percent},
    axis=1,
)
null

Unnamed: 0,Missing values,% Missing values
id,0,0.0
price,3,0.018614
price_qualifier,8870,55.035056
bedrooms,0,0.0
bathrooms,0,0.0
toilets,10028,62.220016
furnished,0,0.0
serviced,0,0.0
shared,0,0.0
parking,0,0.0


In [4]:
# A missing toilet count is taken to mean zero toilets, so overwrite only the NaN entries with 0.
df.loc[df['toilets'].isna(), 'toilets'] = 0

In [5]:
# Replace the few missing prices with 0 so the column has no NaN left.
# NOTE(review): 0 is not a realistic listing price; consider dropping these rows
# (or flagging them) instead of filling — confirm against how `price` is used later.
df.loc[df['price'].isna(), 'price'] = 0

<h2>Fixing structural errors</h2>

In [6]:
# List every distinct price_qualifier label with its frequency to spot inconsistent spellings.
df['price_qualifier'].value_counts()

per month                       6227
per calendar month               373
per plot                         228
per acre                         128
per square foot / per month      112
per day                           62
per square meter / per month      55
per square foot / per week        46
per hectare                        6
per square meter / per week        5
per square foot / per annum        3
per square meter                   1
per square foot                    1
Name: price_qualifier, dtype: int64

In [7]:
# Frequency of each property type (bracket access avoids confusion with a DataFrame attribute).
df['type'].value_counts()

House                   6026
Apartment               5840
Land                    2669
Commercial Property     1580
Event Centre / Venue       2
Name: type, dtype: int64

In [8]:
# Frequency of each property sub-type label.
df['sub_type'].value_counts()

Townhouse                               1563
Office Space                             872
Residential Land                         810
Detached Duplex                          630
Detached Bungalow                        579
Mixed-use Land                           376
Semi-detached Bungalow                   286
Mini Flat                                244
Commercial Land                          205
Bedsitter (Single Room)                  191
Warehouse                                186
Semi-detached Duplex                      74
Terraced Duplex                           55
Plaza / Complex / Mall                    45
Shop                                      45
Block of Flats                            30
Terraced Bungalow                         29
Hotel / Guest House                       27
Restaurant / Bar                          24
Industrial Land                           14
School                                     5
Factory                                    3
Filling St

In [9]:
# Number of listings per state, most frequent first.
df['state'].value_counts()

Nairobi          8935
Kiambu           2694
Kajiado          1294
Mombasa          1125
Machakos          534
Kilifi            480
Nakuru            290
Kisumu            150
Laikipia           97
Kwale              78
Embu               44
Uasin Gishu        43
Meru               42
Nyeri              39
Nandi              38
Makueni            30
Baringo            28
Muranga            27
Bungoma            21
Kirinyaga          17
Kitui              13
Nyandarua          12
Kericho            11
Trans Nzoia        10
Kakamega           10
Kisii               9
Isiolo              7
Lamu                6
Busia               6
Narok               4
Homa Bay            3
Tharaka-Nithi       3
Bomet               3
Garissa             2
Siaya               2
Migori              2
Vihiga              2
Samburu             2
Turkana             1
Marsabit            1
West Pokot          1
Taita Taveta        1
Name: state, dtype: int64

In [10]:
# Number of listings per locality, most frequent first.
df['locality'].value_counts()

Westlands          2446
Kilimani           1199
Kikuyu             1064
Lavington           891
Karen               750
                   ... 
Akirang'Ondu          1
Chania                1
Gatitu/Muruguru       1
Esise                 1
Kiganjo/Mathari       1
Name: locality, Length: 357, dtype: int64

In [11]:
# Number of listings per sub-locality, most frequent first.
df['sub_locality'].value_counts()

Runda              553
Loresho            107
South C             83
Industrial Area     80
Thigio              34
Muthaiga North      23
Old Muthaiga        22
Chiromo             20
South B             19
Rimpa               18
Tassia              13
Yukos               11
Imara Daima         10
Githurai 44         10
Rosslyn              9
Clay City            8
Githurai 45          5
Umoja Phase 1        5
Lucky Summer         4
New Muthaiga         3
Umoja Phase 2        3
Mukuru Village       1
Kiembeni             1
Kariba               1
Kihingo              1
Kwa Njenga           1
Lindi                1
Name: sub_locality, dtype: int64

In [12]:
# Frequency of each listing category label.
df['category'].value_counts()

For Sale         9108
For Rent         6916
Short Let          84
Joint Venture       9
Name: category, dtype: int64

<h3>No structural errors were noted.</h3>

<h2>Labelling missing categorical features</h2>

In [13]:
# Count missing values in each object-dtype (categorical/text) column before labelling them.
df.select_dtypes(include='object').isna().sum()

price_qualifier     8870
category               0
type                   0
sub_type            9817
state                  0
locality               6
sub_locality       15071
listdate               0
dtype: int64

In [14]:
# Give missing categorical values an explicit 'Missing' label, for all object-dtype columns at once.
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].fillna('Missing')

In [15]:
# Re-check the object-dtype columns: every count should now be zero.
df.select_dtypes(include='object').isna().sum()

price_qualifier    0
category           0
type               0
sub_type           0
state              0
locality           0
sub_locality       0
listdate           0
dtype: int64

<h2>Dropping unwanted observations</h2>

In [18]:
#removing duplicates in the dataset
df=df.drop_duplicates()
df.shape

(16117, 17)

<p>The following property types will be removed from the dataset:</p>
<ul>
    <li>Land, commercial property and event centres</li>
</ul>

In [26]:
# Keep only residential listings: rows whose type is House or Apartment.
df = df[df['type'].isin(['House', 'Apartment'])]

In [29]:
# Confirm that only the House and Apartment types remain after filtering.
df['type'].value_counts()

House        6026
Apartment    5840
Name: type, dtype: int64

In [32]:
# Price qualifier labels still present among the remaining listings.
df['price_qualifier'].value_counts()

Missing               6251
per month             5184
per calendar month     370
per day                 61
Name: price_qualifier, dtype: int64

In [47]:
# Show up to 30 listings priced per day, most expensive first.
df[df['price_qualifier'].eq('per day')].sort_values('price', ascending=False).head(30)

Unnamed: 0,id,price,price_qualifier,bedrooms,bathrooms,toilets,furnished,serviced,shared,parking,category,type,sub_type,state,locality,sub_locality,listdate
15894,15902,40000.0,per day,4,4,5.0,1,0,0,2,Short Let,House,Terraced Bungalow,Kilifi,Malindi Town,Missing,2022-06-06 21:49:48
6315,6316,30000.0,per day,4,0,0.0,0,0,0,0,Short Let,House,Missing,Makueni,Emali/Mulala,Missing,2020-08-22 23:50:39
4345,4346,25000.0,per day,4,0,0.0,0,0,0,0,Short Let,House,Detached Duplex,Kilifi,Malindi Town,Missing,2020-08-01 23:40:42
8812,8813,25000.0,per day,4,4,5.0,1,0,0,2,Short Let,House,Townhouse,Mombasa,Shanzu,Missing,2021-06-22 08:42:02
11827,11831,18000.0,per day,5,5,5.0,0,0,1,4,Short Let,House,Missing,Mombasa,Nyali,Missing,2021-12-15 17:48:00
15886,15894,18000.0,per day,3,32,4.0,1,0,0,2,Short Let,Apartment,Missing,Mombasa,Nyali,Missing,2022-06-06 20:03:52
15862,15870,18000.0,per day,4,4,4.0,1,0,0,3,Short Let,House,Townhouse,Kilifi,Watamu,Missing,2022-06-06 09:34:15
14952,14960,17000.0,per day,3,3,4.0,1,1,0,2,Short Let,Apartment,Missing,Mombasa,Nyali,Missing,2022-04-08 13:38:23
15893,15901,15000.0,per day,3,3,4.0,1,0,0,2,Short Let,Apartment,Missing,Mombasa,Shanzu,Missing,2022-06-06 21:34:15
15891,15899,15000.0,per day,4,3,4.0,1,0,0,2,Short Let,House,Detached Bungalow,Kilifi,Malindi Town,Missing,2022-06-06 21:24:29


In [48]:
# Number of remaining listings per state, most frequent first.
df['state'].value_counts()

Nairobi          7332
Kiambu           1552
Kajiado           882
Mombasa           881
Machakos          354
Kilifi            246
Nakuru            155
Kisumu            111
Uasin Gishu        37
Laikipia           35
Kwale              33
Nandi              31
Embu               29
Meru               27
Baringo            23
Nyeri              19
Muranga            17
Makueni            15
Kitui              10
Bungoma            10
Kirinyaga           9
Trans Nzoia         8
Kakamega            7
Isiolo              7
Kericho             7
Kisii               6
Bomet               3
Busia               3
Nyandarua           3
Homa Bay            2
Tharaka-Nithi       2
Lamu                2
Garissa             1
Turkana             1
Narok               1
Migori              1
Samburu             1
Marsabit            1
Vihiga              1
West Pokot          1
Name: state, dtype: int64

In [52]:
# Show the listings located in Uasin Gishu (at most the first 37 rows).
df[df['state'].eq('Uasin Gishu')].head(37)

Unnamed: 0,id,price,price_qualifier,bedrooms,bathrooms,toilets,furnished,serviced,shared,parking,category,type,sub_type,state,locality,sub_locality,listdate
4945,4946,6500000.0,Missing,3,0,0.0,0,0,0,0,For Sale,House,Missing,Uasin Gishu,Kimumu,Missing,2020-08-10 14:07:28
4952,4953,7500000.0,Missing,3,0,0.0,0,0,0,0,For Sale,House,Detached Bungalow,Uasin Gishu,Kimumu,Missing,2020-08-10 14:53:25
5570,5571,320000000.0,Missing,6,0,0.0,0,0,0,0,For Sale,House,Townhouse,Uasin Gishu,Karuna/Meibeki,Missing,2020-08-15 19:24:58
5984,5985,9950000.0,Missing,3,0,0.0,0,0,0,0,For Sale,Apartment,Missing,Uasin Gishu,Racecourse,Missing,2020-08-18 01:07:36
6798,6799,65000.0,per month,4,4,0.0,0,0,0,0,For Rent,House,Missing,Uasin Gishu,Karuna/Meibeki,Missing,2020-08-24 18:30:51
8860,8861,90000.0,per calendar month,6,6,7.0,1,0,0,2,For Rent,House,Townhouse,Uasin Gishu,Racecourse,Missing,2021-06-24 14:51:48
8992,8993,35000.0,per month,3,2,2.0,0,0,0,0,For Rent,Apartment,Missing,Uasin Gishu,Eldoret,Missing,2021-07-07 16:48:07
8993,8994,6500000.0,Missing,3,2,2.0,0,0,0,0,For Sale,Apartment,Missing,Uasin Gishu,Eldoret,Missing,2021-07-07 16:54:53
8994,8995,10000000.0,Missing,4,4,5.0,0,0,0,0,For Sale,House,Townhouse,Uasin Gishu,Eldoret,Missing,2021-07-07 17:04:07
8995,8996,60000.0,per calendar month,4,4,5.0,0,0,0,0,For Rent,House,Townhouse,Uasin Gishu,Eldoret,Missing,2021-07-07 17:12:54
