# Data Cleaning: Airbnb Listings

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import math
import pylab
import scipy.stats as stats
%matplotlib inline

In [2]:
cols = [
    'id',
    'host_id',
    'zipcode',
    'property_type',
    'room_type',
    'accommodates',
    'bedrooms',
    'beds',
    'bed_type',
    'price',
    'number_of_reviews',
    'review_scores_rating',
    'host_listing_count',
    'availability_30',
    'minimum_nights',
    'bathrooms'
]

data = pd.read_csv('listings.csv', usecols=cols)

In [4]:
data.head()

Unnamed: 0,id,host_id,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,minimum_nights,availability_30,number_of_reviews,review_scores_rating,host_listing_count
0,1069266,5867023,10022-4175,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$160.00,3,21,62,86.0,1
1,1846722,2631556,,Apartment,Entire home/apt,10,1.0,3.0,3.0,Real Bed,$105.00,1,28,22,85.0,2
2,2061725,4601412,11221,Apartment,Private room,2,1.0,1.0,2.0,Real Bed,$58.00,3,4,35,98.0,4
3,44974,198425,10011,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$185.00,10,1,26,96.0,1
4,4701675,22590025,10011,Apartment,Entire home/apt,2,1.0,1.0,2.0,Real Bed,$195.00,1,30,1,100.0,1


In [46]:
len(data['zipcode'][data.zipcode.isnull()]) #null count
#data.info() #total detail 
#check the number of missing values in each individua column
#for col in data.columns:
#    print (col + ', Number of Missing Values:', len(data[col][data[col].isnull()]))
data.isnull().sum() #good method

id                         0
host_id                    0
zipcode                  162
property_type              6
room_type                  0
accommodates               0
bathrooms                463
bedrooms                 140
beds                      98
bed_type                   0
price                      0
minimum_nights             0
availability_30            0
number_of_reviews          0
review_scores_rating    8657
host_listing_count         0
dtype: int64

### 1. Remove NaN values from dataframe except review_scores_rating

In [49]:
original = len(data)
data = data.dropna(how='any', subset=['zipcode', 'property_type', 'bedrooms', 'beds', 'bathrooms'])
print('Number of Nan values removed:', original - len(data))

Number of Nan values removed: 769


### 2. Convert formatting for price from $1.00 into a float of 1.00

In [64]:
data['price'] = (data['price'].str.replace(r'[^-+\d.]', '').astype(float))

AttributeError: Can only use .str accessor with string values!

### 3. Drop any invalid values

In [68]:
print ('Number of Accommodates 0:', len(data[data['accommodates'] == 0]))
print ('Number of Bedrooms 0:', len(data[data['bedrooms'] == 0]))
print ('Number of Beds 0:', len(data[data['beds'] == 0]))
print ('Number of Listings with Price $0.00:', len(data[data['price'] == 0.00]))

data = data[data['accommodates'] != 0]
data = data[data['bedrooms'] != 0]
data = data[data['beds'] != 0]
data = data[data['price'] != 0.00]

Number of Accommodates 0: 0
Number of Bedrooms 0: 2321
Number of Beds 0: 0
Number of Listings with Price $0.00: 0


### 4. Convert Zipcode to 5 digits

In [75]:
data.zipcode = data['zipcode'].str.replace(r'-\d+', '')

### 5. Let's explore distribution of accommodates

In [83]:
print('Number of Unique Accomodation: ', np.unique(data['accommodates']))
for i in range(0, 16):
    print('Accommodation {}:'.format(i+1), len(data[data.accommodates == i+1]))

Number of Unique Accomodation:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
Accommodation 1: 2643
Accommodation 2: 11400
Accommodation 3: 2909
Accommodation 4: 4278
Accommodation 5: 982
Accommodation 6: 1214
Accommodation 7: 217
Accommodation 8: 333
Accommodation 9: 57
Accommodation 10: 119
Accommodation 11: 15
Accommodation 12: 43
Accommodation 13: 4
Accommodation 14: 14
Accommodation 15: 5
Accommodation 16: 69


In [105]:
data.groupby(['accommodates']).agg('count')

Unnamed: 0_level_0,id,host_id,zipcode,property_type,room_type,bathrooms,bedrooms,beds,bed_type,price,minimum_nights,availability_30,number_of_reviews,review_scores_rating,host_listing_count
accommodates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,2643,2643,2643,2643,2643,2643,2643,2643,2643,2643,2643,2643,2643,1501,2643
2,11400,11400,11400,11400,11400,11400,11400,11400,11400,11400,11400,11400,11400,7798,11400
3,2909,2909,2909,2909,2909,2909,2909,2909,2909,2909,2909,2909,2909,2092,2909
4,4278,4278,4278,4278,4278,4278,4278,4278,4278,4278,4278,4278,4278,3017,4278
5,982,982,982,982,982,982,982,982,982,982,982,982,982,698,982
6,1214,1214,1214,1214,1214,1214,1214,1214,1214,1214,1214,1214,1214,874,1214
7,217,217,217,217,217,217,217,217,217,217,217,217,217,155,217
8,333,333,333,333,333,333,333,333,333,333,333,333,333,221,333
9,57,57,57,57,57,57,57,57,57,57,57,57,57,44,57
10,119,119,119,119,119,119,119,119,119,119,119,119,119,82,119
