In [1]:
# Required Packages
import pandas as pd
import numpy as np
import pickle

# Imputing
from sklearn.impute import SimpleImputer

# Displaying
from colorama import Fore, Back, Style

<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/6/69/Airbnb_Logo_B%C3%A9lo.svg/1280px-Airbnb_Logo_B%C3%A9lo.svg.png' width='350' align="center"/>

# Airbnb Listings (Preprocessing)

#### Table of contents
* [Removing the dollar sign](#Removing-the-dollar-sign)
* [Dealing with Missing Data](#Dealing-with-Missing-Data)
    * [Imputation of missing the values](#Imputation-of-the-missing-values)
        * [Numeric Data](#Numeric-Data)
        * [Text Data](#Text-Data)
* [Sorting the DataFrame columns](#Sorting-the-DataFrame-columns)
* [Dealing with Categorical Variables](#Dealing-with-Categorical-Variables)
    * [Properties](#Properties)
    * [Rooms](#Rooms)
    * [Beds](#Beds)
* [Exporting the Clean Data](#Exporting-the-Clean-Data)

In [2]:
Data = pd.read_csv('Data/Airbnb_Listings.csv', parse_dates=[2],encoding = "ISO-8859-1")
Data.head(3)

Unnamed: 0,id,scrape_id,last_scraped,name,picture_url,host_id,host_name,host_since,host_picture_url,street,...,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,1069266,20200000000000.0,2015-01-02,Stay like a real New Yorker!,https://a0.muscache.com/pictures/50276484/larg...,5867023,Michael,4/10/13,https://a2.muscache.com/ic/users/5867023/profi...,"East 53rd Street, New York, NY 10022, United S...",...,4/28/13,12/17/14,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
1,1846722,20200000000000.0,2015-01-02,Apartment 20 Minutes Times Square,https://a1.muscache.com/pictures/35865039/larg...,2631556,Denise,6/13/12,https://a2.muscache.com/ic/users/2631556/profi...,"West 155th Street, New York, NY, United States",...,1/5/14,12/29/14,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2
2,2061725,20200000000000.0,2015-01-02,Option of 2 Beds w Private Bathroom,https://a2.muscache.com/pictures/50650147/larg...,4601412,Miao,1/5/13,https://a0.muscache.com/ic/users/4601412/profi...,"Van Buren Street, Brooklyn, NY 11221, United S...",...,2/4/14,12/29/14,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4


In [3]:
print(Data.columns.tolist())

['id', 'scrape_id', 'last_scraped', 'name', 'picture_url', 'host_id', 'host_name', 'host_since', 'host_picture_url', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'city', 'state', 'zipcode', 'market', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_listing_count']


It seems that we don not need the following columns:

In [4]:
Drop_list=['scrape_id', 'last_scraped', 'name', 'picture_url','host_name','host_picture_url',\
           'street','neighbourhood','neighbourhood_cleansed','latitude', 'longitude', 'is_location_exact',\
           'square_feet','guests_included', 'extra_people','calendar_updated', 'calendar_last_scraped',\
          'host_listing_count', 'first_review', 'last_review','city','state','country','market']

In [5]:
Data.drop(columns=Drop_list, inplace = True)

Therefore,

In [6]:
Data.head().style.hide_index()

id,host_id,host_since,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,weekly_price,monthly_price,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
1069266,5867023,4/10/13,10022-4175,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$160.00,"$1,000.00",,3,14,21,51,72,322,62,86.0,9.0,7.0,9.0,9.0,10.0,9.0
1846722,2631556,6/13/12,,Apartment,Entire home/apt,10,1.0,3.0,3.0,Real Bed,$105.00,,,1,180,28,58,88,348,22,85.0,8.0,8.0,9.0,8.0,7.0,8.0
2061725,4601412,1/5/13,11221,Apartment,Private room,2,1.0,1.0,2.0,Real Bed,$58.00,,,3,30,4,13,26,227,35,98.0,10.0,10.0,10.0,10.0,9.0,10.0
44974,198425,8/11/10,10011,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$185.00,,"$3,400.00",10,30,1,1,1,274,26,96.0,10.0,9.0,10.0,10.0,10.0,9.0
4701675,22590025,10/15/14,10011,Apartment,Entire home/apt,2,1.0,1.0,2.0,Real Bed,$195.00,,,1,1125,30,60,90,365,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0


## Removing the dollar sign

In [7]:
Columns_list=Data.columns
Price_List=list()
for i in range(len(Columns_list)):
    if Columns_list[i].find('price') != -1:
        Price_List.append(Columns_list[i])
Data[Price_List].head()

Unnamed: 0,price,weekly_price,monthly_price
0,$160.00,"$1,000.00",
1,$105.00,,
2,$58.00,,
3,$185.00,,"$3,400.00"
4,$195.00,,


In [8]:
for i in Price_List:
    Data[i] = (Data[i].str.replace(r'[^-+\d.]', '').astype(float))
del i
Data[Price_List].head()

Unnamed: 0,price,weekly_price,monthly_price
0,160.0,1000.0,
1,105.0,,
2,58.0,,
3,185.0,,3400.0
4,195.0,,


## Dealing with Missing Data

Let's check the number of missing values

In [9]:
def Data_info(Inp, Only_NaN = False):
    Out = pd.DataFrame(Inp.dtypes,columns=['Data Type']).sort_values(by=['Data Type'])
    Out = Out.join(pd.DataFrame(Inp.isnull().sum(), columns=['Number of NaN Values']), how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out

In [10]:
Missing_Values = Data_info(Data, True)
Missing_Values

Unnamed: 0,Data Type,Number of NaN Values,Percentage
bathrooms,float64,463,1.69
bedrooms,float64,140,0.51
beds,float64,98,0.36
monthly_price,float64,17558,64.1
property_type,object,6,0.02
review_scores_accuracy,float64,8727,31.86
review_scores_checkin,float64,8729,31.87
review_scores_cleanliness,float64,8731,31.87
review_scores_communication,float64,8731,31.87
review_scores_location,float64,8732,31.88


### Imputation of the missing values

In [11]:
def hlg(inp):
    print(Back.CYAN +'"%s":' % inp)
    print(Style.RESET_ALL)

In [12]:
Num_list= Missing_Values.loc[(Missing_Values['Data Type'] == 'float64')].index.tolist()
Str_list= Missing_Values.loc[(Missing_Values['Data Type'] == 'object')].index.tolist()
hlg('float64 List:')
print(Num_list)
hlg('Object List:')
print(Str_list)

[46m"float64 List:":
[0m
['bathrooms', 'bedrooms', 'beds', 'monthly_price', 'review_scores_accuracy', 'review_scores_checkin', 'review_scores_cleanliness', 'review_scores_communication', 'review_scores_location', 'review_scores_rating', 'review_scores_value', 'weekly_price']
[46m"Object List:":
[0m
['property_type', 'zipcode']


#### Numeric Data

In [13]:
imp= SimpleImputer(missing_values=np.nan, strategy='mean')
for i in Num_list:
    temp=imp.fit_transform(Data[i].values.reshape(-1, 1))
    Data[i]=temp
    del temp

Note that now,

In [14]:
Temp=['accommodates','bedrooms','beds','price']
Drop_list=list()
for i in Temp:
    if len(Data[Data[i] == 0])>0:
        print('Number of zero availabe %s: %i' % (i,len(Data[Data[i] == 0])))
        Drop_list.append(i)

Number of zero availabe bedrooms: 2338


Droping these rows,

In [15]:
for i in Drop_list:
    Data = Data[Data[i] != 0.00]

Moreover, some columns only take integers as values. For these columns, we have,

In [16]:
Temp=['accommodates','bedrooms','beds']
Data[Temp]=Data[Temp].round(0).astype(int)

#### Text Data

In [17]:
Data_info(Data, True)

Unnamed: 0,Data Type,Number of NaN Values,Percentage
property_type,object,6,0.02
zipcode,object,148,0.59


Dropping the rows with NaN as zipcode

In [18]:
Data=Data.drop(list(Data[Data.zipcode.isna()==True].index))

Converting the zipcode to five digits

In [19]:
Data['zipcode'] = Data['zipcode'].str.replace(r'-\d+', '')

Note that

In [20]:
np.sort(Data.zipcode)

array(['10001', '10001', '10001', ..., '11694', '14072',
       '8456422473 call for more details'], dtype=object)

Thus,

In [21]:
Data=Data[Data.zipcode!='8456422473 call for more details']

Now,

In [22]:
np.sort(Data.zipcode)

array(['10001', '10001', '10001', ..., '11694', '11694', '14072'],
      dtype=object)

Now, we only have the following missing data

In [23]:
Data_info(Data, True)

Unnamed: 0,Data Type,Number of NaN Values,Percentage
property_type,object,6,0.02


Thus, we set this column to **NY**

As for **property type**, we have,

In [24]:
imp = SimpleImputer(strategy="most_frequent")
temp=imp.fit_transform(Data.property_type.values.reshape(-1, 1))
Data.property_type=temp
del temp

## Sorting the DataFrame columns

In [25]:
temp=list(set(Data.columns.tolist())-set(['id']))
temp.insert(0, 'id')
#Data = Data.reindex(columns=sorted(Data.columns))
Data = Data.reindex(columns=temp)
del temp

## Dealing with Categorical Variables

### Properties 

In [26]:
Properties_Dummies = pd.get_dummies(Data['property_type'])
Properties_Dummies.head().style.hide_index()

Apartment,Bed & Breakfast,Boat,Cabin,Camper/RV,Castle,Cave,Chalet,Dorm,Earth House,House,Hut,Lighthouse,Loft,Other,Tent,Treehouse,Villa
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
Properties_List=Properties_Dummies.columns
temp = Data.join(Properties_Dummies)
Data=temp
del temp, Properties_Dummies
Data=Data.drop(columns=['property_type'])

### Rooms

In [28]:
Rooms_Dummies = pd.get_dummies(Data['room_type'])
Rooms_Dummies.head().style.hide_index()

Entire home/apt,Private room,Shared room
1,0,0
0,1,0
1,0,0
1,0,0
1,0,0


In [29]:
Rooms_List=Rooms_Dummies.columns
temp = Data.join(Rooms_Dummies)
Data=temp
del temp, Rooms_Dummies
Data=Data.drop(columns=['room_type'])

### Beds

In [30]:
Beds_Dummies = pd.get_dummies(Data['bed_type'])
Beds_Dummies.head().style.hide_index()

Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1


In [31]:
Beds_List=Beds_Dummies.columns
temp = Data.join(Beds_Dummies)
Data=temp
del temp, Beds_Dummies
Data=Data.drop(columns=['bed_type'])

Creating a list Dictionary!

In [32]:
Lists_df = {'Properties_List': Properties_List, "Rooms_List":Rooms_List , "Beds_List": Beds_List}

## Exporting the Clean Data

Now, we have

In [33]:
Data.head().style.hide_index()

id,number_of_reviews,availability_90,review_scores_rating,accommodates,bedrooms,zipcode,review_scores_location,bathrooms,availability_30,availability_365,review_scores_checkin,minimum_nights,price,maximum_nights,availability_60,review_scores_communication,review_scores_value,host_id,monthly_price,host_since,review_scores_cleanliness,weekly_price,beds,review_scores_accuracy,Apartment,Bed & Breakfast,Boat,Cabin,Camper/RV,Castle,Cave,Chalet,Dorm,Earth House,House,Hut,Lighthouse,Loft,Other,Tent,Treehouse,Villa,Entire home/apt,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
1069266,62,72,86.0,2,1,10022,10.0,1.0,21,322,9.0,3,160.0,14,51,9.0,9.0,5867023,3054.316555,4/10/13,7.0,1000.0,1,9.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2061725,35,26,98.0,2,1,11221,9.0,1.0,4,227,10.0,3,58.0,30,13,10.0,10.0,4601412,3054.316555,1/5/13,10.0,953.795473,2,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
44974,26,1,96.0,2,1,10011,10.0,1.0,1,274,10.0,10,185.0,30,1,10.0,9.0,198425,3400.0,8/11/10,9.0,953.795473,1,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4701675,1,90,100.0,2,1,10011,10.0,1.0,30,365,10.0,1,195.0,1125,60,10.0,10.0,22590025,3054.316555,10/15/14,10.0,953.795473,2,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
68914,16,57,96.0,6,2,11231,10.0,1.0,11,287,10.0,2,165.0,365,33,9.0,9.0,343302,3054.316555,1/11/11,9.0,953.795473,3,10.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [34]:
Data.to_csv('Data/Clean_Airbnb_Listings.csv',index=False)

output = open('Data/Airbnb_Listings_Extra.pkl', 'wb')
pickle.dump(Lists_df, output)
output.close()

***