In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Airbnb Listings (Preprocessing)

#### Table of contents
* [Removing the dollar sign](#Removing-the-dollar-sign)
* [Dealing with Missing Data](#Dealing-with-Missing-Data)
    * [Imputation of missing values](#Imputation-of-missing-values)
        * [Numeric Data](#Numeric-Data)
        * [Text Data](#Text-Data)
* [Sorting the DataFrame columns](#Sorting-the-DataFrame-columns)
* [Dealing with Categorical Variables](#Dealing-with-Categorical-Variables)
    * [Properties](#Properties)
    * [Rooms](#Rooms)
    * [Beds](#Beds)
* [Exporting the Clean Data](#Exporting-the-Clean-Data)

In [2]:
Data = pd.read_csv('Data/Airbnb_Listings.csv', parse_dates=[2],encoding = "ISO-8859-1")
Data.head(3)

Unnamed: 0,id,scrape_id,last_scraped,name,picture_url,host_id,host_name,host_since,host_picture_url,street,...,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_listing_count
0,1069266,20200000000000.0,2015-01-02,Stay like a real New Yorker!,https://a0.muscache.com/pictures/50276484/larg...,5867023,Michael,4/10/13,https://a2.muscache.com/ic/users/5867023/profi...,"East 53rd Street, New York, NY 10022, United S...",...,4/28/13,12/17/14,86.0,9.0,7.0,9.0,9.0,10.0,9.0,1
1,1846722,20200000000000.0,2015-01-02,Apartment 20 Minutes Times Square,https://a1.muscache.com/pictures/35865039/larg...,2631556,Denise,6/13/12,https://a2.muscache.com/ic/users/2631556/profi...,"West 155th Street, New York, NY, United States",...,1/5/14,12/29/14,85.0,8.0,8.0,9.0,8.0,7.0,8.0,2
2,2061725,20200000000000.0,2015-01-02,Option of 2 Beds w Private Bathroom,https://a2.muscache.com/pictures/50650147/larg...,4601412,Miao,1/5/13,https://a0.muscache.com/ic/users/4601412/profi...,"Van Buren Street, Brooklyn, NY 11221, United S...",...,2/4/14,12/29/14,98.0,10.0,10.0,10.0,10.0,9.0,10.0,4


In [3]:
print(list(Data.columns))

['id', 'scrape_id', 'last_scraped', 'name', 'picture_url', 'host_id', 'host_name', 'host_since', 'host_picture_url', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'city', 'state', 'zipcode', 'market', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_listing_count']


It seems that we don not need the following columns:

In [4]:
Drop_list=['scrape_id', 'last_scraped', 'name', 'picture_url','host_name','host_picture_url',\
           'street','neighbourhood','neighbourhood_cleansed','latitude', 'longitude', 'is_location_exact',\
           'square_feet','guests_included', 'extra_people','calendar_updated', 'calendar_last_scraped',\
          'host_listing_count', 'first_review', 'last_review','city','state','country','market']

In [5]:
Data=Data.drop(columns=Drop_list)

Therefore,

In [6]:
Data.head().style.hide_index()

id,host_id,host_since,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,weekly_price,monthly_price,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
1069266,5867023,4/10/13,10022-4175,Apartment,Entire home/apt,2,1,1,1,Real Bed,$160.00,"$1,000.00",,3,14,21,51,72,322,62,86,9,7,9,9,10,9
1846722,2631556,6/13/12,,Apartment,Entire home/apt,10,1,3,3,Real Bed,$105.00,,,1,180,28,58,88,348,22,85,8,8,9,8,7,8
2061725,4601412,1/5/13,11221,Apartment,Private room,2,1,1,2,Real Bed,$58.00,,,3,30,4,13,26,227,35,98,10,10,10,10,9,10
44974,198425,8/11/10,10011,Apartment,Entire home/apt,2,1,1,1,Real Bed,$185.00,,"$3,400.00",10,30,1,1,1,274,26,96,10,9,10,10,10,9
4701675,22590025,10/15/14,10011,Apartment,Entire home/apt,2,1,1,2,Real Bed,$195.00,,,1,1125,30,60,90,365,1,100,10,10,10,10,10,10


## Removing the dollar sign

In [7]:
Columns_list=Data.columns
Price_List=list()
for i in range(len(Columns_list)):
    if Columns_list[i].find('price') != -1:
        Price_List.append(Columns_list[i])
Data[Price_List].head()

Unnamed: 0,price,weekly_price,monthly_price
0,$160.00,"$1,000.00",
1,$105.00,,
2,$58.00,,
3,$185.00,,"$3,400.00"
4,$195.00,,


In [8]:
for i in Price_List:
    Data[i] = (Data[i].str.replace(r'[^-+\d.]', '').astype(float))
del i
Data[Price_List].head()

Unnamed: 0,price,weekly_price,monthly_price
0,160.0,1000.0,
1,105.0,,
2,58.0,,
3,185.0,,3400.0
4,195.0,,


## Dealing with Missing Data

Let's check the number of missing values

In [9]:
def Missing_Values_fun(df):
    Missing_Values=pd.DataFrame(np.zeros(len(Columns_list), dtype=int),
                            index=Columns_list, columns=['Number_of_missing'])

    Missing_Values['Data_Type']=''
    # check the number of missing values in each individual column
    for i in Columns_list:
        Missing_Values.loc[i,'Number_of_missing']=len(df[i][df[i].isnull()])
        Missing_Values.loc[i,'Data_Type']=str(type(df[df.isna()==False][i][0]))
    
    Missing_Values=Missing_Values[Missing_Values.Number_of_missing > 0].sort_values(by='Number_of_missing', ascending=False)
    return Missing_Values

In [10]:
Missing_Values=Missing_Values_fun(Data)
Missing_Values

Unnamed: 0,Number_of_missing,Data_Type
monthly_price,17558,<class 'numpy.float64'>
weekly_price,15374,<class 'numpy.float64'>
review_scores_value,8734,<class 'numpy.float64'>
review_scores_location,8732,<class 'numpy.float64'>
review_scores_cleanliness,8731,<class 'numpy.float64'>
review_scores_communication,8731,<class 'numpy.float64'>
review_scores_checkin,8729,<class 'numpy.float64'>
review_scores_accuracy,8727,<class 'numpy.float64'>
review_scores_rating,8657,<class 'numpy.float64'>
bathrooms,463,<class 'numpy.float64'>


### Imputation of missing values

In [11]:
Num_list=list(Missing_Values[Missing_Values.Data_Type=="<class 'numpy.float64'>"].index)
Str_list=list(Missing_Values[Missing_Values.Data_Type=="<class 'str'>"].index)
Num_list, Str_list

(['monthly_price',
  'weekly_price',
  'review_scores_value',
  'review_scores_location',
  'review_scores_cleanliness',
  'review_scores_communication',
  'review_scores_checkin',
  'review_scores_accuracy',
  'review_scores_rating',
  'bathrooms',
  'bedrooms',
  'beds'],
 ['zipcode', 'property_type'])

#### Numeric Data

In [12]:
imp= SimpleImputer(missing_values=np.nan, strategy='mean')
for i in Num_list:
    temp=imp.fit_transform(Data[i].values.reshape(-1, 1))
    Data[i]=temp
    del temp

Note that now,

In [13]:
Temp=['accommodates','bedrooms','beds','price']
Drop_list=list()
for i in Temp:
    if len(Data[Data[i] == 0])>0:
        print('Number of zero availabe %s: %i' % (i,len(Data[Data[i] == 0])))
        Drop_list.append(i)

Number of zero availabe bedrooms: 2338


Droping these rows,

In [14]:
for i in Drop_list:
    Data = Data[Data[i] != 0.00]

Moreover, some columns only take integers as values. For these columns, we have,

In [15]:
Temp=['accommodates','bedrooms','beds']
Data[Temp]=Data[Temp].round(0).astype(int)

#### Text Data

In [16]:
Missing_Values_fun(Data)

Unnamed: 0,Number_of_missing,Data_Type
zipcode,148,<class 'str'>
property_type,6,<class 'str'>


Dropping the rows with NaN as zipcode

In [17]:
Data=Data.drop(list(Data[Data.zipcode.isna()==True].index))

Converting the zipcode to five digits

In [18]:
Data['zipcode'] = Data['zipcode'].str.replace(r'-\d+', '')

Note that

In [19]:
np.sort(Data.zipcode)

array(['10001', '10001', '10001', ..., '11694', '14072',
       '8456422473 call for more details'], dtype=object)

Thus,

In [20]:
Data=Data[Data.zipcode!='8456422473 call for more details']

Now,

In [21]:
np.sort(Data.zipcode)

array(['10001', '10001', '10001', ..., '11694', '11694', '14072'],
      dtype=object)

Now, we only have the following missing data

In [22]:
Missing_Values_fun(Data)

Unnamed: 0,Number_of_missing,Data_Type
property_type,6,<class 'str'>


In [23]:
#pd.DataFrame(Data.state.value_counts())

Thus, we set this column to **NY**

As for **property type**, we have,

In [24]:
imp = SimpleImputer(strategy="most_frequent")
temp=imp.fit_transform(Data.property_type.values.reshape(-1, 1))
Data.property_type=temp
del temp

In [25]:
#City_pd=pd.DataFrame(Data['city'].value_counts()[Data['city'].value_counts() == Data['city'].value_counts()])
#City_pd[City_pd.city>5]

## Sorting the DataFrame columns

In [26]:
temp=list(set(Data.columns.tolist())-set(['id']))
temp.insert(0, 'id')
#Data = Data.reindex(columns=sorted(Data.columns))
Data = Data.reindex(columns=temp)
del temp

## Dealing with Categorical Variables

### Properties 

In [27]:
Properties_Dummies = pd.get_dummies(Data['property_type'])
Properties_Dummies.head().style.hide_index()

Apartment,Bed & Breakfast,Boat,Cabin,Camper/RV,Castle,Cave,Chalet,Dorm,Earth House,House,Hut,Lighthouse,Loft,Other,Tent,Treehouse,Villa
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
Properties_List=Properties_Dummies.columns
temp = Data.join(Properties_Dummies)
Data=temp
del temp, Properties_Dummies
Data=Data.drop(columns=['property_type'])

### Rooms

In [29]:
Rooms_Dummies = pd.get_dummies(Data['room_type'])
Rooms_Dummies.head().style.hide_index()

Entire home/apt,Private room,Shared room
1,0,0
0,1,0
1,0,0
1,0,0
1,0,0


In [30]:
Rooms_List=Rooms_Dummies.columns
temp = Data.join(Rooms_Dummies)
Data=temp
del temp, Rooms_Dummies
Data=Data.drop(columns=['room_type'])

### Beds

In [31]:
Beds_Dummies = pd.get_dummies(Data['bed_type'])
Beds_Dummies.head().style.hide_index()

Airbed,Couch,Futon,Pull-out Sofa,Real Bed
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1
0,0,0,0,1


In [32]:
Beds_List=Beds_Dummies.columns
temp = Data.join(Beds_Dummies)
Data=temp
del temp, Beds_Dummies
Data=Data.drop(columns=['bed_type'])

Creating a list Dataframe

In [33]:
Lists_df=pd.DataFrame(np.zeros([max([len(Properties_List),len(Rooms_List),len(Beds_List)]),3], dtype=np.int),
                     columns=['Properties_List','Rooms_List','Beds_List'])
Lists_df.Properties_List[0:len(Properties_List)] = Properties_List
Lists_df.Rooms_List[0:len(Rooms_List)] = Rooms_List
Lists_df.Beds_List[0:len(Beds_List)] = Beds_List

## Exporting the Clean Data

Now, we have

In [34]:
Data.head().style.hide_index()

id,review_scores_rating,availability_30,host_id,review_scores_accuracy,availability_365,review_scores_value,minimum_nights,price,accommodates,maximum_nights,review_scores_location,availability_90,number_of_reviews,availability_60,review_scores_checkin,monthly_price,beds,host_since,bathrooms,bedrooms,review_scores_cleanliness,review_scores_communication,zipcode,weekly_price,Apartment,Bed & Breakfast,Boat,Cabin,Camper/RV,Castle,Cave,Chalet,Dorm,Earth House,House,Hut,Lighthouse,Loft,Other,Tent,Treehouse,Villa,Entire home/apt,Private room,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed
1069266,86,21,5867023,9,322,9,3,160,2,14,10,72,62,51,9,3054.32,1,4/10/13,1,1,7,9,10022,1000.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2061725,98,4,4601412,10,227,10,3,58,2,30,9,26,35,13,10,3054.32,2,1/5/13,1,1,10,10,11221,953.795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
44974,96,1,198425,10,274,9,10,185,2,30,10,1,26,1,10,3400.0,1,8/11/10,1,1,9,10,10011,953.795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4701675,100,30,22590025,10,365,10,1,195,2,1125,10,90,1,60,10,3054.32,2,10/15/14,1,1,10,10,10011,953.795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
68914,96,11,343302,10,287,9,2,165,6,365,10,57,16,33,10,3054.32,3,1/11/11,1,2,9,9,11231,953.795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [35]:
Data.to_csv('Data/Clean_Airbnb_Listings.csv',index=False)
Lists_df.to_csv('Data/Airbnb_Listings_Extra.csv',index=False)

***