In [None]:
!pip install category_encoders

In [None]:
!pip install eli5

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, validation_curve
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce
import eli5
from eli5.sklearn import PermutationImportance


# In order to see all of the columns of the dataset we need to set the display options
# from the Pandas package to at least 100 (the dataset has 96 columns) and, for the rows,
# I set it to at least 100 which will help when I check for null values and dtypes.

pd.set_option('mode.chained_assignment', None) # Everytime I made a new column I would have a warning raised
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
# Importing the CSV 'listings_summary.csv' from the Kaggle dataset found at this
# URL: https://www.kaggle.com/brittabettendorf/berlin-airbnb-data

listings_summary = pd.read_csv('https://raw.githubusercontent.com/BuildWeekAirbnbOptimal2/Datascience/master/Berlin.csv')

In [3]:
# As stated above, there are 96 columns and over 20,000 observations

listings_summary.shape

(22552, 96)

In [4]:
# Checking the dtypes of the dataset...

# The goal of this project is to find the optimal price for an AirBnB in Belin, Germany so,
# the target variable will be the 'price' which is currently an object and therefore, will
# have to be dealt with appropriately.

listings_summary.dtypes

id                                    int64
listing_url                          object
scrape_id                             int64
last_scraped                         object
name                                 object
summary                              object
space                                object
description                          object
experiences_offered                  object
neighborhood_overview                object
notes                                object
transit                              object
access                               object
interaction                          object
house_rules                          object
thumbnail_url                       float64
medium_url                          float64
picture_url                          object
xl_picture_url                      float64
host_id                               int64
host_url                             object
host_name                            object
host_since                      

In [5]:
# Next we will check for the null values within the dataset - there are quite a few...

listings_summary.isna().sum()

id                                      0
listing_url                             0
scrape_id                               0
last_scraped                            0
name                                   59
summary                               963
space                                8532
description                           203
experiences_offered                     0
neighborhood_overview               11012
notes                               15337
transit                              9516
access                              11715
interaction                         12146
house_rules                         11103
thumbnail_url                       22552
medium_url                          22552
picture_url                             0
xl_picture_url                      22552
host_id                                 0
host_url                                0
host_name                              26
host_since                             26
host_location                     

In [6]:
# Calling the head of the dataset to visualize what the first row of observations looks like

listings_summary.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2015,https://www.airbnb.com/rooms/2015,20181107122246,2018-11-07,Berlin-Mitte Value! Quiet courtyard/very central,Great location! 30 of 75 sq meters. This wood...,A+++ location! This „Einliegerwohnung“ is an e...,Great location! 30 of 75 sq meters. This wood...,none,It is located in the former East Berlin area o...,"This is my home, not a hotel. I rent out occas...","Close to U-Bahn U8 and U2 (metro), Trams M12, ...","Simple kitchen/cooking, refrigerator, microwav...",Always available,"No parties No events No pets No smoking, not e...",,,https://a0.muscache.com/im/pictures/260fd609-7...,,2217,https://www.airbnb.com/users/show/2217,Ian,2008-08-18,"Key Biscayne, Florida, United States",Believe in sharing economy.,within an hour,96%,,t,https://a0.muscache.com/im/pictures/21428a22-4...,https://a0.muscache.com/im/pictures/21428a22-4...,Mitte,4.0,4.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"Berlin, Berlin, Germany",Mitte,Brunnenstr. Süd,Mitte,Berlin,Berlin,10119,Berlin,"Berlin, Germany",DE,Germany,52.534537,13.402557,f,Guesthouse,Entire home/apt,3,1.0,1.0,2.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",,$60.00,,,$200.00,$30.00,1,$28.00,4,1125,3 months ago,t,0,21,51,141,2018-11-07,118,2016-04-11,2018-10-28,93.0,10.0,9.0,10.0,10.0,10.0,9.0,t,,,f,f,strict_14_with_grace_period,f,f,4,3.76


In [7]:
listings_summary['neighbourhood_cleansed'].value_counts()

Tempelhofer Vorstadt                  1325
Frankfurter Allee Süd FK              1282
Alexanderplatz                        1091
Reuterstraße                          1002
Rixdorf                                880
                                      ... 
Malchow, Wartenberg und Falkenberg       3
Allende-Viertel                          3
Neu-Hohenschönhausen Süd                 2
Hellersdorf-Ost                          1
MV 2                                     1
Name: neighbourhood_cleansed, Length: 136, dtype: int64

In [8]:
# We can already tell later on we will have to drop a few columns where the cardinality for some
# object features, while finite, will be very high epecially in the case of URLs, names, reviews,
# descriptions, etc. so we will remove a few of them now and possibly later.

In [9]:
# First, we will use a for loop to check the number of unique values in each column.  This is acheived
# by taking the length of the value_counts of a column.

for col in listings_summary:
    print(f'There are/is {len(listings_summary[col].value_counts())} unique value(s) for column: {col}') if listings_summary[col].dtypes=='O' else print(None)

None
There are/is 22552 unique value(s) for column: listing_url
None
There are/is 2 unique value(s) for column: last_scraped
There are/is 21873 unique value(s) for column: name
There are/is 21041 unique value(s) for column: summary
There are/is 13598 unique value(s) for column: space
There are/is 21997 unique value(s) for column: description
There are/is 1 unique value(s) for column: experiences_offered
There are/is 10781 unique value(s) for column: neighborhood_overview
There are/is 6687 unique value(s) for column: notes
There are/is 12308 unique value(s) for column: transit
There are/is 9946 unique value(s) for column: access
There are/is 9584 unique value(s) for column: interaction
There are/is 10350 unique value(s) for column: house_rules
None
None
There are/is 22465 unique value(s) for column: picture_url
None
None
There are/is 19180 unique value(s) for column: host_url
There are/is 5997 unique value(s) for column: host_name
There are/is 2914 unique value(s) for column: host_since

In [10]:
listings_summary.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2015,https://www.airbnb.com/rooms/2015,20181107122246,2018-11-07,Berlin-Mitte Value! Quiet courtyard/very central,Great location! 30 of 75 sq meters. This wood...,A+++ location! This „Einliegerwohnung“ is an e...,Great location! 30 of 75 sq meters. This wood...,none,It is located in the former East Berlin area o...,"This is my home, not a hotel. I rent out occas...","Close to U-Bahn U8 and U2 (metro), Trams M12, ...","Simple kitchen/cooking, refrigerator, microwav...",Always available,"No parties No events No pets No smoking, not e...",,,https://a0.muscache.com/im/pictures/260fd609-7...,,2217,https://www.airbnb.com/users/show/2217,Ian,2008-08-18,"Key Biscayne, Florida, United States",Believe in sharing economy.,within an hour,96%,,t,https://a0.muscache.com/im/pictures/21428a22-4...,https://a0.muscache.com/im/pictures/21428a22-4...,Mitte,4.0,4.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"Berlin, Berlin, Germany",Mitte,Brunnenstr. Süd,Mitte,Berlin,Berlin,10119,Berlin,"Berlin, Germany",DE,Germany,52.534537,13.402557,f,Guesthouse,Entire home/apt,3,1.0,1.0,2.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,Gym,Heating,""Famil...",,$60.00,,,$200.00,$30.00,1,$28.00,4,1125,3 months ago,t,0,21,51,141,2018-11-07,118,2016-04-11,2018-10-28,93.0,10.0,9.0,10.0,10.0,10.0,9.0,t,,,f,f,strict_14_with_grace_period,f,f,4,3.76


In [11]:
# The first thing we will do is remove the object columns with high cardinality and features that are probably
# redundant like 'city' since this is the Berlin AirBnB dataset - 'zipcode' may be useful but neighbourhood could
# cover that.

high_cardin = ['listing_url', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview',
               'notes', 'transit', 'access', 'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
               'picture_url', 'xl_picture_url', 'host_url', 'host_name', 'host_about', 'host_thumbnail_url',
               'host_picture_url', 'host_verifications', 'street', 'city', 'state', 'zipcode', 'market',
               'smart_location', 'country_code', 'country', 'bed_type', 'amenities', 'weekly_price', 'monthly_price',
               'has_availability', 'calendar_last_scraped', 'requires_license', 'license', 'is_business_travel_ready',
               'require_guest_profile_picture', 'require_guest_phone_verification']

In [12]:
listings_df = listings_summary.drop(columns=high_cardin)

In [13]:
listings_df.isna().sum()

id                                    0
scrape_id                             0
last_scraped                          0
host_id                               0
host_since                           26
host_location                       116
host_response_time                12894
host_response_rate                12895
host_acceptance_rate              22552
host_is_superhost                    26
host_neighbourhood                 5094
host_listings_count                  26
host_total_listings_count            26
host_has_profile_pic                 26
host_identity_verified               26
neighbourhood                      1131
neighbourhood_cleansed                0
neighbourhood_group_cleansed          0
latitude                              0
longitude                             0
is_location_exact                     0
property_type                         0
room_type                             0
accommodates                          0
bathrooms                            32


In [14]:
# We will also remove columns that have many NaN values

high_na = ['host_response_time', 'host_response_rate', 'host_acceptance_rate', 'square_feet', 'jurisdiction_names']

Berlin_airbnb = listings_df.drop(columns=high_na)

In [15]:
Berlin_airbnb.dtypes

id                                  int64
scrape_id                           int64
last_scraped                       object
host_id                             int64
host_since                         object
host_location                      object
host_is_superhost                  object
host_neighbourhood                 object
host_listings_count               float64
host_total_listings_count         float64
host_has_profile_pic               object
host_identity_verified             object
neighbourhood                      object
neighbourhood_cleansed             object
neighbourhood_group_cleansed       object
latitude                          float64
longitude                         float64
is_location_exact                  object
property_type                      object
room_type                          object
accommodates                        int64
bathrooms                         float64
bedrooms                          float64
beds                              

In [16]:
# Next we will engineer some features based on the data

In [17]:
# Originally, the 'security_deposit' column would've been kept and replaced NaN values with the mean but,
# Since there are many NaN values we will make a binary feature stating '1' if they require a security deposit
# and '0' if the do not require one.


# TODO: drop Berlin_airbnb['security_deposit']
has_security_dep = []
for i in Berlin_airbnb['security_deposit']:
    if i==np.NaN:
        has_security_dep.append(0)
    else:
        has_security_dep.append(1)
Berlin_airbnb['require_security_deposit'] = np.array(has_security_dep).astype(int)

In [18]:
# We will do the same with cleaning fee and call it 'has_cleaning_service'...

# TODO: drop Berlin_airbnb['cleaning_fee']
has_cleaning = []
for i in Berlin_airbnb['cleaning_fee']:
    if i==np.NaN:
        has_cleaning.append(0)
    else:
        has_cleaning.append(1)
Berlin_airbnb['has_cleaning_service'] = np.array(has_cleaning).astype(int)

In [19]:
# Possible columns to impute or use for feature engineering

# review_scores_rating - mode = 100.00 (46 unique values between 50.00 and 100.00)
# review_scores_accuracy - mode = 10.0 (more than 50% of the data)
# review_scores_cleanliness - mode = 10.0
# review_scores_checkin - mode = 10.0 (more than 50% of the data)
# review_scores_communication - mode = 10.0 (more than 50% of the data)
# review_scores_location - mode = 10.0
# review_scores_value - mode = 10.0

In [20]:
# Next, we will get rid of the dollar signs and any commas that may be contained in the 'price'
# and 'extra_people' column by making a function that will strip the dollar sign ('$') from the
# array, remove the redundant '.00', and then remove commas for amounts 1000 or larger

def dollar_to_int(row):
    return row.strip('$')[:-3]
def no_comma(row):
    return row.replace(',','')

# To show it works...

amount = dollar_to_int('$1,300.00')
print(no_comma(amount))

1300


In [21]:
# Applying them to the dataset...

Berlin_airbnb['price'] = Berlin_airbnb['price'].apply(dollar_to_int).apply(no_comma).astype(int)
Berlin_airbnb['extra_people'] = Berlin_airbnb['extra_people'].apply(dollar_to_int).apply(no_comma)

In [22]:
Berlin_airbnb.shape

(22552, 52)

In [23]:
Berlin_airbnb = Berlin_airbnb.drop(columns=['security_deposit', 'cleaning_fee'])

In [24]:
# 'property_type', 'room_type', 'accommodates','bathrooms', 'bedrooms', 'beds', 'bed_type','price','number_of_reviews',('review_scores_value '),'instant_bookable','cancellation_policy','neighbourhood','host_identity_verified'

In [25]:
# Possibly useful: - Predicting 'PRICE'
# 1. neighbourhood
# 2. property type
# 3. room type
# 4. accommodates
# 5. bathrooms
# 6. bedrooms
# 7. beds
# 8. reviews_scores_value
# 9. instant_bookable
# 10. cancellation_policy
# 10. has_cleaning_service

### Columns we may go with
# 'property_type', 'room_type', 'accommodates','bathrooms', 'bedrooms', 'beds', 'bed_type','price','number_of_reviews',('review_scores_value '),'instant_bookable','cancellation_policy','neighbourhood','host_identity_verified'

In [26]:
Berlin_subset = Berlin_airbnb[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
                               'price', 'number_of_reviews', 'review_scores_value', 'instant_bookable',
                               'cancellation_policy', 'neighbourhood', 'host_identity_verified']]
Berlin_subset.head()

Unnamed: 0,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_value,instant_bookable,cancellation_policy,neighbourhood,host_identity_verified
0,Guesthouse,Entire home/apt,3,1.0,1.0,2.0,60,118,9.0,f,strict_14_with_grace_period,Mitte,t
1,Apartment,Private room,2,1.0,1.0,1.0,17,6,10.0,f,flexible,,t
2,Apartment,Entire home/apt,4,1.0,1.0,2.0,90,143,9.0,t,strict_14_with_grace_period,Prenzlauer Berg,t
3,Apartment,Private room,2,1.0,1.0,1.0,26,25,9.0,f,strict_14_with_grace_period,Schöneberg,t
4,Apartment,Private room,2,1.0,1.0,2.0,42,197,9.0,f,moderate,Prenzlauer Berg,t


In [27]:
###### We need to include why we are using these columns!! ######

# i.e. Why we chose to condense 'accommodates'

In [28]:
Berlin_subset.dtypes

property_type              object
room_type                  object
accommodates                int64
bathrooms                 float64
bedrooms                  float64
beds                      float64
price                       int32
number_of_reviews           int64
review_scores_value       float64
instant_bookable           object
cancellation_policy        object
neighbourhood              object
host_identity_verified     object
dtype: object

In [29]:
Berlin_subset['accommodates'].value_counts()

2     12228
4      3216
1      2734
3      2487
6       733
5       701
8       151
7       131
10       57
9        36
12       24
16       20
11       15
14       11
15        6
13        2
Name: accommodates, dtype: int64

In [30]:
# Minimizing the values for the accommodates column
# We will make them objects from 1-6 and then 7+

accommodate = []
for int in Berlin_subset['accommodates']:
    if int==1:
        accommodate.append('1')
    elif int==2:
        accommodate.append('2')
    elif int==3:
        accommodate.append('3')
    elif int==4:
        accommodate.append('4')
    elif int==5:
        accommodate.append('5')
    elif int==6:
        accommodate.append('6')
    elif int>=7:
        accommodate.append('7+')
    else:
        accommodate.append('')
set(accommodate)

{'1', '2', '3', '4', '5', '6', '7+'}

In [31]:
Berlin_subset['can_accommodate'] = np.array(accommodate)

In [32]:
bedrooms = []
for bed in Berlin_subset['bedrooms']:
    if bed==1.0:
        bedrooms.append('1')
    else:
        bedrooms.append('2+')
set(bedrooms)

{'1', '2+'}

In [33]:
Berlin_subset['n_bedrooms'] = np.array(bedrooms)

In [34]:
bathrooms = []
for bath in Berlin_subset['bathrooms']:
    if bath==1.0:
        bathrooms.append('1')
    else:
        bathrooms.append('2+')
set(bathrooms)

{'1', '2+'}

In [35]:
Berlin_subset['n_bathrooms'] = np.array(bathrooms)

In [36]:
beds = []
for bed in Berlin_subset['beds']:
    if bed==1.0:
        beds.append('1')
    else:
        beds.append('2+')
set(beds)

{'1', '2+'}

In [37]:
Berlin_subset['n_beds'] = np.array(beds)

In [38]:
def to_nbool(array):
    for i in array:
        if i=='t':
            return 1
        else:
            return 0

In [39]:
Berlin_subset['host_identity_verified'] = Berlin_subset['host_identity_verified'].dropna().apply(to_nbool)

In [40]:
Berlin_subset['instant_bookable'] = Berlin_subset['instant_bookable'].dropna().apply(to_nbool)

In [41]:
Berlin_subset['review_scores_value'] = Berlin_subset['review_scores_value'].replace(np.NaN, 0)

In [42]:
scores = []
for rating in Berlin_subset['review_scores_value']:
    if rating>=7.0:
        scores.append(rating)
    else:
        scores.append(0.0)
set(scores)

{0.0, 7.0, 8.0, 9.0, 10.0}

In [43]:
Berlin_subset['review_score'] = scores

In [44]:
Berlin = Berlin_subset.drop(columns=['review_scores_value'])

In [45]:
Berlin.shape

(22552, 17)

In [46]:
Berlin.head()

Unnamed: 0,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,instant_bookable,cancellation_policy,neighbourhood,host_identity_verified,can_accommodate,n_bedrooms,n_bathrooms,n_beds,review_score
0,Guesthouse,Entire home/apt,3,1.0,1.0,2.0,60,118,0,strict_14_with_grace_period,Mitte,1.0,3,1,1,2+,9.0
1,Apartment,Private room,2,1.0,1.0,1.0,17,6,0,flexible,,1.0,2,1,1,1,10.0
2,Apartment,Entire home/apt,4,1.0,1.0,2.0,90,143,1,strict_14_with_grace_period,Prenzlauer Berg,1.0,4,1,1,2+,9.0
3,Apartment,Private room,2,1.0,1.0,1.0,26,25,0,strict_14_with_grace_period,Schöneberg,1.0,2,1,1,1,9.0
4,Apartment,Private room,2,1.0,1.0,2.0,42,197,0,moderate,Prenzlauer Berg,1.0,2,1,1,2+,9.0


In [47]:
# Instead of reassigning these values to 'cancellation_policy' we will keep the original just in case we need it back

Berlin['cancel_policy'] = Berlin['cancellation_policy'].replace(('super_strict_30', 'super_strict_60', 'strict_14_with_grace_period'),'strict')

In [48]:
len(Berlin.columns)

18

In [49]:
Berlin.isnull().sum()

property_type                0
room_type                    0
accommodates                 0
bathrooms                   32
bedrooms                    18
beds                        40
price                        0
number_of_reviews            0
instant_bookable             0
cancellation_policy          0
neighbourhood             1131
host_identity_verified      26
can_accommodate              0
n_bedrooms                   0
n_bathrooms                  0
n_beds                       0
review_score                 0
cancel_policy                0
dtype: int64

In [50]:
Berlin['neighbourhood'] = Berlin['neighbourhood'].replace(np.NaN, 'TODO')

In [51]:
Berlin['neighbourhood'].value_counts()

Neukölln                3209
Prenzlauer Berg         2768
Kreuzberg               2661
Friedrichshain          2526
Mitte                   1943
Wedding                 1417
TODO                    1131
Schöneberg              1065
Moabit                   778
Charlottenburg           767
Wilmersdorf              483
Pankow                   281
Tempelhof                254
Rummelsburg              248
Weißensee                195
Lichtenberg              174
Tiergarten               163
Alt-Treptow              156
Steglitz                 153
Reinickendorf            139
Westend                  117
Friedenau                 96
Lichterfelde              92
Zehlendorf                82
Baumschulenweg            82
Friedrichsfelde           68
Britz                     61
Halensee                  60
Niederschönhausen         59
Fennpfuhl                 56
Potsdamer Platz           53
Karlshorst                52
Plänterwald               52
Köpenick                  51
Alt-Hohenschön

In [53]:
# Something we could do is split these neighbourhoods into North, South, East, and West Berlin.
# There are also 12 boroughs in Berlin that could be used.
# All in the name of Data Science........

boroughs = []
for boro in Berlin['neighbourhood']:
    
    Charlot = ['Charlottenburg-Nord', 'Schmargendorf', 'Grunewald', 'Halensee', 'Westend', 'Wilmersdorf', 'Charlottenburg']
    Fried = ['Kreuzberg', 'Friedrichshain']
    Licht = ['Falkenberg', 'Neu-Hohenschönhausen', 'Alt-Hohenschönhausen', 'Karlshorst', 'Fennpfuhl', 'Friedrichsfelde',
             'Lichtenberg', 'Rummelsburg']
    Marzahn = ['Kaulsdorf', 'Hellersdorf', 'Mahlsdorf', 'Biesdorf', 'Marzahn']
    Mitte = ['Hansaviertel', 'Potsdamer Platz', 'Tiergarten', 'Moabit', 'Wedding', 'Mitte']
    Neukolln = ['Gropuisstadt', 'Buckow', 'Rudow', 'Britz', 'Neukölln']
    Pankow = ['Rosenthal', 'Wilhelmsruh', 'Blankenburg', 'Heinersdorf', 'Buch', 'Karow', 'Französisch Buchholz',
              'Niederschönhausen', 'Weißensee', 'Pankow', 'Prenzlauer Berg']
    Reinick = ['Lübars', 'Konradshöhe', 'Waidmannslust', 'Märkisches Viertel', 'Heiligesee', 'Hermsdorf', 'Frohnau',
               'Wittenau', 'Tegel', 'Reinickendorf']
    Spandau = ['Haselhorst', 'Gatow', 'Falkenhagener', 'Siemensstadt', 'Staaken', 'Hakenfelde', 'Kladow',
               'Wilhelmstadt', 'Spandau']
    Steglitz = ['Wansee', 'Dahlem', 'Nikolassee', 'Lankwitz', 'Zehlendorf', 'Lichterfelde', 'Steglitz']
    Tempelhof = ['Marienfelde', 'Lichtenrade', 'Mariendorf', 'Friedenau', 'Tempelhof', 'Schöneberg']
    Treptow = ['Müggelheim', 'Bohnsdorf', 'Grünau', 'Schmökewite', 'Niederschöneweide', 'Altglienicke',
               'Johannesthal', 'Friedrichshagen', 'Adlershof', 'Rahnsdorf', 'Oberschöneweide',
               'Köpenick', 'Plänterwald', 'Baumschulenweg', 'Alt-Treptow']
    
    if boro in Charlot:
        boroughs.append('Charlottenburg-Wilmersdorf')
    elif boro in Fried:
        boroughs.append('Friedrichshain-Kreuzberg')
    elif boro in Licht:
        boroughs.append('Lichtenberg')
    elif boro in Marzahn:
        boroughs.append('Marzahn-Hellersdorf')
    elif boro in Mitte:
        boroughs.append('Mitte')
    elif boro in Neukolln:
        boroughs.append('Neukölln')
    elif boro in Pankow:
        boroughs.append('Pankow')
    elif boro in Reinick:
        boroughs.append('Reinickendorf')
    elif boro in Spandau:
        boroughs.append('Spandau')
    elif boro in Steglitz:
        boroughs.append('Steglitz-Zehlendorf')
    elif boro in Tempelhof:
        boroughs.append('Tempelhof-Schöneberg')
    elif boro in Treptow:
        boroughs.append('Treptow-Köpernick')
    else:
        boroughs.append('TODO')
set(boroughs)

{'Charlottenburg-Wilmersdorf',
 'Friedrichshain-Kreuzberg',
 'Lichtenberg',
 'Marzahn-Hellersdorf',
 'Mitte',
 'Neukölln',
 'Pankow',
 'Reinickendorf',
 'Spandau',
 'Steglitz-Zehlendorf',
 'TODO',
 'Tempelhof-Schöneberg',
 'Treptow-Köpernick'}

In [54]:
Berlin['boroughs'] = boroughs

In [55]:
Berlin['boroughs'] = Berlin['boroughs'].replace('TODO', np.NaN)

In [56]:
Berlin['host_identity_verified'].dtypes

dtype('float64')

In [57]:
# For 'host_identity_verified', it would make sense to fill the NA values with 0 (FALSE)
# We will also convert the column from float to integers(int64)

Berlin['host_identity_verified'] = Berlin['host_identity_verified'].replace(np.NaN, 0).astype('int64')

In [58]:
# Saving this column for backup

Berlin_cancel = Berlin['cancellation_policy']

In [59]:
Berlin.columns

Index(['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'number_of_reviews', 'instant_bookable',
       'cancellation_policy', 'neighbourhood', 'host_identity_verified',
       'can_accommodate', 'n_bedrooms', 'n_bathrooms', 'n_beds',
       'review_score', 'cancel_policy', 'boroughs'],
      dtype='object')

In [60]:
# Reassigning the Berlin dataframe to the columns we are checking out
#  'neighbourhood',

Berlin = Berlin[['property_type', 'room_type', 'price', 'cancellation_policy',
                 'instant_bookable', 'neighbourhood', 'host_identity_verified',
                 'accommodates', 'bedrooms', 'bathrooms', 'beds']]

In [61]:
Berlin.head()

Unnamed: 0,property_type,room_type,price,cancellation_policy,instant_bookable,neighbourhood,host_identity_verified,accommodates,bedrooms,bathrooms,beds
0,Guesthouse,Entire home/apt,60,strict_14_with_grace_period,0,Mitte,1,3,1.0,1.0,2.0
1,Apartment,Private room,17,flexible,0,TODO,1,2,1.0,1.0,1.0
2,Apartment,Entire home/apt,90,strict_14_with_grace_period,1,Prenzlauer Berg,1,4,1.0,1.0,2.0
3,Apartment,Private room,26,strict_14_with_grace_period,0,Schöneberg,1,2,1.0,1.0,1.0
4,Apartment,Private room,42,moderate,0,Prenzlauer Berg,1,2,1.0,1.0,2.0


In [62]:
Berlin.isna().sum()

property_type              0
room_type                  0
price                      0
cancellation_policy        0
instant_bookable           0
neighbourhood              0
host_identity_verified     0
accommodates               0
bedrooms                  18
bathrooms                 32
beds                      40
dtype: int64

In [63]:
non_na = Berlin[Berlin[['bedrooms', 'bathrooms', 'beds']].notna()]
non_na.shape

(22552, 11)

In [64]:
non_na.head()

Unnamed: 0,property_type,room_type,price,cancellation_policy,instant_bookable,neighbourhood,host_identity_verified,accommodates,bedrooms,bathrooms,beds
0,,,,,,,,,1.0,1.0,2.0
1,,,,,,,,,1.0,1.0,1.0
2,,,,,,,,,1.0,1.0,2.0
3,,,,,,,,,1.0,1.0,1.0
4,,,,,,,,,1.0,1.0,2.0


In [65]:
non_na['property_type'].value_counts()

Series([], Name: property_type, dtype: int64)

In [66]:
prop = []
for prop_type in non_na['property_type']:
    if prop_type=='Apartment' or prop_type=='Serviced apartment':
        prop.append('Apartment')
    else:
        prop.append('Non-apartment')
set(prop)

{'Non-apartment'}

In [67]:
non_na['Property_type'] = prop

In [68]:
non_na['Property_type'].value_counts()

Non-apartment    22552
Name: Property_type, dtype: int64

In [69]:
non_na['room_type'].value_counts()

Series([], Name: room_type, dtype: int64)

In [70]:
non_na['host_identity_verified'].value_counts()

Series([], Name: host_identity_verified, dtype: int64)

In [71]:
non_na['can_accommodate'].value_counts()

KeyError: 'can_accommodate'

In [None]:
non_na['n_bedrooms'].value_counts()

In [None]:
non_na['n_bathrooms'].value_counts()

In [None]:
non_na['n_beds'].value_counts()

In [None]:
non_na['cancel_policy'].value_counts()

In [None]:
non_na['price'].value_counts()

In [None]:
non_na.shape

In [None]:
# There are 157 observations (of 19k) with a price above $300
non_na[(non_na['price']>300)]['price']

In [None]:
non_na_new = non_na[non_na['price']<=300]
non_na_new.shape

In [None]:
# Ofer work starting here (James' notes between two asterisks **)

# Create Train/**Val/**Test **Incorporating random_state for reproducibility**
# **If we have time, we may want to try different values for test_size**

train, test = train_test_split(non_na_new, train_size=0.80, test_size=0.20, random_state=42)
train, val = train_test_split(train, train_size=0.80, test_size=0.20, random_state=42)

for df in train, val, test:
    print(df.shape)

In [None]:
# **Splitting up features and target then, X_train, y_train, etc.**
# 'neighbourhood',

features = ['room_type', 'instant_bookable', 'boroughs',
            'can_accommodate', 'n_bedrooms', 'n_bathrooms', 'n_beds',
            'cancel_policy']
target = 'price'

X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

In [None]:
# **Taking a baseline using 0 features and the mean of the target('price') column**
# How far off would we be if we used this average?

# Arrange y target vectors - target already assigned in cell above
y_train_mae = train[target]
y_val_mae = val[target]
y_test_mae = test[target]

print(f'Mean Baseline (using 0 features) and Price mean of: {round(y_train_mae.mean())}')
guess = round(y_train.mean()) # **so that the mean is an integer**

# Train Error
y_pred = [guess] * len(y_train_mae)
mae = mean_absolute_error(y_train_mae, y_pred)
print(f'Train Error for Berlin: ${mae:.2f}')

# Validate Error
y_pred = [guess] * len(y_val_mae)
mae = mean_absolute_error(y_val_mae, y_pred)
print(f'Train Error for Berlin: ${mae:.2f}')

# Test Error
y_pred = [guess] * len(y_test_mae)
mae = mean_absolute_error(y_test_mae, y_pred)
print(f'Test Error for Berlin: ${mae:.2f}')

In [None]:
# Encoding the categorical features using Ordinal Encoder which will start at 1
# for the first unique string and then count +1 for each new unique string

encoder = ce.OrdinalEncoder()

X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.fit_transform(X_val)
X_test_encoded = encoder.fit_transform(X_test)

# To see what we did...
X_train_encoded.head()

In [None]:
# Now check the data types...
X_train_encoded.dtypes

In [None]:
# **Making a function which will test the numeric features indivudually using LinearRegression**

model = LinearRegression()

def lr_feature_error(array):
    
    # Arranging X features matrices (already did y target vectors)
    X_train_lr = X_train_encoded[array].values.reshape(-1, 1)
    print(f'Linear Regression, dependent on: {array}')

    # Fit the model
    model.fit(X_train_lr, y_train)
    y_pred = model.predict(X_train_lr)
    mae = mean_absolute_error(y_train, y_pred)
    print(f'Train Error: {mae:.2f} percentage points \n')

In [None]:
for col in X_train_encoded:
    lr_feature_error(col)

In [None]:
# **Getting the intercepts**

# Fit the model to the data
model.fit(X_train_encoded, y_train)

# Intercepts and coefficients
print('Intercept', model.intercept_)
coefficients = pd.Series(model.coef_, features)
print(coefficients.to_string())

In [None]:
# Plotting these coefficients
coefficients = pd.Series(model.coef_, X_train_encoded.columns)
plt.figure(figsize=(5,10))
coefficients.sort_values().plot.barh(color='grey');

In [None]:
# **TL:DR  - Mainly for originally numeric columns**
# **For the Positive features, as the feature increases, so does the Price**
# **For the Negative features, as the feature decreases, so does the Price**

In [None]:
# Trying some ordinal encoding and LinearRegression() through a pipeline (sklearn.pipeline.make_pipeline)
# Ordinal Encoder will take string values, start at 1 and then count up for each new unique value
# After that we will standardize the data so they can be comparable

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    StandardScaler(),
    LinearRegression()
)
pipeline.fit(X_train, y_train)

# # The code below will get coefficients for LinearRegression()

lr = pipeline.named_steps['linearregression']
importances_lr = pd.Series(lr.coef_, X_train.columns)

In [None]:
# Plot of the scaled coefficients

plt.figure(figsize=(8,6))
plt.title('Feature Importance')
importances_lr.sort_values().plot.barh(color='grey');

In [None]:
# Lets try a RandomForestRegressor!

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    StandardScaler(),
    RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=42)
)
pipeline.fit(X_train, y_train)

# Get feature importances
rf = pipeline.named_steps['randomforestregressor']
importances = pd.Series(rf.feature_importances_, X_train.columns)

# Plot feature importances
%matplotlib inline

plt.figure(figsize=(8,6))
plt.title('Feature Importance')
importances.sort_values().plot.barh(color='grey');

In [None]:
# Fit on train, score on test
pipeline.fit(X_train, y_train)
y_pred_train = pipeline.predict(X_train)
y_pred_val = pipeline.predict(X_val)
y_pred_test = pipeline.predict(X_test)

rf = pipeline.named_steps['randomforestregressor']

# Print Results
print('Training R^2', pipeline.score(X_train, y_train))
print(f'Training MAE: {mean_absolute_error(y_train, y_pred_train)} dollars')
print('Validation R^2', pipeline.score(X_val, y_val))
print(f'Validation MAE: {mean_absolute_error(y_val, y_pred_val)} dollars')
print('Test R^2', pipeline.score(X_test, y_test))
print(f'Test MAE: {mean_absolute_error(y_test, y_pred_test)} dollars')

In [None]:
# Cross_val_score with SelectKBest

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    StandardScaler(),
    SelectKBest(f_regression, k='all'),
    Ridge()
)

k = 3
scores = cross_val_score(pipeline, X_train, y_train, cv=k,
                          scoring='neg_mean_absolute_error')
print(f'MAE for {k} folds:', -scores)
print('Mean score', -scores.mean())

In [None]:
# With RandomForestRegressor - The MAE is lower when using TargetEncoder

pipeline = make_pipeline(
#     ce.OrdinalEncoder(),
    ce.TargetEncoder(min_samples_leaf=1, smoothing=1),
#     StandardScaler(),
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
)

k = 3
scores = cross_val_score(pipeline, X_train, y_train, cv=k,
                          scoring='neg_mean_absolute_error')
print(f'MAE for {k} folds:', -scores)
print('Mean score', -scores.mean())

In [None]:
# Randomized Search with multiple parameter distributions

pipeline = make_pipeline(
    ce.TargetEncoder(),
    RandomForestRegressor()
)

param_distributions = {
    'targetencoder__min_samples_leaf': stats.randint(1, 1000),
    'targetencoder__min_samples_leaf': stats.uniform(1, 1000),
    'randomforestregressor__n_estimators': stats.randint(50, 500),
    'randomforestregressor__max_depth': [5, 10, 15, 20, None],
    'randomforestregressor__max_features': stats.uniform(0,1)
}

In [None]:
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=10,
    return_train_score=True,
    n_jobs=-1
)

search.fit(X_train, y_train);

In [None]:
print('Best hyperparameters:', search.best_params_)
print('Cross-Validation MAE:', -search.best_score_)

In [None]:
# Detailed Results
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')

In [None]:
pipeline = search.best_estimator_

In [None]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

y_pred = pipeline.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
print(f'Train R^2 Score: {pipeline.score(X_train, y_train)}')
print(f'Train MAE: ${mae:,.0f}')

y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Val R^2 Score: {pipeline.score(X_val, y_val)}')
print(f'Val MAE: ${mae:,.0f}')

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Test R^2 Score: {pipeline.score(X_test, y_test)}')
print(f'Test MAE: ${mae:,.0f}')

In [None]:
# KNeighborsRegressor

pipeline = make_pipeline(
    ce.TargetEncoder(),
    KNeighborsRegressor(n_neighbors=8,n_jobs=-1)
)

pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
print(f'Train R^2 Score: {pipeline.score(X_train, y_train)}')
print(f'Train MAE: ${mae:,.0f}')

In [None]:
pipeline = make_pipeline(
    ce.TargetEncoder(min_samples_leaf=1, smoothing=1),
    KNeighborsRegressor()
)

n_neighbors = range(1, 30, 1)
train_scores, val_scores = validation_curve(
    pipeline, X_train, y_train,
    param_name='kneighborsregressor__n_neighbors',
    param_range=n_neighbors,
    cv=3,
    n_jobs=-1
)

plt.figure(dpi=150)
plt.plot(n_neighbors, np.mean(train_scores, axis=1), color='blue', label='training error')
plt.plot(n_neighbors, np.mean(val_scores, axis=1), color='red', label='validation error')
plt.title('Validation Curve')
plt.xlabel('model complexity KNeighborsRegressor n_neighbors')
plt.ylabel('model score: R^2')
plt.legend()
plt.show()

In [None]:
pipeline = make_pipeline(
    ce.TargetEncoder(),
    KNeighborsRegressor()
)

param_distributions = {
    'targetencoder__min_samples_leaf': stats.randint(1, 1000),
    'targetencoder__min_samples_leaf': stats.uniform(1, 1000),
    'kneighborsregressor__n_neighbors': range(1, 50, 1),
    'kneighborsregressor__leaf_size': range(1, 50, 1),
    'kneighborsregressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'kneighborsregressor__weights': ['uniform', 'distance']
}

In [None]:
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=10,
    return_train_score=True,
    n_jobs=-1
)
search.fit(X_train, y_train)

In [None]:
print('Best hyperparameters "KNeighborsRegressor":', search.best_params_)
print('Cross-Validation R^2:', search.best_score_)

In [None]:
pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')

In [None]:
pipeline = search.best_estimator_

In [None]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

y_pred = pipeline.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
print(f'Train R^2 Score: {pipeline.score(X_train, y_train)}')
print(f'Train MAE: ${mae:,.0f}')

y_pred = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Val R^2 Score: {pipeline.score(X_val, y_val)}')
print(f'Val MAE: ${mae:,.0f}')

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Test R^2 Score: {pipeline.score(X_test, y_test)}')
print(f'Test MAE: ${mae:,.0f}')

In [None]:
# Get feature importances
knr = pipeline.named_steps['kneighborsregressor']
importances = pd.Series(knr.kneighbors, X_train.columns)
importances
knr.kneighbors_graph
# # Plot feature importances
# %matplotlib inline

# plt.figure(figsize=(8,6))
# plt.title('Feature Importance')
# importances.sort_values().plot.barh(color='grey');

In [None]:
#stripping NaN values
berlin_na_stripped = cancel_policy
berlin_na_stripped.shape


In [None]:
Berlin = berlin_na_stripped

In [None]:
# # Ofer's code begins here
# # Create Train/Test split:
# import pandas as pd
# from sklearn import datasets, linear_model
# from sklearn.model_selection import train_test_split
# from matplotlib import pyplot as plt

# # create training and testing vars

# X = Berlin.drop(columns='price')
# y = Berlin.price
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)


In [None]:
# # Get feature importances
# rf = pipeline.named_steps['randomforestregressor']
# importances = pd.Series(rf.feature_importances_, X_train.columns)

# # Plot feature importances
# %matplotlib inline
# import matplotlib.pyplot as plt

# plt.figure(figsize=(8,6))
# plt.title('Feature Importance')
# importances.sort_values().plot.barh(color='grey');

In [None]:
#try to graph it out
import plotly.express as px
px.scatter(Berlin, x='number_of_reviews', y= target)
#this shows the less reviews, the higher the price (this probably suggests that highly priced properties don't get booked much)

In [None]:
#try to graph it out
import seaborn as sns
sns.boxplot(y_train)

In [None]:
#throw some shapley values
# !pip install shap
# import shap

# X_train_encoded = encoder.transform(X_train)
# row = X_train_encoded

# explainer = shap.TreeExplainer(rf)
# shap_values = explainer.shap_values(row)

# shap.initjs()
# shap.force_plot(
# # shap.summary_plot( 
#     base_value=explainer.expected_value,
#     shap_values=shap_values,
#     features=row
# )

In [None]:
# # Feature Scaling
# from sklearn.preprocessing import StandardScaler

# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

In [None]:
# Arrange data into X features matrix and y target vector
target = 'price'

!pip install --upgrade category_encoders
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    # SimpleImputer(strategy='median'), 
    RandomForestRegressor(n_estimators=250, random_state=42, n_jobs=-1)
)

# Fit on train, score on test
pipeline.fit(X_train, y_train)
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

rf = pipeline.named_steps['randomforestregressor']
encoder = pipeline.named_steps['ordinalencoder']


# Print Results
print('Training R^2', pipeline.score(X_train, y_train))
print(f'Training MAE: {mean_absolute_error(y_train, y_pred_train)} dollars')
print('Validation R^2', pipeline.score(X_test, y_test))
print(f'Validation MAE: {mean_absolute_error(y_test, y_pred_test)} dollars')

In [None]:
X_test.head(4)

In [None]:

#I think this is predicting the first 4 rows prices? so, $32, $31, $113.9, $67 ?
#Can someone verify this? :)
y_pred = pipeline.predict(X_test[:3])
y_pred

In [None]:
#looking up the first 4 prices of y
y_test.head(4)
#So wait - I thought this was a 1 dimensional array
#what is the number on the left? on the right it's the price correct?