written by: Jana Vihs, vihsjana@student.hu-berlin.de, 604930
# Dear Jupyter Notebook Reader

Fancy seeing you here.

# Airbnb Price Predictor 

### Table of Contents
- Introduction
    - Meta Information
    - Tools 
        - Docker
        - DVC
- Exploratory Data Analysis
    - Numeric Features about the Airbnb 
    - Numeric Features about the Host
    - Text Data 
        - Reviews
    - Images 
- Feature Engineering 
    - Distance to City Center
    - Host since in years
    - Text Length
    - Sentiment Analysis
    - Images 
        - Colors and Brightness
- Feature Selection
    - Feature Importance 
    - Grid Search
- Benchmark Models
    - Multivariate Linear Regression
    - Neural Networks  
- Model Evaluation
- Final Method
    - Hyperparameter Tuning
- Conclusion and Outlook
- References 

# Introduction


In [1]:
# import all necessary packages 
# Standards 
import pandas as pd 
import numpy as np
import os 
import math

# Visualizations
import seaborn as sns
import folium
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import datetime
import warnings
warnings.filterwarnings('ignore')

# my little helpers
from src import helpers


## Meta Information

In [2]:
# Load the three raw CSV files, each indexed by its listing_id column
train, test, reviews = (
    helpers.read_df(csv_path, index_col='listing_id')
    for csv_path in (
        'data/raw/train.csv',
        'data/raw/test.csv',
        'data/raw/reviews.csv',
    )
)

In [3]:
# Run every frame through the project's dtype conversion to reduce memory use
train, test, reviews = map(helpers.change_data_types, (train, test, reviews))

In [None]:
# Merge reviews on train using listing_id
#train_review = train.merge(reviews, on='listing_id')

In [24]:
# Missing values per column in the test set
test.isna().sum()

name                              10
summary                         1577
space                           9057
description                      943
experiences_offered                0
neighborhood_overview          10551
transit                        10672
house_rules                    12580
picture_url                        0
host_id                            0
host_since                        65
host_response_time              9572
host_response_rate              9572
host_is_superhost                  0
host_total_listings_count         65
host_has_profile_pic               0
host_identity_verified             0
neighbourhood                     86
neighbourhood_cleansed             0
zipcode                          635
latitude                           0
longitude                          0
property_type                      0
room_type                          0
accommodates                       0
bathrooms                         50
bedrooms                          29
b

In [4]:
# Missing values per column in the training set
train.isna().sum()

name                              14
summary                         2954
space                          16881
description                     1726
experiences_offered                0
neighborhood_overview          19506
transit                        19807
house_rules                    23378
picture_url                        0
host_id                            0
host_since                       111
host_response_time             17802
host_response_rate             17802
host_is_superhost                  0
host_total_listings_count        111
host_has_profile_pic               0
host_identity_verified             0
neighbourhood                    147
neighbourhood_cleansed             0
zipcode                         1272
latitude                           0
longitude                          0
property_type                      0
room_type                          0
accommodates                       0
bathrooms                         70
bedrooms                          62
b

In [5]:
# Let's split the training set into three parts to make the analysis simpler.
# NOTE(review): presumably host-related, listing-related and review-score
# columns, judging by the target names -- confirm against helpers.split_df.
host, airbnb, review_scores = helpers.split_df(train)

In [23]:
# Number of listings that have a picture URL.
# Series.count() counts the non-missing entries directly, replacing the
# `isnull() != True` mask + boolean indexing + len() round trip.
airbnb.picture_url.count()

55284

In [10]:
# Missing values per column in the host subset
host.isna().sum()

host_id                          0
host_since                     111
host_response_time           17802
host_response_rate           17802
host_is_superhost                0
host_total_listings_count      111
host_has_profile_pic             0
host_identity_verified           0
dtype: int64

In [14]:
# Feature: length of each listing description in characters.
# The vectorised `.str.len()` leaves missing descriptions as NaN, whereas
# `apply(lambda x: len(x))` raises TypeError on them (len() of a float NaN).
airbnb['description_length'] = airbnb.description.str.len()

1000

# Exploratory Data Analysis

In [None]:
# Number of listings per neighbourhood.
# Create the 10x10 inch figure up front and draw onto its axes explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
# Pass the column as `x=`: positional data vectors were deprecated in
# seaborn 0.11 and are interpreted as `data` from 0.12 on.
sns.countplot(x=train['neighbourhood_cleansed'], palette="plasma", ax=ax)
ax.set_title('Neighbourhood Group')

In [None]:
# Create an interactive map with one circle marker per training listing
lonlat = list(zip(train.longitude, train.latitude))
mapit = folium.Map(location=[52.667989, -1.464582], zoom_start=6)
for lon, lat in lonlat:
    # folium expects [latitude, longitude]; the tuples in `lonlat` are
    # (longitude, latitude), so the two values are swapped here.
    # CircleMarker is used because `radius` and `fill_color` are circle
    # options; a plain Marker does not render them.
    folium.CircleMarker(
        location=[lat, lon],
        radius=8,
        fill=True,
        fill_color='#43d9de',
    ).add_to(mapit)

# Write the map to a standalone HTML file
mapit.save('map.html')