written by: Jana Vihs, vihsjana@student.hu-berlin.de, 604930
# Dear Jupyter Notebook Reader

Fancy seeing you here.

# Airbnb Price Predictor 

### Table of Contents
- Introduction
    - Meta Information
    - Tools 
        - Docker
        - DVC
- Exploratory Data Analysis
    - Numeric Features about the Airbnb 
    - Numeric Features about the Host
    - Text Data 
        - Reviews
    - Images 
- Feature Engineering 
    - Distance to City Center
    - Host since in years
    - Text Length
    - Sentiment Analysis
    - Images 
        - Colors and Brightness
- Feature Selection
    - Feature Importance 
    - Grid Search
- Benchmark Models
    - Multivariate Linear Regression
    - Neural Networks  
- Model Evaluation
- Final Method
    - Hyperparameter Tuning
- Conclusion and Outlook
- References 

# Introduction


In [12]:
# import all necessary packages 
# Standards 
import pandas as pd 
import numpy as np
import os 
import math

# Visualizations
import seaborn as sns
import folium
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import datetime
import warnings
warnings.filterwarnings('ignore')

# my little helpers
from src import helpers
from src import Textprocessor 
from src.Distance_Calculator import Distance_Calculator 



## Meta Information

In [3]:
# Load the train, test and reviews CSV files; `listing_id` is passed as the
# index column for each of them.
train, test, reviews = (
    helpers.read_df(csv_path, index_col='listing_id')
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/reviews.csv')
)

In [4]:
# Run every frame through the project's dtype conversion helper; the stated
# motivation is to reduce memory usage.
train, test, reviews = map(helpers.change_data_types, (train, test, reviews))

In [4]:
# Pair each listing's coordinates with one fixed point of interest (POI).
# NOTE(review): the POI coordinates lie in central London - presumably the
# "city center" reference point; confirm.
dist_calc = Distance_Calculator()
poi_kwargs = dict(lat_poi=51.510067, long_poi=-0.133869)
longlat = dist_calc.zip_objects(train, **poi_kwargs)

# Display the resulting frame as the cell output.
longlat

Unnamed: 0_level_0,longitude,latitude,longPoi,latPoi,originCoordinates,poiCoordinates
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9Q1RD6H7IO,-0.01062,51.474571,-0.133869,51.510067,"(51.474571228027344, -0.010619999840855598)","(51.510067, -0.133869)"
6XDPZPGLSR,-0.12416,51.583290,-0.133869,51.510067,"(51.583290100097656, -0.12415999919176102)","(51.510067, -0.133869)"
SND9OPN6IY,-0.18567,51.416340,-0.133869,51.510067,"(51.41633987426758, -0.18567000329494476)","(51.510067, -0.133869)"
SE3B7BXN9G,-0.14137,51.456219,-0.133869,51.510067,"(51.45621871948242, -0.1413699984550476)","(51.510067, -0.133869)"
E0C82VK0VF,-0.11219,51.481140,-0.133869,51.510067,"(51.48114013671875, -0.11219000071287155)","(51.510067, -0.133869)"
...,...,...,...,...,...,...
HSY40NEHOI,-0.10048,51.470779,-0.133869,51.510067,"(51.47077941894531, -0.10047999769449234)","(51.510067, -0.133869)"
XZXB1NS63Y,-0.00913,51.533409,-0.133869,51.510067,"(51.533409118652344, -0.00913000013679266)","(51.510067, -0.133869)"
Z2T8G6D82D,-0.14177,51.574631,-0.133869,51.510067,"(51.57463073730469, -0.14177000522613525)","(51.510067, -0.133869)"
8B1Q8UJWLH,-0.10193,51.565861,-0.133869,51.510067,"(51.565860748291016, -0.10192999988794327)","(51.510067, -0.133869)"


In [5]:
# Distance from every listing to the point of interest.
# Iterate the two coordinate columns in lockstep instead of re-slicing the
# frame and materialising a row Series for every single index: the values
# handed to get_distance are the same, but the per-row overhead disappears.
longlat['dist'] = [
    dist_calc.get_distance(originCoordinates=origin, poiCoordinates=target)
    for origin, target in zip(longlat['originCoordinates'], longlat['poiCoordinates'])
]

In [None]:
# Attach the reviews to their listings by joining on the shared `listing_id` key
# (pandas' default inner join).
train_review = pd.merge(train, reviews, on='listing_id')

In [5]:
# Split the training data into three thematic frames (host, listing and
# review scores) to keep the analysis simpler.
split_frames = helpers.split_df(train)
host, airbnb, review_scores = split_frames

In [None]:
# Peek at the amenities of the first listing.
# Use explicit positional access: an integer key on a label-indexed Series
# only works through pandas' deprecated positional fallback.
# NOTE(review): assumes `airbnb` keeps the string `listing_id` index of `train` - confirm.
airbnb.amenities.iloc[0]


In [None]:
# Peek at the description of the seventh listing.
# Use explicit positional access: an integer key on a label-indexed Series
# only works through pandas' deprecated positional fallback.
# NOTE(review): assumes `airbnb` keeps the string `listing_id` index of `train` - confirm.
airbnb.description.iloc[6]

In [None]:
# Show the host rows whose `host_has_profile_pic` value is missing.
host.loc[host['host_has_profile_pic'].isna()]

In [15]:
# Keep only hosts with a known sign-up date. The explicit copy makes `clean`
# an independent frame, so the column assignments below do not write onto a
# slice of `host` (the resulting pandas warning was hidden by the global
# warnings filter).
clean = host[host.host_since.notnull()].copy()

# Calendar year in which the host joined.
clean['host_since_year'] = clean['host_since'].apply(lambda joined: joined.year)

# Membership length as the difference between the current calendar year and
# the sign-up year (whole years only; month and day are not considered).
clean['host_memship_in_years'] = datetime.date.today().year - clean.host_since_year

In [16]:
# Display the cleaned host frame, including the two derived columns
# `host_since_year` and `host_memship_in_years`.
clean

Unnamed: 0_level_0,host_id,host_since,host_response_time,host_response_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,host_since_year,host_memship_in_years
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9Q1RD6H7IO,2167992,2012-04-18,within an hour,100%,True,1.0,True,False,2012,9
6XDPZPGLSR,16148175,2014-05-29,within a few hours,100%,True,1.0,True,True,2014,7
SND9OPN6IY,272000199,2019-06-28,within an hour,100%,False,2.0,True,False,2019,2
SE3B7BXN9G,3361233,2012-08-24,within a few hours,100%,False,1.0,True,False,2012,9
E0C82VK0VF,94317022,2016-09-09,within an hour,100%,True,2.0,True,False,2016,5
...,...,...,...,...,...,...,...,...,...,...
HSY40NEHOI,28354624,2015-02-25,,,False,2.0,True,False,2015,6
XZXB1NS63Y,207619004,2018-08-05,,,False,1.0,True,False,2018,3
Z2T8G6D82D,137970479,2017-07-01,within an hour,100%,False,4.0,True,False,2017,4
8B1Q8UJWLH,64339219,2016-03-24,,,False,1.0,True,True,2016,5


# Exploratory Data Analysis

In [None]:
# Bar chart of the number of listings per neighbourhood.
# The column is passed explicitly as `x`: recent seaborn versions accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=train['neighbourhood_cleansed'], palette="plasma")

# Enlarge the current figure so the bars have room.
fig = plt.gcf()
fig.set_size_inches(10, 10)
plt.title('Neighbourhood Group')

In [None]:
# Create an interactive map with one marker per training listing and save it as HTML.
# `lonlat` holds (longitude, latitude) tuples, in that order.
lonlat = list(zip(train.longitude, train.latitude))
mapit = folium.Map( location=[52.667989, -1.464582], zoom_start=6 )
for coord in lonlat:
    lon, lat = coord
    # folium expects locations as [latitude, longitude]; passing the tuple in
    # its stored (longitude, latitude) order would put every marker in the wrong place.
    # NOTE(review): `fill_color` and `radius` look like CircleMarker options and
    # probably have no visual effect on a plain Marker - confirm and switch if intended.
    folium.Marker( location=[ lat, lon ], fill_color='#43d9de', radius=8 ).add_to( mapit )

mapit.save( 'map.html')