# Python notebook for analyzing AirBnB dataset from I2I #

### Importing modules ###

In [17]:
import pandas as pd
import numpy as np

import matplotlib.pylab as plt
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# init_notebook_mode(connected=True)
# iplot does not work for now so no inline plotting

### Reading in data ###

In [38]:
df = pd.read_csv('listings_with_prices.csv')
df = df.drop(['name', 'host_name', 'neighbourhood_group', 'latitude', 'longitude', 'Lower', 'Upper', 'SELECTIE', 'FILTER'], axis=1)
print(df.columns)
df.head()

Index(['id', 'host_id', 'neighbourhood', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'LABEL'],
      dtype='object')


Unnamed: 0,id,host_id,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,LABEL
0,1037832,5713208,IJburg - Zeeburgereiland,Entire home/apt,111,5,10,2016-06-05,26.0,1,344,3153 - 3784
1,1025835,3040855,IJburg - Zeeburgereiland,Private room,65,1,44,2016-06-19,391.0,4,126,3153 - 3784
2,12788723,69619241,IJburg - Zeeburgereiland,Private room,80,1,0,,,1,78,3153 - 3784
3,12560490,68081836,IJburg - Zeeburgereiland,Entire home/apt,200,2,0,,,1,27,3153 - 3784
4,666704,3040855,IJburg - Zeeburgereiland,Private room,55,1,145,2016-06-17,311.0,4,248,3153 - 3784


### Analyzing financial stuff ###

In order to anylyze the Return On Investment (ROI) we need two measures:

1) the investment. We have the m^2 price, however we do not know how big the apparments are. Also we have to include other costs like elektricity and gas. 

2) the return. We have the price for 1 night, however we do not know how many nights per year are rented. 

In [39]:
df = df.sort_values(by=['LABEL'])
labels = df.LABEL.unique()
labels = [label for label in labels if str(label) != 'nan']
values = [len(df[df.LABEL == label])  for label in labels]
    
trace1 = go.Bar(
    x=labels, 
    y=values, 
    name='Boxplot of location cost')

layout = go.Layout(
    title='Locations costs', 
    xaxis=dict(title='location cost'), 
    yaxis=dict(title='frequency'))

data = [trace1]
figure = go.Figure(data=data, layout=layout)

plot(figure, filename='barplot_cost.html')

'file:///home/johan/Dropbox/Documents/Code/AirBnB/barplot_cost.html'

In [40]:
def average_m2_prices(df, labels):
    df_new = df.copy()
    for i, label in enumerate(labels):
        label_list = str.split(label)
        if len(label_list) == 3:
            a, _, b = label_list
            df_new.loc[df_new.LABEL == label, 'LABEL'] = (int(a)+int(b))/2
    return df_new
            
            
dfROI = average_m2_prices(df, labels)
dfROI.head()

Unnamed: 0,id,host_id,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,LABEL
3181,5362748,27800005,Bijlmer-Oost,Private room,45,1,8,2016-06-29,68,1,193,2207.5
3236,3651182,17092248,Bijlmer-Oost,Entire home/apt,100,2,40,2016-06-28,192,1,347,2207.5
3235,12272272,12308050,Bijlmer-Oost,Private room,65,3,7,2016-06-10,269,2,355,2207.5
3234,12735030,52920463,Bijlmer-Oost,Entire home/apt,45,3,1,2016-06-28,1,1,6,2207.5
3233,12680964,56894783,Bijlmer-Oost,Private room,30,1,3,2016-06-27,148,1,0,2207.5


We want to see if we can group the prices by the labels, i.e. how much they cost. 

Seems to work but need statistical test to make sure it is a significant difference.

In [45]:
df_grouped_by_label = [df.loc[df.LABEL == label] for label in labels]

for i, label in enumerate(labels):
    df_temp = df_grouped_by_label[i]
    average = np.mean(df_temp.price)
    y1 = df_temp.price
    y2 = np.full(len(df_temp), average)
    x = np.arange(len(df_temp))

    trace1 = go.Scatter(
        x=x, 
        y=y1, 
        name='Boxplot of prices',
        mode='markers'
    )
    
    trace2 = go.Scatter(
        x=x, 
        y=y2, 
        name='Boxplot of prices'
    )

    layout = go.Layout(
        title='prices for {}'.format(label), 
        xaxis=dict(title='id'), 
        yaxis=dict(title='price'))

    data = [trace1,trace2]
    figure = go.Figure(data=data, layout=layout)

    plot(figure, filename='price_{}.html'.format(label))

In [50]:
for i, label in enumerate(labels):
    print(i, label)
    df_temp = df_grouped_by_label[i]
    df_temp = average_m2_prices(df_temp, labels)
    average = np.mean(df_temp.price)
    y = [average*i for i in range(365)]
    x = [df_temp.LABEL]
    print(x)

#     trace1 = go.Scatter(
#         x=x, 
#         y=y1, 
#         name='Boxplot of prices',
#         mode='markers'
#     )
    
#     trace2 = go.Scatter(
#         x=x, 
#         y=y2, 
#         name='Boxplot of prices'
#     )

#     layout = go.Layout(
#         title='prices for {}'.format(label), 
#         xaxis=dict(title='id'), 
#         yaxis=dict(title='price'))

#     data = [trace1,trace2]
#     figure = go.Figure(data=data, layout=layout)

#     plot(figure, filename='price_{}.html'.format(label))

0 1892 - 2523
[3181    2207.5
3236    2207.5
3235    2207.5
3234    2207.5
3233    2207.5
3232    2207.5
3231    2207.5
3230    2207.5
3229    2207.5
3237    2207.5
3228    2207.5
3226    2207.5
3225    2207.5
3224    2207.5
3223    2207.5
3222    2207.5
3221    2207.5
3220    2207.5
3219    2207.5
3227    2207.5
3238    2207.5
3239    2207.5
3240    2207.5
975     2207.5
976     2207.5
977     2207.5
978     2207.5
979     2207.5
980     2207.5
981     2207.5
         ...  
3193    2207.5
3198    2207.5
3197    2207.5
3196    2207.5
3195    2207.5
3194    2207.5
3190    2207.5
3202    2207.5
3192    2207.5
3204    2207.5
3214    2207.5
3212    2207.5
3203    2207.5
41      2207.5
3213    2207.5
40      2207.5
35      2207.5
3211    2207.5
32      2207.5
33      2207.5
39      2207.5
30      2207.5
3210    2207.5
3209    2207.5
3208    2207.5
3207    2207.5
3206    2207.5
3205    2207.5
31      2207.5
36      2207.5
Name: LABEL, Length: 88, dtype: object]
1 2523 - 3153



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



TypeError: invalid type comparison

Above we added the avg price per m^2 for houses in the corresponding area's. Now we need some measure for the expected size of the appartment. 

In [None]:
costs = [ for i in range(365)]

In [20]:
df['yearly_returns'] = df.price * 365
df = df.sort_values(by=['yearly_returns'])

values = np.log(df.yearly_returns)
labels = np.arange(len(df))

trace1 = go.Scatter(
    x=labels, 
    y=values, 
    name='Boxplot of yearly returns (365 days)',
    mode='markers'
)

layout = go.Layout(
    title='Locations costs', 
    xaxis=dict(title='id'), 
    yaxis=dict(title='log of yearly returns'))

data = [trace1]
figure = go.Figure(data=data, layout=layout)

plot(figure, filename='yearly_returns.html')

'file:///home/johan/Dropbox/Documents/Code/AirBnB/yearly_returns.html'

In [None]:


values = df.yearly_returns / df.
labels = np.arange(len(df))

trace1 = go.Scatter(
    x=labels, 
    y=values, 
    name='Boxplot of yearly profit (365 days)',
    mode='markers'
)

layout = go.Layout(
    title='ROI', 
    xaxis=dict(title='id'), 
    yaxis=dict(title='log of yearly returns'))

data = [trace1]
figure = go.Figure(data=data, layout=layout)

plot(figure, filename='yearly_returns.html')

### Analyzing different groups ###

In [5]:
labels = df.room_type.unique()
values = [len(df[df.room_type == label])  for label in labels]

trace1 = go.Bar(
    x=labels, 
    y=values, 
    name='Boxplot of location cost')

layout = go.Layout(
    title='Locations costs', 
    xaxis=dict(title='locations'), 
    yaxis=dict(title='frequency'))

data = [trace1]
figure = go.Figure(data=data, layout=layout)

plot(figure, filename='Location types.html')

'file:///home/johan/Dropbox/Documents/Code/AirBnB/Location types.html'