In [None]:
!python --version

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.

In [None]:
listings_df = pd.read_csv('/kaggle/input/seattle/listings.csv')
calendar_df = pd.read_csv('/kaggle/input/seattle/calendar.csv')

In [None]:
# First look at both raw tables: dimensions, column dtypes / non-null counts, sample rows.
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
print(listings_df.shape)
listings_df.info()
print(listings_df.head())
print(calendar_df.shape)
calendar_df.info()
print(calendar_df.head())

In [None]:
# Narrow the listings table to the identifiers and listing/host attributes
# that the rest of the notebook works with.
listings_df_new = listings_df[[
    'id', 'host_id', 'price', 'city', 'neighbourhood',
    'host_since', 'host_is_superhost',
    'room_type', 'bed_type', 'beds', 'bedrooms', 'bathrooms',
    'minimum_nights', 'cancellation_policy', 'instant_bookable',
]]

In [None]:
# Join every calendar row to its listing (listings.id == calendar.listing_id).
# merge() defaults to an inner join; columns present in both tables (e.g. price)
# get the '_listing' / '_calendar' suffixes.
listing_calendar_df = listings_df_new.merge(calendar_df, left_on = 'id', right_on = 'listing_id', suffixes = ['_listing', '_calendar'])

In [None]:
# Same listings-to-calendar join as the one stored in listing_calendar_df.
# NOTE(review): no later cell visible in this notebook reads listing_calendar_v2_df —
# confirm whether this second full-size copy is still needed.
listing_calendar_v2_df = listings_df_new.merge(calendar_df, left_on = 'id', right_on = 'listing_id', suffixes = ['_listing', '_calendar'])

In [None]:
listing_calendar_df.info()

In [None]:
# The join keys are no longer needed once the two tables are merged.
listing_calendar_df = listing_calendar_df.drop(columns = ['id', 'listing_id'])
listing_calendar_df.info()

In [None]:
listing_calendar_df.isnull().sum()

# Fill Missing Values

A few columns contain missing values. Instead of dropping every row with a missing value, which would cause trouble for further analysis, I will impute the missing values. There are many common methods for missing-value imputation. However, after reviewing the dataset, I think the common methods are not the best options for this dataset. For example, the host_is_superhost column contains 730 missing values. Superhost is a status that AirBnB gives to hosts who provide excellent service and are approved by visitors; it is reviewed every 3 months. It is a flag provided by AirBnB, so if a host has no superhost value, it is highly likely that the host is not a superhost. Meanwhile, the records that have no superhost data have no host_since data either, so I assume these records come from new hosts. Therefore, filling all the missing values in the host_is_superhost column with 'f' is more appropriate than using a common imputation method. The same logic applies to the other missing-value imputations: use values that fit the real business situation rather than the usual mean, median, or mode imputation.

In [None]:
# Fold the alternative spellings of Seattle found in the city column into one label.
listing_calendar_df.loc[listing_calendar_df['city'].isin(['西雅图', 'seattle', 'Seattle ']), 'city'] = 'Seattle'

In [None]:
# Where the neighbourhood is missing, fall back to the city of the same row.
listing_calendar_df['neighbourhood'] = listing_calendar_df['neighbourhood'].fillna(listing_calendar_df['city'])

In [None]:
# A host with no superhost flag is treated as a non-superhost ('f').
listing_calendar_df['host_is_superhost'] = listing_calendar_df['host_is_superhost'].fillna('f')

In [None]:
# Earliest calendar date per host among the rows that have no host_since.
# NOTE(review): this expression is neither assigned nor the last line of the cell,
# so its result is not displayed or stored.
listing_calendar_df.loc[listing_calendar_df.host_since.isnull(), ].groupby('host_id').date.agg('min')
# Rows without host_since all receive the same fixed date.
# NOTE(review): '2016-01-04' is presumably the earliest date returned by the query above — confirm.
listing_calendar_df.loc[listing_calendar_df.host_since.isnull(), 'host_since'] = '2016-01-04'

In [None]:
# Calendar rows without a price inherit the price of their listing.
listing_calendar_df['price_calendar'] = listing_calendar_df['price_calendar'].fillna(listing_calendar_df['price_listing'])

In [None]:
# Sanity check: select the rows whose price_calendar is still null and show them.
# An empty result means every calendar price is now populated.
listing_calendar_df.loc[listing_calendar_df.price_calendar.isnull() > 0, "price_calendar"].isnull()

In [None]:
# The calendar price (back-filled from the listing price where it was missing) becomes
# the single `price` column.
# The previous un-assigned `listing_calendar_df.drop(['price_listing'], axis = 1)` call
# returned a new frame that was thrown away, so it had no effect and is removed here;
# `price_listing` is dropped together with `host_id` in a later cell.
listing_calendar_df = listing_calendar_df.rename(columns = {'price_calendar':'price'})
listing_calendar_df.info()

In [None]:
# A missing bedroom count is recorded as 0 bedrooms.
listing_calendar_df['bedrooms'] = listing_calendar_df['bedrooms'].fillna(0)

In [None]:
# Bed types of the rows that have no bed count.
# NOTE(review): not assigned and not the last line of the cell, so nothing is displayed.
listing_calendar_df.loc[listing_calendar_df.beds.isnull(), 'bed_type'].unique()
# A missing bed count is recorded as a single bed.
listing_calendar_df.loc[listing_calendar_df.beds.isnull(), 'beds'] = 1

In [None]:
# A missing bathroom count is recorded as 0 bathrooms.
listing_calendar_df['bathrooms'] = listing_calendar_df['bathrooms'].fillna(0)

In [None]:
listing_calendar_df.isnull().sum()

In [None]:
# host_id and the listing-level price were only needed for the imputation steps.
listing_calendar_df = listing_calendar_df.drop(columns = ['host_id', 'price_listing'])

# Convert data type

In [None]:
def data_conversion(df):
    '''
    Convert the text columns of a dataframe to dates, booleans or numbers.

    INPUT: df - pandas dataframe you want to convert data type
    OUTPUT: df - a new dataframe (the input frame is left untouched) that has the
    following characteristics:
    1. all the dates are datetime data type
       (text columns with a value starting with YYYY-MM-DD; a column whose values
       do not all parse with that format is kept unchanged)
    2. all the binary variables are bool type
       (text columns whose distinct values are exactly 't' and 'f')
    3. all the price variables are numeric
       (text columns containing '$'; '$' and ',' are stripped before parsing)
    Columns that are not text, or match none of the rules, are returned as they are.
    '''
    bl_convert = {'t': True, 'f': False}
    # Work on a copy so the caller's frame is not modified behind its back.
    df = df.copy()
    for col in df.columns:
        series = df[col]
        # Only text columns are candidates (object dtype or a dedicated string dtype).
        if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
            continue
        if series.str.match(r"[1-2][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]", na=False).sum() > 0:
            try:
                df[col] = pd.to_datetime(series, format='%Y-%m-%d')
            except (ValueError, TypeError):
                # At least one value does not fit the format: keep the column as text.
                pass
        elif set(series.unique()) == {'t', 'f'}:
            # Every value is 't' or 'f' (no missing values), so the cast to bool is safe.
            df[col] = series.map(bl_convert).astype(bool)
        elif series.str.contains("$", regex=False, na=False).sum() > 0:
            # regex=True must be explicit: pandas >= 2.0 treats the pattern literally by default.
            df[col] = pd.to_numeric(series.str.replace(r'[$,]', '', regex=True))
    return df

In [None]:
listing_calendar_df = data_conversion(listing_calendar_df)
listing_calendar_df.info()

In [None]:
def groupby_agg(df, groupby, agg):
    '''
    Group a dataframe and aggregate it, keeping the group keys as ordinary columns.

    INPUT: df - pandas dataframe to aggregate
           groupby - column name, or list of column names, to group by
           agg - aggregation spec passed straight to .agg(), e.g. a dictionary
                 mapping a column to an aggregation function or list of functions
    OUTPUT: the aggregated pandas dataframe; because as_index=False is used, the
            group keys are columns rather than the index
    '''
    grouped = df.groupby(groupby, as_index = False)
    return grouped.agg(agg)


# Availability Analysis

In [None]:
listing_calendar_avail_agg = {'available':'mean'}
listing_calendar_avail_df = groupby_agg(listing_calendar_df, 'date', listing_calendar_avail_agg)

In [None]:
g = sns.lineplot(x = 'date', y = 'available', data = listing_calendar_avail_df)
g.set_title("Seattle Airbnb Availability(%) Trend")

Availability kept increasing from January to April, and then demand for AirBnB housing increased until July. After that, demand for houses/apartments decreased.

In [None]:
listing_calendar_room_avail_gb = ['date', 'room_type']
listing_calendar_room_avail_agg = {'available':['count','sum', 'mean']}
listing_calendar_room_avail_df = groupby_agg(listing_calendar_df, listing_calendar_room_avail_gb, listing_calendar_room_avail_agg)

In [None]:
sns.lineplot(x=listing_calendar_room_avail_df['date'], y=listing_calendar_room_avail_df['available']['sum'], hue=listing_calendar_room_avail_df['room_type'])

The trend above shows that the entire home/apt and private room types share the same trend as total availability in the market. The trend for shared rooms, however, is consistently low, and this room type doesn't have much availability in the market. Based on this trend alone, we can't tell whether shared rooms are more popular in the Seattle market or whether there are simply fewer listings of this type than of the other two. This needs further analysis.

In [None]:
g = sns.lineplot(x=listing_calendar_room_avail_df['date'], y=listing_calendar_room_avail_df['available']['mean'], hue=listing_calendar_room_avail_df['room_type'])
g.set_title("Seattle Airbnb Availability(%) by Room Type")

The total number of Shared Room listings is lower than that of the other two types. After checking the availability percentage of the three types, all of them follow the same trend. The Shared Room type is the least popular in the Seattle market, while most people prefer the entire home/apt type.

In [None]:
listing_calendar_city_avail_gb = ['date','city']
listing_calendar_city_avail_agg = {'available':['count', 'sum', 'mean']}
listing_calendar_city_avail_df = groupby_agg(listing_calendar_df, listing_calendar_city_avail_gb, listing_calendar_city_avail_agg)

In [None]:
g = sns.lineplot(x = listing_calendar_city_avail_df['date'], y = listing_calendar_city_avail_df['available']['mean'], hue = listing_calendar_city_avail_df['city'])
g.set_title("Seattle Airbnb Availability(%) by City")

In [None]:
g = sns.barplot(x = listing_calendar_city_avail_df['city'], y = listing_calendar_city_avail_df['available']['count'])
g.set_yscale("log")

The city of Seattle has many more listings than the other cities. However, we need to consider that some listings are located in other areas, such as West Seattle, but put Seattle as the city on the listing. It is still worth checking the listings by city.

In [None]:
plt.figure(figsize=(16, 16))
listing_calendar_city_avail_df_v2 = listing_calendar_df.pivot_table(index='neighbourhood', columns='room_type', values='available', aggfunc=len)
sns.heatmap(listing_calendar_city_avail_df_v2, annot=True, fmt=".0f")

Seattle has the most listings of the entire home/apt and private room types. The communities of Capitol Hill, Belltown, and Ballard had a lot of listings as well. Overall, a visitor had more choices when planning to rent a home or a private room in these areas. The Capitol Hill community also provided a lot of shared room options.

In [None]:
plt.figure(figsize=(16, 16))
listing_calendar_city_avail_df_v2 = listing_calendar_df.pivot_table(index='neighbourhood', columns='room_type', values='available', aggfunc=np.mean)
sns.heatmap(listing_calendar_city_avail_df_v2, annot=True, fmt=".1f")

Comparing supply and demand, it is difficult to find an entire home/apt in North College Park and Licton Springs, while it is most difficult to find a shared room in the Lower Queen Anne and Eastlake communities.

AirBnB has a feature called Superhost. According to AirBnB, superhosts "are experienced hosts who provide a shining example for other hosts, and extraordinary experiences for their guests." Hosts become Superhosts if they meet certain criteria, including communication, commitment, guest satisfaction, and experience. Does the Superhost feature have any impact on availability?

In [None]:
listing_calendar_superhost_avail_gb = ['date', 'host_is_superhost']
listing_calendar_superhost_avail_agg = {'available':['sum', 'count', 'mean']}
listing_calendar_superhost_avail_df = groupby_agg(listing_calendar_df, listing_calendar_superhost_avail_gb, listing_calendar_superhost_avail_agg)

In [None]:
g = sns.lineplot(x = listing_calendar_superhost_avail_df['date'], y = listing_calendar_superhost_avail_df['available']['mean'], hue = listing_calendar_superhost_avail_df['host_is_superhost'])
g.set_title("Seattle Airbnb Availability(%) of Superhosts vs. Non-Superhosts")

Based on the trends above, there is no significant availability difference between superhosts and non-superhosts before 2016-07. In fact, the availability percentage of superhost listings is higher than that of non-superhost listings, which means that superhost listings had more available dates than non-superhost listings. The reason cannot be determined from the trends above. However, superhost listings became popular in the market after 2016-07.

In [None]:
listing_calendar_superhost_avail_gb_v2 = 'host_is_superhost'
listing_calendar_superhost_avail_agg_v2 = {'available':'mean'}
listing_calendar_superhost_avail_df_v2 = groupby_agg(listing_calendar_df, listing_calendar_superhost_avail_gb_v2, listing_calendar_superhost_avail_agg_v2)
listing_calendar_superhost_avail_df_v2

Overall, the average availability of superhosts' listings and non-superhosts' listings is close.

In [None]:
plt.figure(figsize=(16, 16))
listing_calendar_superhost_avail_df_v3 = listing_calendar_df.pivot_table(index='neighbourhood', columns='room_type', values='host_is_superhost', aggfunc=np.sum)
sns.heatmap(listing_calendar_superhost_avail_df_v3, annot=True, fmt=".0f")

Based on the heatmap above, it seems that superhosts provide more entire home/apt and private room listings in the Seattle market.

In [None]:
listing_calendar_superhost_avail_gb_v4 = ['host_is_superhost', 'room_type']
listing_calendar_superhost_avail_agg_v4 = {'available':'mean'}
listing_calendar_superhost_avail_df_v4 = groupby_agg(listing_calendar_df, listing_calendar_superhost_avail_gb_v4, listing_calendar_superhost_avail_agg_v4)
listing_calendar_superhost_avail_df_v4.sort_values('room_type')

In [None]:
g = sns.catplot(x = 'host_is_superhost', y = 'available', data = listing_calendar_superhost_avail_df_v4, kind = 'bar', col = 'room_type')

Although there is no significant difference in popularity between superhost and non-superhost listings, shared rooms from superhosts are more popular than those from non-superhosts.

In [None]:
listing_calendar_superhost_avail_gb_v5 = ['date', 'host_is_superhost', 'room_type']
listing_calendar_superhost_avail_agg_v5 = {'available':'mean'}
listing_calendar_superhost_avail_df_v5 = groupby_agg(listing_calendar_df, listing_calendar_superhost_avail_gb_v5, listing_calendar_superhost_avail_agg_v5)
listing_calendar_superhost_avail_df_v5.sort_values('room_type')

In [None]:
# One panel per room type, one availability line per host type over time.
# catplot treats x as categorical, so every calendar date became its own category
# (a strip plot with unreadable tick labels); relplot with kind='line' keeps date on a
# continuous axis and still returns a FacetGrid in `g`.
g = sns.relplot(x = 'date', y = 'available', hue = 'host_is_superhost', data = listing_calendar_superhost_avail_df_v5, col = 'room_type', kind = 'line')

Superhost is a great feature that gives a visitor a better chance of having a good experience. I assumed that listings from superhosts would be more popular than those of non-superhosts. However, the results don't support my assumption. Is there another factor that influences visitors' decisions and that they weigh ahead of finding a great host?

# Price Analysis

In [None]:
sns.distplot(listing_calendar_df.price, kde = False).set(ylabel = 'Frequency', title = 'Distribution of Price')

The distribution of price is right-skewed (a long tail toward high prices), and most listings are in the range 0-250.

In [None]:
plt.figure(figsize=(16, 6))
g = listing_calendar_df.groupby('date').price.mean().plot()
g.set_title("Seattle Airbnb Avg Price Trends")

The price trend follows demand. Demand for housing was high from April and stayed high through the summer.

In [None]:
# Relative change in mean price: dates on/after 2016-12-31 compared with dates on/before 2016-02-01.
price_mean_start = listing_calendar_df.loc[listing_calendar_df.date <= '2016-02-01', 'price'].mean()
price_mean_end = listing_calendar_df.loc[listing_calendar_df.date >= '2016-12-31', 'price'].mean()
(price_mean_end - price_mean_start) / price_mean_start

The average price in January 2017 was about 8% higher than a year earlier.

In [None]:
g = sns.boxplot(x = 'city', y = 'price', data = listing_calendar_df)
g.set_yscale("log")

The listings in Seattle had widely varying prices, while the prices of listings in the other cities are consistently in the range 0-250. Ballard has some lower-priced listings, and the Ridge Seattle has the lowest price range.

In [None]:
listing_calendar_room_price_gb = ['date', 'room_type']
listing_calendar_room_price_agg = {'price':'mean'}
listing_calendar_room_price_df = groupby_agg(listing_calendar_df, listing_calendar_room_price_gb, listing_calendar_room_price_agg)

In [None]:
listing_calendar_room_price_df.head()

In [None]:
plt.figure(figsize=(16, 6))
g = sns.lineplot(x = 'date', y = 'price', hue = 'room_type', data = listing_calendar_room_price_df)
g.set_yscale("log")
plt.legend(loc='lower right')
g.set_title("Seattle Airbnb Avg Price Trends by Room Type")

It makes sense that more private spaces would have higher price.

In [None]:
groupby_agg(listing_calendar_df, listing_calendar_room_price_gb, {'price':'std'})

In [None]:
listing_calendar_superhost_price_gb = ['date', 'host_is_superhost']
listing_calendar_superhost_price_agg = {'price':'mean'}
listing_calendar_superhost_price_df = groupby_agg(listing_calendar_df, listing_calendar_superhost_price_gb, listing_calendar_superhost_price_agg)

In [None]:
listing_calendar_superhost_price_df.head()

In [None]:
plt.figure(figsize=(16, 6))
g = sns.lineplot(x = 'date', y = 'price', hue = 'host_is_superhost', data = listing_calendar_superhost_price_df)
g.set_title("Seattle Airbnb Listing Price Trend by Host Type")

The average price of superhost listings is higher than that of non-superhost listings, but availability doesn't differ much. Can we assume that visitors are price-sensitive, and that many of them put price ahead of better service?

In [None]:
listing_calendar_superhost_price_gb_v2 = ['host_is_superhost','room_type']
listing_calendar_superhost_price_agg_v2 = {'price':'mean'}
listing_calendar_superhost_price_df_v2 = groupby_agg(listing_calendar_df, listing_calendar_superhost_price_gb_v2, listing_calendar_superhost_price_agg_v2)

In [None]:
g = sns.catplot(x = 'host_is_superhost', y = 'price', data = listing_calendar_superhost_price_df_v2, kind = 'bar', col = 'room_type')

In [None]:
listing_calendar_superhost_price_gb_v3 = ['date','host_is_superhost','room_type']
listing_calendar_superhost_price_agg_v3 = {'price':'mean'}
listing_calendar_superhost_price_df_v3 = groupby_agg(listing_calendar_df, listing_calendar_superhost_price_gb_v3, listing_calendar_superhost_price_agg_v3)

In [None]:
# One panel per room type, one average-price line per host type over time.
# catplot treats x as categorical, so every calendar date became its own category
# (a strip plot with unreadable tick labels); relplot with kind='line' keeps date on a
# continuous axis and still returns a FacetGrid in `g`.
g = sns.relplot(x = 'date', y = 'price', hue = 'host_is_superhost', data = listing_calendar_superhost_price_df_v3, col = 'room_type', kind = 'line')

Based on the trends above, the prices of private rooms are close between superhosts and non-superhosts, while for the entire home/apt and shared room types the superhost listings were more expensive than those of non-superhosts.

# Price vs Availability

In [None]:
plt.figure(figsize = (16,16))
g = sns.heatmap(listing_calendar_df[['host_is_superhost', 'beds', 'available', 'price']].corr())

In [None]:
listing_calendar_df[['host_is_superhost', 'beds', 'available', 'price']].corr()

Interestingly, price and availability don't have a strong relationship.

# Prediction Model

In [None]:
# Modelling frame: the response (price) plus the candidate predictors.
# An explicit copy is taken because the following cells add columns to this frame;
# assigning into a bare column selection of listing_calendar_df raises
# SettingWithCopyWarning and leaves it unclear which object is being modified.
listing_calendar_df_lm = listing_calendar_df[['price', 'available', 'date', 'city', 'neighbourhood', 'host_since', 'host_is_superhost', 'room_type',
                                             'bed_type', 'beds', 'bedrooms', 'bathrooms'
                                            , 'minimum_nights', 'cancellation_policy', 'instant_bookable']].copy()

In [None]:
# Host tenure, in whole days, on each calendar date.
# `.dt.days` replaces `.astype('timedelta64[D]')`: pandas 2.x only supports the
# s/ms/us/ns timedelta resolutions and raises on a cast to day resolution.
listing_calendar_df_lm['host_since_days'] = (listing_calendar_df_lm['date'] - listing_calendar_df_lm['host_since']).dt.days

In [None]:
#need to convert the date into category
listing_calendar_df_lm['month'] = pd.DatetimeIndex(listing_calendar_df_lm['date']).month
listing_calendar_df_lm['month_cat'] = listing_calendar_df_lm['month'].astype('category')

In [None]:
listing_calendar_df_lm['month_cat'].dtype

In [None]:
listing_calendar_df_lm = pd.get_dummies(listing_calendar_df_lm[['price', 'available', 'month_cat', 'city', 'neighbourhood', 'host_since_days', 'host_is_superhost', 'room_type',
                                             'bed_type', 'beds', 'bedrooms', 'bathrooms'
                                            , 'minimum_nights', 'cancellation_policy', 'instant_bookable']], prefix_sep='_')

In [None]:
listing_calendar_df_lm.head()

In [None]:
def clean_fit_linear_mod(df, response_col, test_size=.3, rand_state=123):
    '''
    Split the data into train/test sets, fit a linear regression on the training
    set and score it (r2) on both sets.

    INPUT:
    df - a dataframe holding all the variables of interest; every column other than
         response_col is used as a predictor
    response_col - a string holding the name of the response column
    test_size - a float between [0,1] about what proportion of data should be in the test dataset
    rand_state - an int that is provided as the random state for splitting the data into training and test

    OUTPUT:
    test_score - float - r2 score on the test data
    train_score - float - r2 score on the training data
    lm_model - fitted LinearRegression model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train test split
    '''

    X = df.drop(response_col, axis = 1)
    y = df[response_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = rand_state)
    # The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so passing it raises TypeError on current releases.
    # NOTE(review): for an unregularised least-squares fit, feature rescaling is not
    # expected to change the r2 scores — confirm against results from earlier runs.
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)
    train_score = r2_score(y_train, lm_model.predict(X_train))
    test_score = r2_score(y_test, lm_model.predict(X_test))
    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test

In [None]:
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(listing_calendar_df_lm, 'price')

In [None]:
print(test_score, train_score)