**Preprocessing Airbnb Calendar for Time Series Analysis**

# Introduction

## Read in libraries, data, and set notebook preferences

**Read in libraries**

In [42]:
#Read in libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**Read in Data**

In [43]:
#Directory on the local machine that holds the intermediate data files
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\02_Intermediate/'

#Load the cleaned Airbnb calendar: the first CSV column becomes the index,
#'date' is parsed as datetime and 'listing_id' is read as a categorical
calendar = pd.read_csv(
    path + '2020_0407_Calendar_Cleaned.csv',
    sep=',',
    index_col=0,
    parse_dates=['date'],
    dtype={'listing_id': 'category'},
    low_memory=True,
)

**Set notebook preferences**

In [44]:
#Apply the 'Solarize_Light2' matplotlib style to the plots in this notebook
plt.style.use('Solarize_Light2')

#Let Pandas display up to 100 rows before truncating output
pd.set_option('display.max_rows', 100)

#Render floats with two decimal places
pd.set_option('display.float_format', '{:.02f}'.format)

#Suppress FutureWarning messages
#(the warnings module is imported with the other libraries at the top of the notebook)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Preview Data

## Airbnb Calendar Data

In [45]:
#Report the dimensions of the calendar data, then show its first five rows
print(calendar.shape)
calendar.head(n=5)

(16010035, 3)


Unnamed: 0_level_0,available,listing_id,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-03,0,187730,80.0
2019-04-04,0,187730,80.0
2019-04-05,1,187730,82.0
2019-04-06,1,187730,82.0
2019-04-07,1,187730,81.0


# Data preparation and feature engineering

## Listings data for available and unavailable Airbnbs

In [46]:
#Count the listings for every (date, available) pair.
#The result is indexed by date and has the columns 'available' and 'count'.
listings_df = (
    calendar
    .groupby(['date', 'available'])['listing_id']
    .count()
    .rename('count')
    .reset_index(level='available')
)

#Check
display(listings_df.head())

Unnamed: 0_level_0,available,count
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-10-03,0,5897
2018-10-03,1,910
2018-10-04,0,5806
2018-10-04,1,1001
2018-10-05,0,5847


## Price data for available and unavailable Airbnbs

In [47]:
#Average the price for every (date, available) pair to get the daily mean price of
#available and unavailable listings.
#The result is indexed by date and has the columns 'available' and 'avg_price'.
prices_df = (
    calendar
    .groupby(['date', 'available'])['price']
    .mean()
    .reset_index(name='avg_price')
    .set_index('date')
)

#Check
display(prices_df.head())

Unnamed: 0_level_0,available,avg_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-10-03,0,
2018-10-03,1,384.78
2018-10-04,0,
2018-10-04,1,370.29
2018-10-05,0,


**Filter dates**

In [48]:
#Find the index (date) of the first non-missing avg_price among the available == 0 rows
#(we found this data issue in the EDA file)
unavailable_prices = prices_df.loc[prices_df['available'] == 0, 'avg_price']
first_valid_date = unavailable_prices.first_valid_index()
print('First non-missing Avg_Price:', first_valid_date)

#first_valid_index() returns None when every value is missing; stop with a clear
#message instead of failing on the comparison below
if first_valid_date is None:
    raise ValueError('avg_price is missing for every available == 0 row; cannot determine a start date')

#Filter: keep only rows dated on or after the first valid date. The computed value is
#used rather than a hard-coded date so the filter stays in step with the data.
prices_df = prices_df.loc[prices_df.index >= first_valid_date]

#Check
display(prices_df.head())

First non-missing Avg_Price: 2019-01-09 00:00:00


Unnamed: 0_level_0,available,avg_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-09,0,216.67
2019-01-09,1,224.35
2019-01-10,0,206.04
2019-01-10,1,213.5
2019-01-11,0,203.29


# Write out dataframes

In [49]:
# NOTE: every line in this cell is commented out, so running the cell writes nothing.
# Uncomment the lines below to export listings_df and prices_df as CSV files.

# #Set path to write listings
# path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\03_Processed/'

# #Write listings_df to path
# listings_df.to_csv(path +'2020_0411_Calendar_Listings.csv', sep=',')

# #Write prices_df to path
# prices_df.to_csv(path +'2020_0411_Calendar_Prices.csv', sep=',')