In [None]:
# Environment setup: install the category_encoders package via a shell escape,
# with pip's output suppressed by --quiet.
# NOTE(review): nothing in the cells visible here imports category_encoders --
# presumably it is used elsewhere; confirm, or drop this cell. If it stays,
# consider `%pip` with a pinned version so the install targets the kernel's
# environment and is reproducible.
!pip install category_encoders --quiet

In [50]:
import pandas as pd
import seaborn as sns

In [51]:
# Load the calendar file; `date` is parsed as a datetime, used as the index,
# and the rows are sorted by it.
cal_df = (
    pd.read_csv(
        '/content/drive/MyDrive/nyc_data/calendar.csv',
        index_col='date',
        parse_dates=['date'],
    )
    .sort_index()
)

In [68]:
# Load the listings file, parsing the three date-like columns as datetimes.
list_df = pd.read_csv(
    '/content/drive/MyDrive/nyc_data/listings.csv',
    parse_dates=['first_review', 'last_review', 'host_since'],
)

In [64]:
# Load the reviews file with its `date` column parsed as a datetime.
reviews_df = pd.read_csv(
    '/content/drive/MyDrive/nyc_data/reviews.csv',
    parse_dates=['date'],
)

In [65]:
# Clean the calendar dataframe.
#
# * `available` is recoded from the 'f'/'t' flags to 0/1.
# * `price` is stripped of its leading '$' and thousands separators, then
#   cast to float.
# * `adjusted_price` is dropped: it has a correlation of ~0.99 with `price`,
#   so it carries no extra signal. It is dropped directly instead of being
#   cleaned first, since converting a column that is about to be discarded is
#   wasted work.
#
# Each conversion is guarded on the column's dtype so the cell can be re-run
# safely: without the guards a second run would raise on `.str` (the column is
# already float) and would turn `available` into all-NaN.

if not pd.api.types.is_numeric_dtype(cal_df['available']):
    cal_df['available'] = cal_df['available'].map({'f' : 0, 't' : 1})

if not pd.api.types.is_numeric_dtype(cal_df['price']):
    cal_df['price'] = (cal_df['price']
                       .str.lstrip('$')
                       .str.replace(',', '', regex = False)  # literal comma, not a pattern
                       .astype(float))

# errors='ignore' keeps a re-run from failing once the column is already gone
cal_df = cal_df.drop(columns = 'adjusted_price', errors = 'ignore')

In [69]:
# Keep only the listing features that are both relevant and usable within an
# app, under shorter names: `id` -> `listing_id` (the key the calendar data is
# merged on) and `neighbourhood_group_cleansed` -> `borough`.
#
# The rename runs before the column selection so the cell is idempotent: on a
# re-run the rename is a no-op and the selection still finds its columns.
# (This assumes the raw listings file has no pre-existing `listing_id` or
# `borough` column -- TODO confirm.)
#
# The former 'review_scores_rating' -> 'review_score' mapping was removed:
# that column is never selected, so the entry silently did nothing.

list_df = list_df.rename(columns = {'id' : 'listing_id',
                                    'neighbourhood_group_cleansed' : 'borough'})

list_df = list_df[['listing_id', 'borough', 'latitude', 'longitude',
                   'room_type', 'accommodates', 'host_since']].copy()

In [70]:
# Trim the reviews dataframe down to the listing key, the review date and the
# free-text comment. The comments are kept because sentiment analysis may be
# incorporated into future models.

reviews_df = reviews_df.loc[:, ['listing_id', 'date', 'comments']].copy()

In [71]:
# Build the modelling frame: every calendar row is kept (left join) and
# enriched with its listing's features via `listing_id`. `date` is turned into
# a regular column for the merge and then restored as the index.

df_model = (
    cal_df
    .reset_index()
    .merge(list_df, on = 'listing_id', how = 'left')
    .set_index('date')
)

In [72]:
# Shorten the room-type labels.
#
# `.replace` is used rather than `.map` for two reasons:
#   * `.map` turns every value that is not a key of the dict into NaN, so an
#     unexpected room type would be lost silently; `.replace` leaves it as-is.
#   * `.replace` is idempotent -- re-running the cell leaves the already
#     shortened labels alone, whereas `.map` would wipe the column to NaN.
df_model['room_type'] = df_model['room_type'].replace({'Entire home/apt' : 'entire home',
                                                       'Private room' : 'room',
                                                       'Shared room' : 'shared room',
                                                       'Hotel room' : 'hotel room'})

# Day of the week of each calendar date, taken from the datetime index
# (pandas numbers these 0 = Monday ... 6 = Sunday).
df_model['day_of_week'] = df_model.index.dayofweek

In [84]:
# How long the host had been a host on each calendar date: the row's date
# (the index) minus the host's `host_since` date.
# NOTE(review): this is a timedelta column, not a number -- confirm that is
# the form wanted once the frame is written to CSV.
host_start = df_model['host_since']
df_model['time_as_host'] = df_model.index - host_start

In [89]:
# Write the modelling frame to disk. `date` is moved out of the index into an
# ordinary column first, so the row index itself is not written.
(
    df_model
    .reset_index()
    .to_csv('/content/drive/MyDrive/nyc_data/model_df.csv', index = False)
)