In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm, trange

# Display configuration for pandas output in this notebook.
# Wide console width so wide frames are not wrapped.
pd.set_option('display.width', 1920)
# Floats are rendered right-aligned in a 20-character field, with thousands
# separators and two decimals.
pd.set_option('display.float_format', '{:20,.2f}'.format)
# Disable truncation of rows, columns and cell contents. `display.max_rows`
# is set exactly once here; always slice or `.head()` frames before
# displaying them, since nothing will be elided automatically.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Seed NumPy's legacy global RNG so any random steps are repeatable.
np.random.seed(42)
# Render matplotlib figures at a higher resolution.
plt.rcParams['figure.dpi'] = 150

In [2]:
# Read the cleaned listings table (path is relative to the notebook's
# working directory) into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='./cleaned_nybnb.csv')

In [3]:
# Print a summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14672 entries, 0 to 14671
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Host Since                      14672 non-null  object 
 1   Host Listings Count             14672 non-null  int64  
 2   Host Total Listings Count       14672 non-null  int64  
 3   Calculated host listings count  14672 non-null  int64  
 4   Latitude                        14672 non-null  float64
 5   Longitude                       14672 non-null  float64
 6   Accommodates                    14672 non-null  int64  
 7   Bathrooms                       14672 non-null  int64  
 8   Bedrooms                        14672 non-null  int64  
 9   Beds                            14672 non-null  int64  
 10  Price                           14672 non-null  float64
 11  Weekly Price                    14672 non-null  float64
 12  Monthly Price                   

In [4]:
# Convert the date columns (loaded from the CSV as strings) to datetime64.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent-format inference is already the default.
for date_col in ('Host Since', 'First Review', 'Last Review'):
    df[date_col] = pd.to_datetime(df[date_col])

In [5]:
# Preview the first five rows after the date conversion.
df.head()

Unnamed: 0,Host Since,Host Listings Count,Host Total Listings Count,Calculated host listings count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Number of Reviews,First Review,Last Review,Reviews per Month,Review Scores Rating,Property Type_Building,Property Type_Unit,Room Type_Entire home/apt,Room Type_Private room,Room Type_Shared room,Bed Type_Other,Bed Type_Real Bed,Cancellation Policy_flexible,Cancellation Policy_moderate,Cancellation Policy_strict
0,2016-06-12,1,1,1,40.74,-74.0,2,1,1,1,110.0,770.0,3300.0,200.0,75.0,1,0.0,8,1125,3,2016-06-30,2017-04-06,0.29,90.0,0,1,1,0,0,0,1,1,0,0
1,2013-08-31,1,1,1,40.74,-74.0,2,1,0,1,120.0,840.0,3600.0,120.0,75.0,1,0.0,3,1125,12,2015-09-09,2017-04-10,0.6,98.0,0,1,1,0,0,0,1,0,0,1
2,2010-09-15,1,1,1,40.75,-74.01,2,1,1,1,199.0,1393.0,5970.0,600.0,120.0,1,25.0,6,12,14,2011-07-09,2017-04-13,0.2,97.0,0,1,1,0,0,0,1,0,0,1
3,2013-10-28,1,1,1,40.74,-74.0,3,1,0,1,180.0,1260.0,3400.0,0.0,0.0,1,0.0,2,1125,17,2014-09-15,2016-09-18,0.53,92.0,0,1,1,0,0,0,1,0,1,0
4,2014-03-10,1,1,1,40.74,-74.0,1,1,1,1,165.0,952.0,3360.0,0.0,75.0,1,75.0,1,8,35,2014-05-27,2016-03-24,0.98,96.0,0,1,0,1,0,0,1,0,0,1


In [6]:
# Host_Time: whole days between each host's start date and the most recent
# "Host Since" date present in the data (used as the reference point).
latest_host_since = df["Host Since"].max()
df["Host_Time"] = (latest_host_since - df["Host Since"]).dt.days

# Review Time Span: whole days between a listing's first and last review.
review_span = df["Last Review"] - df["First Review"]
df["Review Time Span"] = review_span.dt.days

In [7]:
# Preview the first five rows, now including the two derived duration columns.
df.head()

Unnamed: 0,Host Since,Host Listings Count,Host Total Listings Count,Calculated host listings count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Number of Reviews,First Review,Last Review,Reviews per Month,Review Scores Rating,Property Type_Building,Property Type_Unit,Room Type_Entire home/apt,Room Type_Private room,Room Type_Shared room,Bed Type_Other,Bed Type_Real Bed,Cancellation Policy_flexible,Cancellation Policy_moderate,Cancellation Policy_strict,Host_Time,Review Time Span
0,2016-06-12,1,1,1,40.74,-74.0,2,1,1,1,110.0,770.0,3300.0,200.0,75.0,1,0.0,8,1125,3,2016-06-30,2017-04-06,0.29,90.0,0,1,1,0,0,0,1,1,0,0,319,280
1,2013-08-31,1,1,1,40.74,-74.0,2,1,0,1,120.0,840.0,3600.0,120.0,75.0,1,0.0,3,1125,12,2015-09-09,2017-04-10,0.6,98.0,0,1,1,0,0,0,1,0,0,1,1335,579
2,2010-09-15,1,1,1,40.75,-74.01,2,1,1,1,199.0,1393.0,5970.0,600.0,120.0,1,25.0,6,12,14,2011-07-09,2017-04-13,0.2,97.0,0,1,1,0,0,0,1,0,0,1,2416,2105
3,2013-10-28,1,1,1,40.74,-74.0,3,1,0,1,180.0,1260.0,3400.0,0.0,0.0,1,0.0,2,1125,17,2014-09-15,2016-09-18,0.53,92.0,0,1,1,0,0,0,1,0,1,0,1277,734
4,2014-03-10,1,1,1,40.74,-74.0,1,1,1,1,165.0,952.0,3360.0,0.0,75.0,1,75.0,1,8,35,2014-05-27,2016-03-24,0.98,96.0,0,1,0,1,0,0,1,0,0,1,1144,667


In [8]:
# 2_147_483_647 is the largest signed 32-bit integer; it is mapped to 0 here.
# NOTE(review): presumably this value is a "no limit" sentinel in the source
# data — confirm that 0 is the intended encoding for it.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['Maximum Nights']`: that chained
# in-place call operates on an intermediate Series and does not update `df`
# when pandas Copy-on-Write is active (and warns in pandas 2.x).
df['Maximum Nights'] = df['Maximum Nights'].replace(2_147_483_647, 0)

In [9]:
# Binary flag: 1 when the review score rating is strictly above 95, else 0.
is_high_score = df["Review Scores Rating"].gt(95)
df["High Review Score"] = is_high_score.astype(np.uint8)

In [10]:
# Drop the raw date columns and the raw rating; the derived columns
# (Host_Time, Review Time Span, High Review Score) stay in the frame.
dropped_columns = [
    "Host Since",
    "First Review",
    "Last Review",
    "Review Scores Rating",
]
normalized_df = df.drop(columns=dropped_columns)
normalized_df.head()

Unnamed: 0,Host Listings Count,Host Total Listings Count,Calculated host listings count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Number of Reviews,Reviews per Month,Property Type_Building,Property Type_Unit,Room Type_Entire home/apt,Room Type_Private room,Room Type_Shared room,Bed Type_Other,Bed Type_Real Bed,Cancellation Policy_flexible,Cancellation Policy_moderate,Cancellation Policy_strict,Host_Time,Review Time Span,High Review Score
0,1,1,1,40.74,-74.0,2,1,1,1,110.0,770.0,3300.0,200.0,75.0,1,0.0,8,1125,3,0.29,0,1,1,0,0,0,1,1,0,0,319,280,0
1,1,1,1,40.74,-74.0,2,1,0,1,120.0,840.0,3600.0,120.0,75.0,1,0.0,3,1125,12,0.6,0,1,1,0,0,0,1,0,0,1,1335,579,1
2,1,1,1,40.75,-74.01,2,1,1,1,199.0,1393.0,5970.0,600.0,120.0,1,25.0,6,12,14,0.2,0,1,1,0,0,0,1,0,0,1,2416,2105,1
3,1,1,1,40.74,-74.0,3,1,0,1,180.0,1260.0,3400.0,0.0,0.0,1,0.0,2,1125,17,0.53,0,1,1,0,0,0,1,0,1,0,1277,734,0
4,1,1,1,40.74,-74.0,1,1,1,1,165.0,952.0,3360.0,0.0,75.0,1,75.0,1,8,35,0.98,0,1,0,1,0,0,1,0,0,1,1144,667,1


In [11]:
from pickle import dump

from sklearn.preprocessing import StandardScaler

# Columns to standardize. The one-hot indicator columns and the
# "High Review Score" flag are not listed and are left as they are.
# The list is defined once so the selection on both sides of the assignment
# below cannot drift apart.
columns_to_scale = [
    "Host Listings Count",
    "Host Total Listings Count",
    "Calculated host listings count",
    "Latitude",
    "Longitude",
    "Accommodates",
    "Bathrooms",
    "Bedrooms",
    "Beds",
    "Price",
    "Weekly Price",
    "Monthly Price",
    "Security Deposit",
    "Cleaning Fee",
    "Guests Included",
    "Extra People",
    "Minimum Nights",
    "Maximum Nights",
    "Number of Reviews",
    "Reviews per Month",
    "Host_Time",
    "Review Time Span",
]

# Fit the scaler on the selected columns and overwrite them with the
# standardized values.
scaler = StandardScaler()
normalized_df[columns_to_scale] = scaler.fit_transform(normalized_df[columns_to_scale])

# Persist the fitted scaler. The context manager guarantees the file handle
# is flushed and closed (the bare `open(...)` passed to `dump` was never closed).
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

normalized_df[:20]

Unnamed: 0,Host Listings Count,Host Total Listings Count,Calculated host listings count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Number of Reviews,Reviews per Month,Property Type_Building,Property Type_Unit,Room Type_Entire home/apt,Room Type_Private room,Room Type_Shared room,Bed Type_Other,Bed Type_Real Bed,Cancellation Policy_flexible,Cancellation Policy_moderate,Cancellation Policy_strict,Host_Time,Review Time Span,High Review Score
0,-0.08,-0.08,-0.27,-0.62,-1.27,-0.51,-0.24,-0.14,-0.56,-0.47,-0.45,-0.41,0.37,0.5,-0.49,-0.62,0.31,0.0,-0.57,-0.57,0,1,1,0,0,0,1,1,0,0,-1.23,-0.29,0
1,-0.08,-0.08,-0.27,-0.51,-1.15,-0.51,-0.24,-1.62,-0.56,-0.38,-0.37,-0.32,-0.06,0.5,-0.49,-0.62,-0.07,0.0,-0.3,-0.41,0,1,1,0,0,0,1,0,0,1,0.36,0.31,1
2,-0.08,-0.08,-0.27,-0.3,-1.47,-0.51,-0.24,-0.14,-0.56,0.3,0.31,0.37,2.52,1.43,-0.49,0.38,0.16,-0.04,-0.24,-0.62,0,1,1,0,0,0,1,0,0,1,2.05,3.41,1
3,-0.08,-0.08,-0.27,-0.51,-1.18,0.07,-0.24,-1.62,-0.56,0.14,0.15,-0.38,-0.71,-1.06,-0.49,-0.62,-0.14,0.0,-0.15,-0.44,0,1,1,0,0,0,1,0,1,0,0.27,0.63,0
4,-0.08,-0.08,-0.27,-0.6,-1.27,-1.08,-0.24,-0.14,-0.56,0.01,-0.23,-0.39,-0.71,0.5,-0.49,2.38,-0.22,-0.04,0.39,-0.2,0,1,0,1,0,0,1,0,0,1,0.06,0.49,1
5,-0.08,-0.08,-0.27,-0.47,-0.98,0.65,-0.24,-0.14,0.49,1.98,1.99,2.07,-0.71,-1.06,-0.49,-0.62,-0.07,0.0,-0.06,-0.36,0,1,1,0,0,0,1,0,1,0,1.53,0.69,0
6,-0.08,-0.08,-0.27,-0.32,-1.36,0.07,-0.24,-0.14,-0.56,-0.21,-0.19,-0.15,0.1,-0.02,-0.49,-0.62,-0.14,-0.04,-0.63,-0.56,0,1,1,0,0,0,1,0,0,1,-0.07,-0.86,1
7,-0.08,-0.08,-0.27,-0.37,-1.05,1.23,-0.24,1.34,1.54,3.32,3.32,3.41,1.72,1.02,2.42,0.38,-0.22,0.0,0.03,-0.24,0,1,1,0,0,0,1,0,0,1,0.32,0.61,0
8,-0.08,-0.08,-0.27,-0.59,-1.03,-0.51,-0.24,-0.14,-0.56,0.73,0.74,0.8,-0.71,0.6,-0.49,-0.62,-0.07,-0.03,0.24,-0.47,0,1,1,0,0,0,1,0,1,0,1.39,2.83,1
9,-0.08,-0.08,-0.27,-0.48,-1.2,-0.51,-0.24,-0.14,-0.56,0.14,0.15,0.2,0.64,0.6,-0.49,-0.62,-0.22,0.0,-0.09,-0.05,0,1,1,0,0,0,1,0,0,1,0.52,-0.03,1


In [12]:
# Write the standardized frame to disk without the row index.
normalized_df.to_csv(path_or_buf="normalized_nybnb.csv", index=False)