## Summary

## Introduction

In [1]:
# imports

from lightgbm import LGBMRegressor
import numpy as np
import sklearn
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor
import vegafusion
import altair as alt
import altair_ally as aly
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import pandas as pd
from scipy.stats import uniform, randint
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


## Data

The dataset used for this project comes from Kaggle and is available at the following link: [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data). It comprises 16 columns/variables and a substantial volume of data, totaling 48,895 rows. The available information includes details about the properties listed on Airbnb, their geographical location, price, and room type. Airbnb could use this sort of model to predict how popular future listings might be before they are posted, perhaps to help guide hosts in creating more appealing listings.

In [10]:
# Read the raw listings CSV, then hold out 40% of the rows as a test set
# (fixed random_state so the split is reproducible).
DATA_PATH = "data/AB_NYC_2019.csv"
SPLIT_KWARGS = {"test_size": 0.4, "random_state": 123}

df = pd.read_csv(DATA_PATH, encoding="utf-8")
train_df, test_df = train_test_split(df, **SPLIT_KWARGS)
train_df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
17877,14010200,"Scandinavean design in Crown Heights, BK",1683437,Boram,Brooklyn,Crown Heights,40.66477,-73.95060,Entire home/apt,89,5,9,2018-05-28,0.25,1,0
14638,11563821,Private bedroom located in the heart of Chelsea,10307134,Anna,Manhattan,Chelsea,40.74118,-74.00012,Private room,110,1,48,2019-06-16,1.80,2,67
7479,5579629,Lovely sunlit room in Brooklyn,329917,Clémentine,Brooklyn,Greenpoint,40.72905,-73.95755,Private room,53,2,5,2016-10-21,0.13,1,0
47058,35575853,"Great view, 1 BR right next to Central Park!",35965489,Meygan,Manhattan,East Harlem,40.79755,-73.94797,Private room,100,2,0,,,1,7
9769,7509362,Great BIG Upper West Side Apartment,29156329,Andrew,Manhattan,Upper West Side,40.80120,-73.96382,Private room,87,3,9,2018-09-19,0.19,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7763,5885201,SUNNY ROOM A IN CHARMING AREA :),4291007,Graham And Ben,Brooklyn,Bedford-Stuyvesant,40.69363,-73.95980,Private room,95,30,40,2019-06-01,0.80,11,331
15377,12325045,IDEAL One bedroom apt by Central Park!,66501870,K Alexandra,Manhattan,Midtown,40.76016,-73.96910,Entire home/apt,139,2,132,2019-06-30,3.66,1,154
17730,13915004,"Sunlit, spacious NY apartment",7177483,Dani,Manhattan,Harlem,40.80380,-73.95569,Entire home/apt,250,3,10,2019-01-01,0.28,1,0
28030,21897845,One room.,159769278,Musieka,Bronx,Pelham Gardens,40.86706,-73.84674,Private room,40,2,17,2019-06-04,1.23,1,17


### EDA

In [4]:
# Summarize each column's dtype and non-null count for the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29337 entries, 17877 to 15725
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              29337 non-null  int64  
 1   name                            29328 non-null  object 
 2   host_id                         29337 non-null  int64  
 3   host_name                       29325 non-null  object 
 4   neighbourhood_group             29337 non-null  object 
 5   neighbourhood                   29337 non-null  object 
 6   latitude                        29337 non-null  float64
 7   longitude                       29337 non-null  float64
 8   room_type                       29337 non-null  object 
 9   price                           29337 non-null  int64  
 10  minimum_nights                  29337 non-null  int64  
 11  number_of_reviews               29337 non-null  int64  
 12  last_review                     2

In [6]:
# Summary statistics for the numeric columns, transposed so that each
# column of train_df is shown as a row.
train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,29337.0,18919880.0,11021550.0,2539.0,9350729.0,19517510.0,29165310.0,36485610.0
host_id,29337.0,67145790.0,78354040.0,2438.0,7740184.0,30719070.0,106442900.0,274321300.0
latitude,29337.0,40.72901,0.05459419,40.50641,40.69009,40.72314,40.76328,40.91234
longitude,29337.0,-73.95222,0.04609134,-74.24442,-73.98303,-73.95553,-73.93643,-73.71299
price,29337.0,150.9391,228.2242,0.0,69.0,107.0,175.0,10000.0
minimum_nights,29337.0,7.141971,22.27211,1.0,1.0,3.0,5.0,1250.0
number_of_reviews,29337.0,23.3545,44.69248,0.0,1.0,5.0,23.0,629.0
reviews_per_month,23386.0,1.369867,1.706732,0.01,0.19,0.71,2.01,58.5
calculated_host_listings_count,29337.0,7.00334,32.51162,1.0,1.0,1.0,2.0,327.0
availability_365,29337.0,112.8036,131.5445,0.0,0.0,45.0,227.0,365.0


In [7]:
# Count the distinct values in each column of the training split.
train_df.nunique()

id                                29337
name                              28894
host_id                           23964
host_name                          8375
neighbourhood_group                   5
neighbourhood                       217
latitude                          15269
longitude                         12151
room_type                             3
price                               591
minimum_nights                       94
number_of_reviews                   360
last_review                        1632
reviews_per_month                   867
calculated_host_listings_count       47
availability_365                    366
dtype: int64

### Distributions

### Distribution of categorical columns

## Preprocessing

## Baseline model and Linear model

## Different models

### Random Forest Regressor

### XGBoost Regressor

### LGBM Regressor

## Stacking

### Hyperparameter optimization

### Hyperparameter optimization and feature selection

### Interpretation and feature importances

## Conclusion

## References