# Question / Problem definition


    1. What can we learn about different hosts and areas?
    2. What can we learn from predictions? (e.g., locations, prices, reviews, etc.)
    3. Which hosts are the busiest and why?
    4. Is there any noticeable difference of traffic among different areas and what could be the reason for it?


# Imports

In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt


In [80]:
# Refresh the locally developed package so recent edits are picked up

from importlib import reload
import src.preprocessing

# Re-import the editable package so this kernel stops using the cached version
reload(src.preprocessing)

<module 'src.preprocessing' from '/home/nandhu/Documents/kaggle-competitions/src/preprocessing/__init__.py'>

In [81]:
# Read the dataset CSV into a DataFrame

df = pd.read_csv(filepath_or_buffer="./dataset/AB_NYC_2019.csv")

# Automated EDA

In [82]:
# Build the automated profiling report for the raw DataFrame.
# NOTE(review): `ProfileReport` is not imported in any cell visible in this notebook —
# confirm where it comes from; on a fresh kernel this cell would raise NameError.
profile = ProfileReport(df, title = "Automated EDA")

In [83]:
# Display the report through its notebook-iframe renderer
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Manual EDA

In [84]:
# Imports: inspection strategies from the project's analysis helpers

from src.analyse_src import data_inspection
# One strategy object per kind of inspection used in the following cells
dt_inspection = data_inspection.DataTypeInspectionStrategy()
ss_inspection = data_inspection.SummaryStatisticsInspectionStrategy()

In [85]:
# Inspecting datatypes and non-null counts of every feature

dt_inspection.inspect(df)


DataTypes and Non-Null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  

In [86]:
# Descriptive (summary) statistics of the dataset's columns

ss_inspection.inspect(df)

Descriptive Statistics of Numerical Datatypes:
                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.743213e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    48895.000000       48895.000000       38843.000000   
mean         7.029962          23.274466           1.373221   
std         20.510550          44.550582           1

# Preprocessing

## Missing Values

In [87]:
# Count nulls per column and keep only the columns that actually have some

null_counts = df.isnull().sum()
missing = null_counts.loc[null_counts > 0]
print(missing)

name                    16
host_name               21
last_review          10052
reviews_per_month    10052
dtype: int64


### "name" & "host_name" Columns

In [88]:
# Rows whose "name" value is null


name_is_null = df["name"].isnull()
df_name_missed = df.loc[name_is_null]

In [89]:
# Sanity check: the "id" column should identify each row uniquely
if df["id"].is_unique:
    print("every id is unique")

every id is unique


 <p>The `name` and `host_name` columns are nominal values. I don't want to consider these columns for our model.
 `name` might be useful for SEO purposes, but I don't think it makes any difference to prices, so I am dropping both columns.</p>

In [90]:
# Remove both name columns; they are not used as model features
df = df.drop(columns=["name", "host_name"])

### "last_review" & "reviews_per_month" Columns

In [91]:
# Subsets of rows where each review-related column is null

df_review_per_month_missed = df.loc[df["reviews_per_month"].isnull()]
df_last_review_missed = df.loc[df["last_review"].isnull()]

<p>If `last_review` and `reviews_per_month` are missing in the same rows, I assume those listings have not received any reviews so far, so I am replacing the nulls with zero.



</p>

In [92]:
# Importing the missing-value handling strategies

from src.preprocessing import missing_values
# Strategy instance used in the next cell to fill nulls with a constant value
missing_const = missing_values.FillConstMissingValuesStrategy()

In [93]:
# Fill the nulls in both review columns with the constant 0, one column per call.
# NOTE(review): last_review otherwise holds date strings, so filling it with 0 mixes
# types in that column — confirm this is acceptable (the column is dropped later on).
for review_column in ("last_review", "reviews_per_month"):
    df = missing_const.handle(df=df, column=review_column, const=0)

In [94]:
# Re-count nulls per column to confirm nothing is missing after the fill
df.isnull().sum()

id                                0
host_id                           0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Removing Duplicate values

In [95]:
# Number of rows that are exact duplicates of an earlier row

is_duplicate_row = df.duplicated()
duplicates = is_duplicate_row.sum()
print(duplicates)

0


In [96]:
# Report the outcome of the duplicate check based on the computed count, rather than
# printing a fixed message that would be wrong if the data ever contained duplicates.
if duplicates == 0:
    print("NO duplicates found!")
else:
    print(f"{duplicates} duplicate rows found!")

NO duplicates found!


## Data Transformation

In [97]:
# Column labels of the non-numeric (object / category dtype) features

categorical_dtypes = ["object", "category"]
categories = df.select_dtypes(include=categorical_dtypes).columns
print(categories)

Index(['neighbourhood_group', 'neighbourhood', 'room_type', 'last_review'], dtype='object')


In [98]:
# Report the cardinality of each categorical column; list the values only when there are few
for category in categories:
    values = df[category].unique()
    n_values = len(values)
    print(f"Unique categories in {category} has {n_values} unique values")
    if n_values >= 10:
        continue
    print(values)
    

Unique categories in neighbourhood_group has 5 unique values
['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']
Unique categories in neighbourhood has 221 unique values
Unique categories in room_type has 3 unique values
['Private room' 'Entire home/apt' 'Shared room']
Unique categories in last_review has 1765 unique values


<p>We have very few features, so I am going to use one-hot encoding for `neighbourhood_group` and `room_type`.</p>

### Encoding

In [99]:
# Importing the one-hot encoding strategy from the project's preprocessing package

from src.preprocessing import encoding
# Encoder instance reused for every one-hot encoded column below
onehot_encoder = encoding.OneHotEncoding()

In [100]:
# neighbourhood_group feature encoding

# Name the column explicitly instead of relying on its position in `categories`,
# which silently changes if the set of categorical columns changes.
group_column = "neighbourhood_group"
encoded_df = onehot_encoder.encode(df=df, column=group_column)
# Build the encoded frame without mutating `df` in place, so re-running this cell
# does not fail because the column has already been dropped.
df_encoded = pd.concat([df.drop(columns=[group_column]), encoded_df], axis=1)

In [101]:
# Preview the first rows to check the new neighbourhood_group indicator columns
df_encoded.head(3)

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island
0,2539,2787,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,0.0,1.0,0.0,0.0,0.0
1,2595,2845,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,0.0,0.0,1.0,0.0,0.0
2,3647,4632,Harlem,40.80902,-73.9419,Private room,150,3,0,0,0.0,1,365,0.0,0.0,1.0,0.0,0.0


In [102]:
# One-hot encode room_type and swap the raw column for its indicator columns

room_type_column = categories[2]
print(room_type_column)
encoded_df = onehot_encoder.encode(df=df, column=room_type_column)
df_encoded = pd.concat(
    [df_encoded.drop(columns=[room_type_column]), encoded_df],
    axis=1,
)

room_type


In [103]:
# neighbourhood label encoding (the strategy used here is LabelEncoding, not hashing)

from src.preprocessing.encoding import LabelEncoding

label_encoding = LabelEncoding()

# Replaces the neighbourhood names with integer codes (visible in the head() output below).
# NOTE(review): integer codes give a nominal feature an arbitrary order, which a linear
# model will treat as meaningful — confirm this is intended.
df_encoded = label_encoding.encode(df=df_encoded, column="neighbourhood")

In [104]:
# Preview the fully encoded frame
df_encoded.head(3)

Unnamed: 0,id,host_id,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,2539,2787,108,40.64749,-73.97237,149,1,9,2018-10-19,0.21,6,365,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2595,2845,127,40.75362,-73.98377,225,1,45,2019-05-21,0.38,2,355,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3647,4632,94,40.80902,-73.9419,150,3,0,0,0.0,1,365,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Normalisation

In [121]:
# Drop the raw review date so it is not passed to the model as a feature.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell re-runnable:
# a second execution no longer raises KeyError once the column is gone.
df_encoded = df_encoded.drop(columns=["last_review"], errors="ignore")

### Train test split

In [123]:
# Separate the prediction target (price) from the remaining columns
target = df_encoded["price"]
features = df_encoded.drop(columns=["price"])

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
# Hold out 33% of the rows for evaluation.
# A fixed random_state makes the split — and every metric computed from it —
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)

In [126]:
# (rows, columns) of the training features
x_train.shape

(32759, 18)

In [127]:
# (rows, columns) of the held-out features
x_test.shape

(16136, 18)

In [128]:
# Number of training targets; should match the row count of x_train
y_train.shape

(32759,)

In [129]:
# Number of held-out targets; should match the row count of x_test
y_test.shape

(16136,)

## Machine Learning

### Linear Regression


In [139]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [135]:
# Linear regression estimator with default settings
linear_regressor = LinearRegression()

In [136]:
# Fit the linear model on the training split
linear_regressor_fitted = linear_regressor.fit(x_train, y_train)

In [137]:
# Predict prices for the held-out rows
y_predict = linear_regressor_fitted.predict(x_test)

#### Evaluation

In [142]:
# MAE between the held-out prices and the predictions

mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
print(f"Mean absolute Error: {mae}")

Mean absolute Error: 75.5198279994184


In [143]:
# MSE between the held-out prices and the predictions

mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 67162.19708314474


In [144]:
# Root mean squared error: square root of the MSE computed in the previous cell,
# which puts the error back in the same units as the target

rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squarred Error: 259.15670372024863


In [146]:
# R-squared of the predictions against the held-out prices

r2 = r2_score(y_true=y_test, y_pred=y_predict)
print(f"R-Squared score: {r2}")

R-Squared score: 0.0783217624336533


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### Random Forest

In [None]:
# `price` is a continuous target, so use the regression variant of random forest;
# a classifier would treat every distinct price as its own class.
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the fitted forest is reproducible between runs
rfs = RandomForestRegressor(random_state=42)

In [None]:
# Fit the estimator created above (`rfs`) on the training split
rff=rfs.fit(x_train,y_train)

In [None]:
# Predict on the held-out rows; this overwrites the linear-regression `y_predict`
y_predict=rff.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame (later cells read them back as column 0)
y_predic=pd.DataFrame(y_predict)

In [None]:
# Replace the integer values 1..len(cate) in column 0 with the matching entries of `cate`.
# NOTE(review): `cate` is not defined in any cell visible in this notebook, so this cell
# raises NameError as written — it looks carried over from a different (classification) task.
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results to CSV, indexed by id.
# NOTE(review): `test` is not defined in any visible cell, and the 'country' column does
# not match this notebook's price target — likely leftover from another project; confirm.
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_RF01.csv')

In [None]:
# Read back the submission file written by the previous cell
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

### Support Vector Machine

In [None]:
# `svm` is the sklearn.svm module (imported via `from sklearn import svm`); calling the
# module itself raises TypeError. Instantiate the support-vector regressor it provides —
# regression, because the target (`price`) is continuous.
sv = svm.SVR()

In [None]:
# Fit the support-vector model (`sv`), not the random forest (`rfs`) that was fitted here
# by mistake — otherwise the "SVM" results are just a second random-forest run.
# NOTE(review): `x_train_f` is not defined in any visible cell; presumably a scaled copy
# of x_train — confirm and create it before running this section.
svr = sv.fit(x_train_f, y_train)

In [None]:
# Predict with the model fitted in the previous cell.
# NOTE(review): `x_test_f` is not defined in any visible cell — confirm where it is built.
y_predict=svr.predict(x_test_f)

In [None]:
# Display the raw predictions
y_predict

In [None]:
# Saving the results to CSV, indexed by Id.
# NOTE(review): range(1461, 2920) has 1459 entries while x_test has 16136 rows, so the
# column lengths will not match predictions made on x_test; the 'SalePrice' column and the
# "RF02" file name also look carried over from another project — confirm before running.
data={'Id':range(1461,2920),'SalePrice':y_predict}
pd.DataFrame(data).set_index('Id').to_csv(r'submission files/submission_RF02.csv')

In [None]:
# Load the earlier random-forest submission file to compare against
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the current ones.
# NOTE(review): the cell that writes submission_RF01.csv stores 'id' and 'country' columns,
# not 'SalePrice' — confirm the file actually has this column.
a=best_score['SalePrice']-y_predict

In [None]:
# Distribution of the prediction differences.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# replacement that draws the same kind of figure. The commented-out subplot line was dead code.
sns.histplot(a, bins=30, kde=True, stat="density", kde_kws=dict(cut=3));


In [None]:
# Store the current predictions next to the loaded ones for comparison
best_score['new']=y_predict

In [None]:
# Annotated correlation heatmap between the stored and the new predictions
sns.heatmap(best_score[['SalePrice','new']].corr(),annot=True)

### K-Nearest Neighbours

In [None]:
# `price` is a continuous target, so use the regression variant of k-nearest neighbours
# instead of the classifier.
from sklearn.neighbors import KNeighborsRegressor

KNN = KNeighborsRegressor()

In [None]:
# Fit the k-nearest-neighbours estimator on the training split
knn=KNN.fit(x_train,y_train)

In [None]:
# Predict on the held-out rows; this overwrites the previous model's `y_predict`
y_predict=knn.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame (later cells read them back as column 0)
y_predic=pd.DataFrame(y_predict)

In [None]:
# Replace the integer values 1..len(cate) in column 0 with the matching entries of `cate`.
# NOTE(review): `cate` is not defined in any cell visible in this notebook, so this cell
# raises NameError as written — it looks carried over from a different (classification) task.
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results to CSV, indexed by id.
# NOTE(review): `test` is not defined in any visible cell, and the 'country' column does
# not match this notebook's price target — likely leftover from another project; confirm.
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_KNN01.csv')

In [None]:
# Display the raw predictions
y_predict

In [None]:
# Saving the results to CSV, indexed by Id.
# NOTE(review): this writes to the same submission_RF02.csv path used in the SVM section,
# so it overwrites that file; range(1461, 2920) has 1459 entries while x_test has 16136
# rows, so the column lengths will not match predictions made on x_test — confirm.
data={'Id':range(1461,2920),'SalePrice':y_predict}
pd.DataFrame(data).set_index('Id').to_csv(r'submission files/submission_RF02.csv')

In [None]:
# Load the earlier random-forest submission file to compare against
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the current ones.
# NOTE(review): the cell that writes submission_RF01.csv stores 'id' and 'country' columns,
# not 'SalePrice' — confirm the file actually has this column.
a=best_score['SalePrice']-y_predict

In [None]:
# Distribution of the prediction differences.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# replacement that draws the same kind of figure. The commented-out subplot line was dead code.
sns.histplot(a, bins=30, kde=True, stat="density", kde_kws=dict(cut=3));


In [None]:
# Store the current predictions next to the loaded ones for comparison
best_score['new']=y_predict

In [None]:
# Annotated correlation heatmap between the stored and the new predictions
sns.heatmap(best_score[['SalePrice','new']].corr(),annot=True)

### XGBoostClassifier

In [None]:
# `price` is a continuous target, so use XGBoost's regressor; the classifier expects
# class labels rather than arbitrary price values.
from xgboost import XGBRegressor

xg = XGBRegressor()

In [None]:
# Fit the XGBoost estimator on the training split
xgf=xg.fit(x_train,y_train)

In [None]:
# Predict on the held-out rows; this overwrites the previous model's `y_predict`
y_predict=xgf.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame (later cells read them back as column 0)
y_predic=pd.DataFrame(y_predict)

In [None]:
# Replace the integer values 1..len(cate) in column 0 with the matching entries of `cate`.
# NOTE(review): `cate` is not defined in any cell visible in this notebook, so this cell
# raises NameError as written — it looks carried over from a different (classification) task.
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results to CSV, indexed by id.
# NOTE(review): `test` is not defined in any visible cell, and the 'country' column does
# not match this notebook's price target — likely leftover from another project; confirm.
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_XG01.csv')