In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the NYC Airbnb listings CSV into a DataFrame and preview the first rows
nyc_data = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')
nyc_data.head(5)

# Data Wrangling

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts
nyc_data.info()

In [None]:
# Missing values per column, largest count first
null_counts = nyc_data.isna().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Convert last_review to datetime, then fill the gaps:
#   - missing last_review       -> the most recent review date found in the data
#   - missing reviews_per_month -> 0
nyc_data['last_review'] = pd.to_datetime(nyc_data['last_review'])

# Series.max() skips NaT; the builtin max() compares element by element and
# can return NaT when the first value is missing.
latest_review = nyc_data['last_review'].max()

# Assign the result back instead of calling fillna(inplace=True) on a column
# accessor: the chained in-place form is not guaranteed to update the parent
# frame (it has no effect under pandas Copy-on-Write).
nyc_data['last_review'] = nyc_data['last_review'].fillna(latest_review)
nyc_data['reviews_per_month'] = nyc_data['reviews_per_month'].fillna(0)

In [None]:
# Remove the columns that are not used in the analysis.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second run is a no-op instead of raising KeyError.
nyc_data = nyc_data.drop(columns=['name', 'host_name'], errors='ignore')

In [None]:
# Total number of missing cells left in the whole frame
per_column_nulls = nyc_data.isnull().sum()
per_column_nulls.sum()

# Exploratory Data Analysis

1. Host_id

In [None]:
# Which host IDs have the most listings? value_counts() is already sorted in
# descending order, so the first ten rows are the ten largest hosts.
listings_per_host = nyc_data['host_id'].value_counts()
top_host = listings_per_host.head(10)
top_host

In [None]:
# Default figure size for this and all later figures
figure_defaults = {'figure.figsize': (10, 8)}
sns.set(rc=figure_defaults)

# Bar chart of the ten hosts with the most listings
viz_1 = top_host.plot.bar(cmap='plasma')
viz_1.set_ylabel('Count of listings')
viz_1.set_title('Hosts with the most listings in NYC')
viz_1.set_xlabel('Host IDs')

* Observation: The host with the maximum number of listings registered on Airbnb in New York has 372 listings.

2. Neighbourhood group

In [None]:
# Number of distinct hosts in each neighbourhood group.
# Summing `calculated_host_listings_count` over listing rows does not count
# hosts: every listing row carries that value again (presumably the host's
# total listing count -- confirm against the data dictionary), so large hosts
# are counted many times over. Counting unique host_id values gives the
# number of hosts directly.
a = nyc_data.groupby('neighbourhood_group')['host_id'].nunique()

plt.style.use('ggplot')
ax_hosts = a.plot(kind='bar')
ax_hosts.set_title('Number of hosts per neighbourhood group')
ax_hosts.set_xlabel('Neighbourhood group')
ax_hosts.set_ylabel('Number of unique hosts')

In [None]:
# Number of listings (rows) in each neighbourhood group
sns.countplot(data=nyc_data, x='neighbourhood_group')

* Observations:
    a. The Manhattan neighbourhood group has the highest number of listings in the whole New York area.
    
    b. The number of hosts owning listings is highest in Manhattan, followed by Brooklyn.

3. Room type

In [None]:
# Listings per room type: bar chart with dark edge colours, then the raw counts
room_edge_colours = sns.color_palette("dark", 3)
sns.countplot(data=nyc_data, x='room_type', edgecolor=room_edge_colours)
nyc_data['room_type'].value_counts()

In [None]:
# Share of each room type among all listings, drawn as a pie chart
room_type_counts = nyc_data['room_type'].value_counts()
total_listings = len(nyc_data['room_type'])
b = room_type_counts / total_listings
b.plot(kind='pie', autopct='%.2f', fontsize=12, figsize=(8, 8))
plt.title('Room types availability in AirBnB', fontsize=20)

In [None]:
# Listings per room type inside each neighbourhood group, as stacked bars.
# `stacked=True` does nothing on a MultiIndex Series (every (group, room_type)
# pair just becomes its own bar). Unstacking room_type into columns gives one
# bar per neighbourhood group with one stacked segment per room type.
room_counts = (
    nyc_data
    .groupby(['neighbourhood_group', 'room_type'])['room_type']
    .count()
    .unstack(fill_value=0)  # a missing (group, room type) pair counts as 0
)
ax_rooms = room_counts.plot.barh(stacked=True)
ax_rooms.set_ylabel('Neighbourhood group')
ax_rooms.set_xlabel('Number of Rooms')
ax_rooms.set_title('Neighbourhood groups Vs Room types availability')

* Observation:

a. Of all the listings registered on Airbnb, more than 50% offer entire homes or apartments, and the remainder are mostly private rooms. Only 2% of listings offer shared rooms.

b. Brooklyn is the zone with the most listings offering private rooms, while Manhattan is the hub of apartment offerings, followed by Brooklyn.

c. The Bronx, Queens and Staten Island have the fewest listings registered and do not offer much of the service.

# Price Analysis

1. Price relation to Room Type

In [None]:
# Box plots of price per room type, one figure on each side of a $175 cut-off.
# NOTE: both filters are strict (< 175 and > 175), so listings priced at
# exactly $175 appear in neither figure.
# Each entry: (figure title, row mask, draw notched boxes?)
price_splits = [
    ('Price per Room Type for Properties under $175', nyc_data['price'] < 175, True),
    ('Price per Room Type for Properties more than $175', nyc_data['price'] > 175, False),
]
# Marker used for the mean of each box (white square, black outline)
mean_marker = {"marker": "s", "markerfacecolor": "white", "markeredgecolor": "black"}

for title, price_mask, notch in price_splits:
    data_filtered = nyc_data.loc[price_mask]
    f, ax = plt.subplots(figsize=(8, 6))
    # Draw on the axes created above rather than on pyplot's implicit "current axes"
    sns.boxplot(x='room_type', y='price', data=data_filtered, notch=notch, showmeans=True,
                meanprops=mean_marker, ax=ax)
    ax.set_title(title)

# plt.ioff() only switches interactive mode off for the whole session;
# plt.show() is the call that renders the figures.
plt.show()


2. Price relation to the number of review per month


In [None]:
# Scatter plots of price against reviews per month, split at a $175 cut-off.
sns.set_palette("muted")
x = 'reviews_per_month'
y = 'price'

# Each entry: (figure title, row mask).
# NOTE: only the "under $175" panel also restricts reviews_per_month to < 30;
# the "more than $175" panel has no such restriction. Listings priced at
# exactly $175 are in neither panel because both comparisons are strict.
scatter_splits = [
    ('Price relation to number of review per month for Properties under $175',
     (nyc_data['price'] < 175) & (nyc_data['reviews_per_month'] < 30)),
    ('Price relation to number of review per month for Properties more than $175',
     nyc_data['price'] > 175),
]

for title, row_mask in scatter_splits:
    data_filtered = nyc_data.loc[row_mask]
    f, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x=x, y=y, data=data_filtered)
    # Label the axes explicitly so each figure can be read on its own
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_title(title)

# plt.ioff() only switches interactive mode off for the whole session;
# plt.show() is the call that renders the figures.
plt.show()


3. Price relation to Neighbourhood Group

In [None]:
# Median price per neighbourhood group, bars ordered from cheapest to most expensive.
title = 'Median Price per Neighbourhood Group'
result = (
    nyc_data
    .groupby('neighbourhood_group')['price']
    .median()
    .reset_index()
    .sort_values('price')
)
# sns.barplot aggregates with the mean unless told otherwise, so pass
# estimator=np.median to make the bar heights match the title and the
# ordering computed above.
sns.barplot(x='neighbourhood_group', y='price', data=nyc_data,
            order=result['neighbourhood_group'], estimator=np.median)
plt.title(title)
# plt.show() renders the figure; plt.ioff() would only turn interactive mode off.
plt.show()

* Observation: Properties in Manhattan are more expensive

4. Price Relation to Minimum Nights

In [None]:
# Price against minimum nights, one point per listing
plt.scatter('minimum_nights', 'price', data=nyc_data)

* Observation: Minimum number of night stays has no significant impact on prices.

# Designing price prediction ML model

In [None]:
# Keep only the listings used for modelling:
#   - price > 0            (there are multiple listings with no price set)
#   - availability_365 > 0
has_price = nyc_data['price'] > 0
is_available = nyc_data['availability_365'] > 0
# .copy() makes the result an independent frame, so assigning columns on it
# later does not raise SettingWithCopyWarning.
nyc_data = nyc_data.loc[has_price & is_available].copy()

In [None]:
# Names of the predictor columns (X) and of the target column (y)
X = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'availability_365',
    'room_type',
    'neighbourhood_group',
    'neighbourhood',
]
y = 'price'

In [None]:
# Predictor columns only (all rows, the columns named in X)
data_X = nyc_data.loc[:, X]

In [None]:
# Target column as a Series
data_y = nyc_data.loc[:, y]

In [None]:
# One-hot encode the categorical predictors so the model receives numeric inputs.
# drop_first=True drops the first level of each categorical column.
X = pd.get_dummies(
    data=data_X,
    prefix_sep='_',
    drop_first=True,
)

In [None]:
# Prices are not normally distributed and are noisy with a very large variance,
# so the model is trained on log10(price) rather than on the raw price.
y = data_y.pipe(np.log10)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Ordinary least-squares regression on the one-hot encoded features
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test)


In [None]:
# Evaluate the linear model. The target is log10(price), so the RMSE is in
# log10 units.

from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# Predict once per split and reuse the arrays below
lr_train_pred = lr.predict(X_train)
lr_test_pred = lr.predict(X_test)

rmse_test = np.sqrt(metrics.mean_squared_error(y_test, lr_test_pred))
r2_train = r2_score(y_train, lr_train_pred, multioutput='variance_weighted')
r2_test = r2_score(y_test, lr_test_pred, multioutput='variance_weighted')

print('RMSE:', np.round(rmse_test, 2))
print('R2 score train:', np.round(r2_train, 2))
print('R2 score test:', np.round(r2_test, 2))


In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression on the same train/test split
br = BayesianRidge().fit(X_train, y_train)
y_predict = br.predict(X_test)

In [None]:
# Evaluate the Bayesian ridge model. The predictions must come from `br`;
# scoring `lr` here would only repeat the linear-regression numbers.
# The target is log10(price), so the RMSE is in log10 units.
br_train_pred = br.predict(X_train)
br_test_pred = br.predict(X_test)

print('RMSE:', np.round(np.sqrt(metrics.mean_squared_error(y_test, br_test_pred)), 2))
# Train and test R2 are both reported on the same 0-1 scale so they can be compared
print('R2 score train:', np.round(r2_score(y_train, br_train_pred, multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test, br_test_pred, multioutput='variance_weighted'), 2))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy: nyc_data was produced by row filtering, and
# assigning columns on such a frame raises SettingWithCopyWarning.
nyc_data = nyc_data.copy()

# Replace each categorical column with integer codes. One fitted encoder per
# column is kept in `label_encoders`, so the codes can be mapped back to the
# original labels with inverse_transform.
# NOTE(review): the integer codes impose an arbitrary order on nominal
# categories, which a linear model will treat as numeric -- confirm this is intended.
label_encoders = {}
for column in ['neighbourhood_group', 'neighbourhood', 'room_type']:
    le = LabelEncoder()
    nyc_data[column] = le.fit_transform(nyc_data[column])
    label_encoders[column] = le

# Order the rows from cheapest to most expensive
nyc_data = nyc_data.sort_values(by='price', ascending=True)

nyc_data.head()

In [None]:
# Second linear model, trained on the label-encoded columns plus the numeric ones
feature_columns = [
    'neighbourhood_group',
    'neighbourhood',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
X = nyc_data[feature_columns]
y = np.log10(nyc_data['price'])  # target is log10(price)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Score the label-encoded linear model on the held-out rows.
# The target is log10(price), so RMSE and MAE are in log10 units.
y_predicts = lm.predict(X_test)

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_predicts))  # square root of the MSE, i.e. RMSE
r2_percent = r2_score(y_test, y_predicts) * 100                 # R2 expressed as a percentage
mae = mean_absolute_error(y_test, y_predicts)

# The labels name what is actually printed: the first value is the *root*
# mean squared error, and R2 is shown as a percentage.
print("""
        Root Mean Squared Error: {}
        R2 Score (%): {}
        Mean Absolute Error: {}
     """.format(rmse, r2_percent, mae))
