# In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
import math

# Load the dataset and preview its first rows.
data = pd.read_csv('Updated_NYC.csv')
data.head(10)

# Columns that are not needed as inputs for the price model.
unused_columns = ['Unnamed: 0', 'name', 'id', 'host_name', 'last_review']
data = data.drop(columns=unused_columns)

# Preview the frame again to examine the change.
data.head(5)

# Split the column names into numeric columns and everything else; the
# non-numeric group is what gets integer-encoded further down.
# NOTE: `_get_numeric_data` is a private pandas helper (leading underscore).
num_var = data._get_numeric_data().columns
cat_var = set(data.columns).difference(num_var)

# Bar chart of the number of rows per `neighbourhood_group` value.
places = data['neighbourhood_group'].value_counts()
print(places)

place = places.plot(kind='bar')
place.set(
    title='Hosts with the most listings in NYC',
    ylabel='Count of listings',
    xlabel='Host Places',
)

# Using latitude and longitude into data format for prediction of the model.
#
# Columns are addressed by position, exactly as before: columns 3 and 4 are
# each replaced by the square of their cosine, and column 0 is overwritten
# with the sine of the *already transformed* column 3.
#
# The work is done on whole columns instead of three scalar `.iloc` writes per
# row, which is far faster. Whole-column assignment also replaces column 0
# outright rather than writing floats into it cell by cell.
#
# NOTE(review): the trig functions interpret their input as radians, while
# columns 3/4 are presumably latitude/longitude in degrees, and the sine is
# taken of the transformed value rather than the raw one. This is kept as-is
# because the hand-written prediction inputs further down appear to be on this
# transformed scale - confirm the intended encoding before changing it.
first_col, lat_col, lon_col = data.columns[[0, 3, 4]]
data[lat_col] = np.cos(data[lat_col]) ** 2
data[lon_col] = np.cos(data[lon_col]) ** 2
data[first_col] = np.sin(data[lat_col])
    
# Factorize the categorical data: each non-numeric column is replaced by the
# integer codes that `factorize` assigns to its distinct values.
for column in cat_var:
    codes, _uniques = data[column].factorize()
    data[column] = codes

# Correlation plot: pairwise correlations of the encoded frame, shown as a
# colour-graded table.
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

# Creating Test and Training Set.
# Candidate features are picked by column position; `price` is the target.
x = data.iloc[:, [1, 2, 3, 4, 5, 7, 8, 9, 10, 11]]
y = data['price']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=353
)
x_train.head()
y_train.head()

# Exclude the review columns from the model inputs. The split frames are
# derived from `x`, so they are rebound to new frames instead of being mutated
# with `inplace=True`, which can trigger pandas' SettingWithCopyWarning.
review_columns = ['number_of_reviews', 'reviews_per_month']
x_train = x_train.drop(columns=review_columns)
x_test = x_test.drop(columns=review_columns)

# Creating RandomForest Algorithm: fit a 100-tree regressor with a fixed seed
# on the training split.
reg = RandomForestRegressor(n_estimators=100, random_state=0)
reg.fit(x_train, y_train)

# Evaluate on the held-out split using the root of the mean squared error.
y_pred = reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rms = sqrt(mse)
print(rms)

# Scatter plot with predicted values on the x-axis and actual values on the
# y-axis.
plt.scatter(y_pred, y_test)

# Predicting the final output of hotel price for two hand-written listings.
#
# The model was fitted on `x_train`, so each single-row frame is reindexed to
# `x_train`'s column order before predicting. scikit-learn matches inputs to
# the fitted features by position (and recent versions reject frames whose
# column names are in a different order), so building the frame in an
# arbitrary key order would feed values into the wrong features or fail.
#
# NOTE(review): the integer codes for `neighbourhood_group`, `neighbourhood`
# and `room_type` come from `factorize`; confirm that they correspond to the
# labels named in the printed messages.
feature_order = list(x_train.columns)

a = pd.DataFrame({
    'minimum_nights': [1],
    'calculated_host_listings_count': [2],
    'availability_365': [355],
    'neighbourhood_group': [1],
    'neighbourhood': [5],
    'room_type': [2],
    'latitude': [0.963],
    'longitude': [0.024],
})[feature_order]
value = reg.predict(a)
print("Price at Manhattan for a Private room", value)

# Second example listing, same numeric inputs with different category codes.
a = pd.DataFrame({
    'minimum_nights': [1],
    'calculated_host_listings_count': [2],
    'availability_365': [355],
    'neighbourhood_group': [0],
    'neighbourhood': [0],
    'room_type': [1],
    'latitude': [0.963],
    'longitude': [0.024],
})[feature_order]
value = reg.predict(a)
print("Price at Bronx for a Entire Apartment", value)