# Airbnb Price Prediction

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Removing cached imports
import src.preproccessing
from importlib import reload
reload(src.preproccessing)

In [None]:
# Define a function to load the dataset
def load_data(filepath, **read_csv_kwargs):
    """
    Load a dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``usecols`` or ``parse_dates``). Leaving them out
        reproduces the previous single-argument behaviour.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(filepath, **read_csv_kwargs)

In [None]:
# Read the dataset CSV into the working DataFrame `df`
df = load_data(filepath="./dataset/AB_NYC_2019.csv")

## Dataset Properties

In [None]:
# Properties of the dataset

df.info()

In [None]:
df.head(5)

In [None]:
df.describe()

## Data cleaning

### Handling Missing Values

In [None]:
# Per-column null counts, keeping only the columns that actually contain nulls

missing = df.isnull().sum().loc[lambda null_counts: null_counts > 0]
print(missing)

In [None]:
# Rows whose `name` value is missing

df_name_missed = df.loc[df["name"].isna()]

In [None]:
# Check whether `id` is unique per row, and report the outcome either way
if df["id"].is_unique:
    print("every id is unique")
else:
    # Previously a failed check printed nothing, which looked the same as
    # the cell not having been run at all.
    print(f"{df['id'].duplicated().sum()} duplicated id values found")

 <p>The `name` and `host_name` columns are nominal values, so I don't want to include them in our model.
 `name` might be useful for SEO, but I don't think it makes any difference to prices, so I am dropping both columns.</p>

In [None]:
# Remove the `name` and `host_name` columns from the working frame
df = df.drop(columns=["name", "host_name"])

In [None]:
# Rows with a missing `last_review` / `reviews_per_month` value

df_last_review_missed = df.loc[df["last_review"].isna()]
df_review_per_month_missed = df.loc[df["reviews_per_month"].isna()]

<p>If the rows missing `last_review` and `reviews_per_month` are the same, I assume those listings have no reviews so far, so I am replacing the nulls with zero.</p>

In [None]:
# Build the project's constant-fill missing-value strategy, configured with const=0

from src.preproccessing import missing_values

missing_const = missing_values.FillConstMissingValuesStrategy(const=0)

In [None]:
# Apply the constant-fill strategy to both review columns, one after the other
for column_name in ("last_review", "reviews_per_month"):
    df = missing_const.handle(df=df, column=column_name)

In [None]:
df.isnull().sum()

### Removing Duplicate values

In [None]:
# Finding duplicate data: count rows that repeat an earlier row exactly

duplicates = df.duplicated(keep="first").sum()
print(duplicates)

In [None]:
# Report the result of the duplicate count instead of asserting it unconditionally
if duplicates == 0:
    print("NO duplicates found!")
else:
    print(f"{duplicates} duplicate rows found!")

## Data Transformation

In [None]:
# Collect the labels of all columns whose dtype is `object` or `category`.
# Later cells index into this result by position (categories[0], categories[2]).

categories = df.select_dtypes(include=["object", "category"]).columns
print(categories)

In [None]:
# Print how many distinct values each categorical column has; the values
# themselves are listed only for columns with fewer than 10 of them.
for category in categories:
    unique = df[category].unique()
    length = len(unique)
    print(f"Unique categories in {category} has {length} unique values")
    if length >= 10:
        continue
    print(unique)
    

In [None]:
def _neighbourhoods_in(group):
    """Return the distinct `neighbourhood` values of rows whose `neighbourhood_group` equals `group`."""
    in_group = df["neighbourhood_group"] == group
    return df.loc[in_group, "neighbourhood"].unique().tolist()


# One list of neighbourhood names per neighbourhood group
brooklyn_neighbourhoods = _neighbourhoods_in("Brooklyn")
manhattan_neighbourhoods = _neighbourhoods_in("Manhattan")
queens_neighbourhoods = _neighbourhoods_in("Queens")
staten_neighbourhoods = _neighbourhoods_in("Staten Island")
bronx_neighbourhoods = _neighbourhoods_in("Bronx")

# Distinct neighbourhood group values, in order of first appearance
neighbourhoods = df["neighbourhood_group"].unique()

<p>We have very few features, so I am going to use one-hot encoding for `neighbourhood_group` and `room_type`.</p>

### Encoding

In [None]:
# Build the project's one-hot encoder from the preprocessing package

from src.preproccessing import encoding

onehot_encoder = encoding.OneHotEncoding()

In [None]:
# neighbourhood_group feature encoding:
# encode the first categorical column, then swap it for its encoded columns
target_column = categories[0]
print(target_column)
encoded_df = onehot_encoder.encode(df=df, column=target_column)
df = pd.concat([df.drop(columns=[target_column]), encoded_df], axis=1)

In [None]:
df.columns

In [None]:
# room_type onehot encoding

room_type_column = categories[2]
print(room_type_column)
encoded_df = onehot_encoder.encode(df=df, column=room_type_column)
# `DataFrame.drop` returns a new frame unless `inplace=True`; the result must be
# assigned, otherwise the original text column stays in `df` beside its encoding.
df = df.drop(labels=[room_type_column], axis=1)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
df.columns

### Feature Engineering

<p>`neighbourhood` has 221 unique values, each belonging to one of the `neighbourhood_group` values. The distance from each unit to its neighbourhood is going to be a new feature, computed with the Google Maps Distance Matrix API.</p>

In [None]:
# Read the geocoding API key table from a local CSV file

df_api = pd.read_csv(filepath_or_buffer="geocoding_api.csv")

In [None]:
# # getting longitude and latitude for neighbourhoods using openweathermap api
# import requests

# def get_lat_long(city):
#     api_key = df_api[df_api["api"]=="openweather_map"]["api_key"][0]
#     limit = 1
#     url = f"http://api.openweathermap.org/geo/1.0/direct?q={city}, NY,US&limit={limit}&appid={api_key}"
#     response = requests.get(url)
#     if response.status_code ==200:
#         data =  response.json()
#         print(f"Geographical data for {city}:")
#         for entry in data:
#             print(f"Name: {entry['name']}, Latitude: {entry['lat']}, Longitude: {entry['lon']}")
#             return [city,(entry["lat"], entry["lon"])]
#     else:
#         print(f"Error: Unable to fetch data (status code:{response.status_code})")
    

In [None]:
# #Getting geocodes for all neighbourhood

# unique_neighbourhood = df["neighbourhood"].unique()
# geo_code = {}
# count = 1
# for city in unique_neighbourhood:
    
#     print(f"getting city {count}: {city}")
#     result = get_lat_long(city)
#     if result != None:
#         geo_code[city] = result[1]
#     else:
#         geo_code[city] = None
#     count +=1

# # storing collected data into csv file
# df_geocode = pd.DataFrame(list(geo_code.items()), columns=['neighbourhood', 'geocode'])
# df_geocode.to_csv('geo_codes.csv', index=False)

##### Opencage forward geocoding api

In [None]:
# # getting geocodes for remaining neighbourhoods using opencagedata api
# from opencage.geocoder import OpenCageGeocode

# from pprint import pprint
# api_key = df_api[df_api["api"]=="opencagedata"]["api_key"][1]
# geocoder = OpenCageGeocode(api_key)

# def opencage(city, county):
#     query = f"{city}, {county}, New York, United states of America"
#     results = geocoder.geocode(query, countrycode="us",limit =4)
#     current = {}
#     count = 1
#     for result in results:
#         if count ==1:
#             current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         else:
#             if current["best"][2] < result["confidence"]:
#                 current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         count+=1

#     return current["best"]

In [None]:
# geocode = {}
# for neighbourhood in neighbourhoods:
#     if neighbourhood =="Brooklyn":
#         current_group = brooklyn_neighbourhoods
#     elif neighbourhood =="Manhattan":
#         current_group = manhattan_neighbourhoods
#     elif neighbourhood =="Queens":
#         current_group = queens_neighbourhoods
#     elif neighbourhood =="Staten Island":
#         current_group = staten_neighbourhoods
#     else:
#         current_group = bronx_neighbourhoods
   
#     for city in current_group:
#         print(f"current city: {city}")
#         result = opencage(city, neighbourhood)
#         geocode[city] = result[1]
#         print(f"geocoded: {result[1]}")


In [None]:
# df_opencage_geocoded = pd.DataFrame(list(geocode.items()), columns=["neighbourhood","geometry"])
# df_opencage_geocoded.to_csv("neighbourhood_opencage_geometry.csv", index = False)

##### Feature Engineering - Feature `distance between unit and neighbourhood`

In [None]:

# NOTE(review): neither `pprint` nor `current` is defined by any active cell in the
# visible notebook (the only `pprint` import and the only `current` assignment are
# inside commented-out code), so this cell relies on leftover kernel state.
pprint(current["best"])

In [None]:
# NOTE(review): `results` is only assigned inside the commented-out `opencage`
# helper, so this cell relies on leftover kernel state — confirm or remove.
pprint(results[0]["formatted"])

In [None]:
# Use the row labelled 0: its coordinates form the origin and its
# neighbourhood name is the destination.
mode = "driving"
origin = (df["latitude"][0], df["longitude"][0])

destination = df["neighbourhood"][0]

In [None]:
print(origin)

In [None]:
# NOTE(review): `get_distance` is not defined or imported in any visible cell;
# presumably a distance-API helper that was removed — TODO restore or import it.
dist = get_distance(origin, destination, mode = mode)

<p>We have latitude and longitude data. Even though they are numeric, we can't use them directly. Instead, we can create new features such as "distance between the subway and the unit", "distance to the nearest public transport", "distance to the city center", and "distance to the nearest airport".</p>

## Machine Learning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### Random Forest

In [None]:
# Fixed seed so repeated runs build the same forest.
# NOTE(review): the notebook title says price prediction; a classifier may not be
# the intended estimator for a continuous price — confirm.
rfs = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
# NOTE(review): `x_train` / `y_train` are not created in any visible cell.
rff = rfs.fit(x_train, y_train)

In [None]:
# Predict on the test features with the fitted forest
y_predict = rff.predict(x_test)

In [None]:
# Wrap the predictions in a single-column DataFrame (column label 0)
y_predic = pd.DataFrame(data=y_predict)

In [None]:
# In column 0, replace the integer codes 1..len(cate) with the matching
# entries of `cate`, in `cate` order.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Saving the results in Csv
# NOTE(review): `test` is not defined in any visible cell.
data = {'id': test.id, 'country': y_predic[0]}
submission = pd.DataFrame(data).set_index('id')
submission.to_csv(r'submission files/submission_RF01.csv')

In [None]:
# Read the random-forest submission file back in
best_score = pd.read_csv(filepath_or_buffer=r'submission files/submission_RF01.csv')

### Support Vector Machine

In [None]:
# `svm` is the imported sklearn module, which is not callable; instantiate its
# support-vector classifier, in line with the other classifiers in this notebook.
sv = svm.SVC()

In [None]:
# Fit the support-vector model `sv`. The previous code re-fitted the random forest
# `rfs` here, so the "SVM" predictions were in fact random-forest predictions.
# NOTE(review): `x_train_f` / `y_train` are not created in any visible cell.
svr = sv.fit(x_train_f, y_train)

In [None]:
# Predict on the test features with the model fitted in the previous cell
y_predict = svr.predict(x_test_f)

In [None]:
y_predict

In [None]:
# Saving the results in Csv
# NOTE(review): the file name says RF02 although this is the SVM section, and the
# hard-coded Id range / 'SalePrice' column differ from the 'id' / 'country' layout
# written by the other submission cells — confirm which dataset this targets.
data = {'Id': range(1461, 2920), 'SalePrice': y_predict}
submission = pd.DataFrame(data).set_index('Id')
submission.to_csv(r'submission files/submission_RF02.csv')

In [None]:
# Read the stored random-forest submission for comparison
best_score = pd.read_csv(filepath_or_buffer=r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the new ones.
# NOTE(review): the cell that writes submission_RF01.csv stores a 'country'
# column, not 'SalePrice' — confirm this lookup can succeed.
a = best_score['SalePrice'] - y_predict

In [None]:
# Distribution of the prediction differences.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density scale
# is the documented replacement for its default histogram-plus-KDE output.
sns.histplot(a, bins=30, kde=True, stat="density", kde_kws=dict(cut=3));


In [None]:
# Store the latest predictions next to the loaded ones, in column 'new'
best_score['new'] = y_predict

In [None]:
# Annotated correlation matrix of the stored vs. new prediction columns
score_corr = best_score[['SalePrice', 'new']].corr()
sns.heatmap(score_corr, annot=True)

### K-Nearest Neighbours

In [None]:
# k-nearest-neighbours classifier with the library's default settings
KNN = KNeighborsClassifier()

In [None]:
# Fit the KNN model on the training split.
# NOTE(review): `x_train` / `y_train` are not created in any visible cell.
knn = KNN.fit(x_train, y_train)

In [None]:
# Predict on the test features with the fitted KNN model
y_predict = knn.predict(x_test)

In [None]:
# Wrap the predictions in a single-column DataFrame (column label 0)
y_predic = pd.DataFrame(data=y_predict)

In [None]:
# In column 0, replace the integer codes 1..len(cate) with the matching
# entries of `cate`, in `cate` order.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Saving the results in Csv
# NOTE(review): `test` is not defined in any visible cell.
data = {'id': test.id, 'country': y_predic[0]}
submission = pd.DataFrame(data).set_index('id')
submission.to_csv(r'submission files/submission_KNN01.csv')

In [None]:
y_predict

In [None]:
# Saving the results in Csv
# NOTE(review): this writes to submission_RF02.csv from the KNN section, the same
# path another cell in this notebook writes to — confirm the intended file name.
data = {'Id': range(1461, 2920), 'SalePrice': y_predict}
submission = pd.DataFrame(data).set_index('Id')
submission.to_csv(r'submission files/submission_RF02.csv')

In [None]:
# Read the stored random-forest submission for comparison
best_score = pd.read_csv(filepath_or_buffer=r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the new ones.
# NOTE(review): the cell that writes submission_RF01.csv stores a 'country'
# column, not 'SalePrice' — confirm this lookup can succeed.
a = best_score['SalePrice'] - y_predict

In [None]:
# Distribution of the prediction differences.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density scale
# is the documented replacement for its default histogram-plus-KDE output.
sns.histplot(a, bins=30, kde=True, stat="density", kde_kws=dict(cut=3));


In [None]:
# Store the latest predictions next to the loaded ones, in column 'new'
best_score['new'] = y_predict

In [None]:
# Annotated correlation matrix of the stored vs. new prediction columns
score_corr = best_score[['SalePrice', 'new']].corr()
sns.heatmap(score_corr, annot=True)

### XGBoostClassifier

In [None]:
# XGBoost classifier with the library's default settings
xg = XGBClassifier()

In [None]:
# Fit the XGBoost model on the training split.
# NOTE(review): `x_train` / `y_train` are not created in any visible cell.
xgf = xg.fit(x_train, y_train)

In [None]:
# Predict on the test features with the fitted XGBoost model
y_predict = xgf.predict(x_test)

In [None]:
# Wrap the predictions in a single-column DataFrame (column label 0)
y_predic = pd.DataFrame(data=y_predict)

In [None]:
# In column 0, replace the integer codes 1..len(cate) with the matching
# entries of `cate`, in `cate` order.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Saving the results in Csv
# NOTE(review): `test` is not defined in any visible cell.
data = {'id': test.id, 'country': y_predic[0]}
submission = pd.DataFrame(data).set_index('id')
submission.to_csv(r'submission files/submission_XG01.csv')