# Airbnb Price Prediction

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Removing cached imports
import src.preproccessing
from importlib import reload
reload(src.preproccessing)

<module 'src.preproccessing' from '/home/nandhu/Documents/kaggle-competitions/src/preproccessing/__init__.py'>

In [8]:
# Define a function to load data/missing_value import DroppingMissingValueStrategy 
def load_data(filepath):
    """
    Loading dataset from a specified filepath.
    """

    return pd.read_csv(filepath)

In [9]:
df= load_data("./dataset/AB_NYC_2019.csv")

## Dataset Properties

In [10]:
# Properties of the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [11]:
df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [12]:
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


## Data cleaning

### Handling Missing Values

In [13]:
# List of columns with missing values

missing = df.isnull().sum()
missing = missing[missing>0]
print(missing)

name                    16
host_name               21
last_review          10052
reviews_per_month    10052
dtype: int64


In [14]:
# Handling Name

df_name_missed = df[df["name"].isnull()]

In [15]:
if len(df["id"].unique()) == len(df["id"]):
    print("every id is unique")

every id is unique


 <p>The `name` and `host name` columns are nominal values. I dont want to consider these columns for out model.
 `name` might be useful on SEO basis. but i dont think it might make any difference on prices. i am dropping both columns</p>

In [16]:
df = df.drop(["name", "host_name"], axis=1)

In [17]:
# Handling last_review and reviews_per_month page

df_last_review_missed = df[df["last_review"].isnull()]
df_review_per_month_missed = df[df["reviews_per_month"].isnull()]

<p>If last_review and review_per_month missing row are same. I am considering i dont have any review so far. I am repalcing null with 'zero'



</p>

In [18]:
# Importing Missing Value strategies

from src.preproccessing import missing_values
missing_const = missing_values.FillConstMissingValuesStrategy(const = 0)

In [19]:
df = missing_const.handle(df =df, column="last_review")
df = missing_const.handle(df=df, column = "reviews_per_month")

In [20]:
df.isnull().sum()

id                                0
host_id                           0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

### Removing Duplicate values

In [21]:
# Finding suplicate data

duplicates = df.duplicated().sum()
print(duplicates)

0


In [22]:
print("NO duplicates found!")

NO duplicates found!


## Data Transformation

In [23]:
# Filtering categorical variables

categories = df.select_dtypes(include=["object", "category"]).columns
print(categories)

Index(['neighbourhood_group', 'neighbourhood', 'room_type', 'last_review'], dtype='object')


In [24]:
for category in categories:
    unique = df[category].unique()
    length  = len(unique)
    print(f"Unique categories in {category} has {len(unique)} unique values")
    if length< 10:
        print(unique)
    

Unique categories in neighbourhood_group has 5 unique values
['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']
Unique categories in neighbourhood has 221 unique values
Unique categories in room_type has 3 unique values
['Private room' 'Entire home/apt' 'Shared room']
Unique categories in last_review has 1765 unique values


In [25]:
brooklyn_neighbourhoods = df[df['neighbourhood_group'] == "Brooklyn"]['neighbourhood'].unique().tolist()
manhattan_neighbourhoods = df[df['neighbourhood_group'] == "Manhattan"]['neighbourhood'].unique().tolist()
queens_neighbourhoods = df[df['neighbourhood_group'] == "Queens"]['neighbourhood'].unique().tolist()
staten_neighbourhoods = df[df['neighbourhood_group'] == "Staten Island"]['neighbourhood'].unique().tolist()
bronx_neighbourhoods = df[df['neighbourhood_group'] == "Bronx"]['neighbourhood'].unique().tolist()
neighbourhoods = df["neighbourhood_group"].unique()

<p>We have very few features. so i am going to use one hot encoding for `neighbourhood_group` and `room_type`</p>

### Encoding

In [26]:
# Importing Onehot encoding

from src.preproccessing import encoding
onehot_encoder = encoding.OneHotEncoding()

In [27]:
# neighbourhood_group feature encoding
print(categories[0])
encoded_df = onehot_encoder.encode(df=df, column=categories[0])
df.drop(labels=[categories[0]], axis=1, inplace =True)
df = pd.concat([df, encoded_df], axis =1)

neighbourhood_group


In [28]:
df.columns

Index(['id', 'host_id', 'neighbourhood', 'latitude', 'longitude', 'room_type',
       'price', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'neighbourhood_group_Bronx',
       'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan',
       'neighbourhood_group_Queens', 'neighbourhood_group_Staten Island'],
      dtype='object')

In [29]:
# room_type onehot encoding

print(categories[2])
encoded_df = onehot_encoder.encode(df=df, column=categories[2])
df.drop(labels=[categories[2]],axis=1)
df = pd.concat([df,encoded_df],axis=1)

room_type


### Feature Engineering

<p>`neighbourhood` is have 221 unique value which comes under neighbourhood_groups. Distance to the neighbourhood going to be a new feature using `google maps distance matric api`</p>

In [30]:
# Importing Geocoding api keys

df_api = pd.read_csv("geocoding_api.csv")

In [31]:
# # getting langtitude and latitude for neighbourhoods using openweathermap api
# import requests

# def get_lat_long(city):
#     api_key = df_api[df_api["api"]=="openweather_map"]["api_key"][0]
#     limit = 1
#     url = f"http://api.openweathermap.org/geo/1.0/direct?q={city}, NY,US&limit={limit}&appid={api_key}"
#     response = requests.get(url)
#     if response.status_code ==200:
#         data =  response.json()
#         print(f"Geographical data for {city}:")
#         for entry in data:
#             print(f"Name: {entry['name']}, Latitude: {entry['lat']}, Longitude: {entry['lon']}")
#             return [city,(entry["lat"], entry["lon"])]
#     else:
#         print(f"Error: Unable to fetch data (status code:{response.status_code})")
    

In [32]:
# #Getting geocodes for all neighbourhood

# unique_neighbourhood = df["neighbourhood"].unique()
# geo_code = {}
# count = 1
# for city in unique_neighbourhood:
    
#     print(f"getting city {count}: {city}")
#     result = get_lat_long(city)
#     if result != None:
#         geo_code[city] = result[1]
#     else:
#         geo_code[city] = None
#     count +=1

# # storing collected data into csv file
# df_geocode = pd.DataFrame(list(geo_code.items()), columns=['neighbourhood', 'geocode'])
# df_geocode.to_csv('geo_codes.csv', index=False)

##### Opencage forward geocoding api

In [33]:
# getting geocodes for remaining neighbourhoods using opencagedata api
# from opencage.geocoder import OpenCageGeocode

# from pprint import pprint
# api_key = df_api[df_api["api"]=="opencagedata"]["api_key"][1]
# geocoder = OpenCageGeocode(api_key)

# def opencage(city, county):
#     query = f"{city}, {county}, New York, United states of America"
#     results = geocoder.geocode(query, countrycode="us",limit =4)
#     current = {}
#     count = 1
#     for result in results:
#         if count ==1:
#             current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         else:
#             if current["best"][2] < result["confidence"]:
#                 current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         count+=1

#     return current["best"]

In [34]:
# geocode = {}
# for neighbourhood in neighbourhoods:
#     if neighbourhood =="Brooklyn":
#         current_group = brooklyn_neighbourhoods
#     elif neighbourhood =="Manhattan":
#         current_group = manhattan_neighbourhoods
#     elif neighbourhood =="Queens":
#         current_group = queens_neighbourhoods
#     elif neighbourhood =="Staten Island":
#         current_group = staten_neighbourhoods
#     else:
#         current_group = bronx_neighbourhoods
   
#     for city in current_group:
#         print(f"current city: {city}")
#         result = opencage(city, neighbourhood)
#         geocode[city] = {"lat":result[1]["lat"], "lng":result[1]["lng"]}
#         print(f"geocoded: {result[1]}")


In [35]:
# import json
# with open("neighbourhood_geocode.json", "w") as outfile: 
#     json.dump(geocode,outfile)

##### Feature Engineering - Feature `distance bw unit and neighbourhood`

In [36]:
df_feature = df[["id", "neighbourhood", "latitude", "longitude"]]

In [37]:
df_feature.head()

Unnamed: 0,id,neighbourhood,latitude,longitude
0,2539,Kensington,40.64749,-73.97237
1,2595,Midtown,40.75362,-73.98377
2,3647,Harlem,40.80902,-73.9419
3,3831,Clinton Hill,40.68514,-73.95976
4,5022,East Harlem,40.79851,-73.94399


In [38]:
import json

with open("neighbourhood_geocode.json", "r") as json_file:
    neighbourhood = json.load(json_file)

In [40]:
neighbourhoods = df_feature["neighbourhood"].unique()

In [41]:
for item in neighbourhoods:
    df_item = df_feature[df["neighbourhood"]==item]
    print(f"{item} - {df_item.count()}")

Kensington - id               175
neighbourhood    175
latitude         175
longitude        175
dtype: int64
Midtown - id               1545
neighbourhood    1545
latitude         1545
longitude        1545
dtype: int64
Harlem - id               2658
neighbourhood    2658
latitude         2658
longitude        2658
dtype: int64
Clinton Hill - id               572
neighbourhood    572
latitude         572
longitude        572
dtype: int64
East Harlem - id               1117
neighbourhood    1117
latitude         1117
longitude        1117
dtype: int64
Murray Hill - id               485
neighbourhood    485
latitude         485
longitude        485
dtype: int64
Bedford-Stuyvesant - id               3714
neighbourhood    3714
latitude         3714
longitude        3714
dtype: int64
Hell's Kitchen - id               1958
neighbourhood    1958
latitude         1958
longitude        1958
dtype: int64
Upper West Side - id               1971
neighbourhood    1971
latitude         1971
longitu

In [114]:
city = row1["neighbourhood"]


In [115]:
geocode["Kensington"]

{'lat': 40.6462149, 'lng': -73.970694}

In [116]:
origin = {"lat":row1["latitude"], "lng":row1["longitude"]}
destination = {"lat":geocode["Kensington"]["lat"], "lng":geocode["Kensington"]["lng"]}

In [118]:
origin

{'lat': np.float64(40.75362), 'lng': np.float64(-73.98377)}

In [121]:
import requests
import json

# Your MapQuest API key
api_key = df_api[df_api["api"]=="mapquest"]["api_key"][2]

# API URL
url = f'https://www.mapquestapi.com/directions/v2/routematrix?key={api_key}'

# Define the locations (one origin, multiple destinations)
locations = [
    { "latLng": origin},
    { "latLng": {
        "lat": 40.750307,
        "lng": -108.999472
      }},
    {
        "latLng":destination}
]

# Define the request body
payload = {
    "locations": locations,
    "options": {
        "manyToOne": False,  # Set to True for many-to-one, False for one-to-many
        "allToAll": False,   # Set to True if you want distances between all locations
        "unit": "m"          # Unit can be 'm' for miles or 'k' for kilometers
    }
}

# Set headers for the POST request
headers = {
    'Content-Type': 'application/json'
}

# Make the API request
response = requests.post(url, headers=headers, data=json.dumps(payload))

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Display distances and times
    distances = data.get('distance', [])
    times = data.get('time', [])
    
    print("Distances (miles):", distances)
    print("Travel times (seconds):", times)
else:
    print(f"Error: {response.status_code}")
    print(response.text)


Distances (miles): [0, 1986.2944, 11.66]
Travel times (seconds): [0, 103833, 1478]


In [117]:
# using mapquest

import requests
def mapquest(origin, destination):
    api_key = df_api[df_api["api"]=="mapquest"]["api_key"][2]
    url = "http://www.mapquestapi.com/directions/v2/route"
    
    # Define coordinates
    params = {
        "key": api_key,
        "from": f"{origin["lat"]},{origin["lng"]}", 
        "to": f"{destination["lat"]}, {destination["lng"]}",    
        "outFormat": "json",
        "unit": "k"  # For kilometers
    }
       
    # Send request
    response = requests.get(url, params=params)
    data = response.json()
    
    # Extract distance
    distance = data['route']['distance']
return distance


Driving distance: 18.8 km


In [None]:
def row_to_distance(row):


    return distance

In [None]:

pprint(current["best"])

In [None]:
pprint(results[0]["formatted"])

In [None]:
destination = df["neighbourhood"][0]

origin = (df["latitude"][0], df["longitude"][0])
mode = "driving"

In [None]:
print(origin)

In [None]:
dist = get_distance(origin, destination, mode = mode)

<p>We have latitude and longitude data. Even its metric data we cant use it bluntly. we can create new features like "distance between subway and the unit" and "distance to nearest public transport" and "distance to city center", "distance to nearest airport"</p>

## Machine Learning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### Random Forest

In [None]:
rfs=RandomForestClassifier()

In [None]:
rff=rfs.fit(x_train,y_train)

In [None]:
y_predict=rff.predict(x_test)

In [None]:
y_predic=pd.DataFrame(y_predict)

In [None]:
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results in Csv
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_RF01.csv')

In [None]:
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

### Support Vector Machine

In [None]:
sv=svm()

In [None]:
svr=rfs.fit(x_train_f,y_train)

In [None]:
y_predict=svr.predict(x_test_f)

In [None]:
y_predict

In [None]:
# Saving the results in Csv
data={'Id':range(1461,2920),'SalePrice':y_predict}
pd.DataFrame(data).set_index('Id').to_csv(r'submission files/submission_RF02.csv')

In [None]:
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
a=best_score['SalePrice']-y_predict

In [None]:
#fig,ax1=plt.subplot(1,1)
sns.distplot(a,bins=30)


In [None]:
best_score['new']=y_predict

In [None]:
sns.heatmap(best_score[['SalePrice','new']].corr(),annot=True)

### K-Nearest Neighbours

In [None]:
KNN=KNeighborsClassifier()

In [None]:
knn=KNN.fit(x_train,y_train)

In [None]:
y_predict=knn.predict(x_test)

In [None]:
y_predic=pd.DataFrame(y_predict)

In [None]:
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results in Csv
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_KNN01.csv')

In [None]:
y_predict

In [None]:
# Saving the results in Csv
data={'Id':range(1461,2920),'SalePrice':y_predict}
pd.DataFrame(data).set_index('Id').to_csv(r'submission files/submission_RF02.csv')

In [None]:
best_score=pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
a=best_score['SalePrice']-y_predict

In [None]:
#fig,ax1=plt.subplot(1,1)
sns.distplot(a,bins=30)


In [None]:
best_score['new']=y_predict

In [None]:
sns.heatmap(best_score[['SalePrice','new']].corr(),annot=True)

### XGBoostClassifier

In [None]:
xg=XGBClassifier()

In [None]:
xgf=xg.fit(x_train,y_train)

In [None]:
y_predict=xgf.predict(x_test)

In [None]:
y_predic=pd.DataFrame(y_predict)

In [None]:
reverse_map={0:dict(zip(list(range(1,len(cate)+1)),cate))}
y_predic.replace(reverse_map,inplace=True)

In [None]:
# Saving the results in Csv
data={'id':test.id,'country':y_predic[0]}
pd.DataFrame(data).set_index('id').to_csv(r'submission files/submission_XG01.csv')