# Imports

In [2]:
# Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Removing cached imports

import src.preprocessing
from importlib import reload

# Re-import the editable project package so edits made on disk are picked up
# without restarting the kernel.
# NOTE(review): this reloads the `src.preprocessing` package object only; whether
# its submodules (e.g. `missing_values`, imported further down) are refreshed too
# is not visible from here — confirm, or consider `%autoreload 2`.
reload(src.preprocessing)

<module 'src.preprocessing' from '/home/nandhu/Documents/kaggle-competitions/src/preprocessing/__init__.py'>

In [5]:
# Loading dataset
# Reads ./dataset/AB_NYC_2019.csv (path relative to the notebook's working
# directory) into `df`, the frame every later cell works on.
df = pd.read_csv("./dataset/AB_NYC_2019.csv")

# Dataset Properties

In [6]:
# Imports

from src.analyse_src import data_inspection
# Project-local inspection helpers; each one is used through its `inspect(df)`
# method in the following cells.
dt_inspection = data_inspection.DataTypeInspectionStrategy()
ss_inspection = data_inspection.SummaryStatisticsInspectionStrategy()

In [7]:
# Inspecting Datatypes of features
# The captured output below lists, per column, the non-null count and dtype.

dt_inspection.inspect(df)


DataTypes and Non-Null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  

In [8]:
# Summary Statistics of the dataset
# The captured output below shows count/mean/std/min/quartiles/max for the
# numerical columns.

ss_inspection.inspect(df)

Descriptive Statistics of Numerical Datatypes:
                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.743213e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    48895.000000       48895.000000       38843.000000   
mean         7.029962          23.274466           1.373221   
std         20.510550          44.550582           1

# Preprocessing

## Missing Values

In [9]:
# Null count per column, keeping only the columns that actually contain nulls

missing = df.isna().sum().loc[lambda counts: counts > 0]
print(missing)

name                    16
host_name               21
last_review          10052
reviews_per_month    10052
dtype: int64


### "name" & "host_name" Columns

In [11]:
# Subset of rows whose "name" value is null

df_name_missed = df.loc[df["name"].isna()]

In [12]:
# Sanity check: no listing id may appear more than once
if df["id"].is_unique:
    print("every id is unique")

every id is unique


 <p>The `name` and `host_name` columns are nominal values. I don't want to consider these columns for our model.
 `name` might be useful for SEO, but I don't think it makes any difference to prices, so I am dropping both columns.</p>

In [13]:
# Drop the two free-text columns; neither is used as a model feature
df = df.drop(columns=["name", "host_name"])

### "last_review" & "reviews_per_month" Columns

In [14]:
# Filtering dataset where above mentioned column values are missing

df_last_review_missed = df[df["last_review"].isnull()]
df_review_per_month_missed = df[df["reviews_per_month"].isnull()]

# Filling both columns with zero is only justified when they are missing on
# exactly the same rows, so verify that premise instead of assuming it.
assert df_last_review_missed.index.equals(df_review_per_month_missed.index), (
    "last_review and reviews_per_month are not missing on the same rows"
)

<p>If the rows missing `last_review` and the rows missing `reviews_per_month` are the same, I assume those listings have no reviews so far, so I am replacing the nulls with zero.



</p>

In [17]:
# Importing Missing Value strategies

from src.preprocessing import missing_values
# Project-local strategy; used below through `handle(df=..., column=..., const=...)`.
missing_const = missing_values.FillConstMissingValuesStrategy()

In [18]:
# Replace the nulls in both review columns with the constant 0.
# NOTE(review): `last_review` is presumably a date-like column (its dtype is not
# shown in the visible output); filling it with the integer 0 would mix types —
# confirm this is intended.
df = missing_const.handle(df=df, column="last_review", const=0)
df = missing_const.handle(df=df, column="reviews_per_month", const=0)

In [19]:
# Re-check the per-column null counts after the constant fill
df.isnull().sum()

id                                0
host_id                           0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Removing Duplicate values

In [20]:
# Finding duplicate data
# `duplicated()` yields one boolean per row; summing the flags gives the number
# of flagged rows.

duplicates = df.duplicated().sum()
print(duplicates)

0


In [21]:
# Report what the duplicate count actually says instead of printing a fixed
# message that would be wrong as soon as the data changes.
if duplicates == 0:
    print("NO duplicates found!")
else:
    print(f"{duplicates} duplicate rows found!")

NO duplicates found!


## Data Transformation

In [None]:
# Filtering categorical variables
# Collects the names of all columns with dtype `object` or `category`; later
# cells index into this list by position (categories[0], categories[2]).

categories = df.select_dtypes(include=["object", "category"]).columns
print(categories)

In [None]:
# Report the number of distinct values per categorical column; the values
# themselves are listed only for columns with fewer than 10 of them.
for category in categories:
    unique = df[category].unique()
    length = len(unique)
    print(f"Unique categories in {category} has {len(unique)} unique values")
    if length >= 10:
        continue  # too many values to list usefully
    print(unique)
    

In [None]:
def _neighbourhoods_in(group):
    """Return the distinct `neighbourhood` values of rows whose `neighbourhood_group` equals `group`."""
    in_group = df["neighbourhood_group"] == group
    return df.loc[in_group, "neighbourhood"].unique().tolist()


# One list of neighbourhoods per group, plus the array of group names itself
brooklyn_neighbourhoods = _neighbourhoods_in("Brooklyn")
manhattan_neighbourhoods = _neighbourhoods_in("Manhattan")
queens_neighbourhoods = _neighbourhoods_in("Queens")
staten_neighbourhoods = _neighbourhoods_in("Staten Island")
bronx_neighbourhoods = _neighbourhoods_in("Bronx")
neighbourhoods = df["neighbourhood_group"].unique()

<p>We have very few features, so I am going to use one-hot encoding for `neighbourhood_group` and `room_type`.</p>

### Encoding

In [None]:
# Importing Onehot encoding

# The project package is `src.preprocessing` (the spelling used by the other
# imports in this notebook); `src.preproccessing` was a typo.
from src.preprocessing import encoding
onehot_encoder = encoding.OneHotEncoding()

In [None]:
# One-hot encode the first categorical column (expected: neighbourhood_group),
# then swap the raw column for its encoded columns.
print(categories[0])
encoded_df = onehot_encoder.encode(df=df, column=categories[0])
df.drop(columns=[categories[0]], inplace=True)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# Show the column index after the neighbourhood_group encoding step
df.columns

In [None]:
# room_type onehot encoding

print(categories[2])
encoded_df = onehot_encoder.encode(df=df, column=categories[2])
# `DataFrame.drop` returns a new frame unless `inplace=True`; the result has to
# be assigned back, otherwise the raw column stays next to its one-hot columns.
df = df.drop(labels=[categories[2]], axis=1)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# Show the column index after the room_type encoding step
df.columns

### Feature Engineering

<p>`neighbourhood` has 221 unique values, each of which falls under one of the `neighbourhood_group` values. The distance from the unit to its neighbourhood is going to be a new feature, computed with the Google Maps Distance Matrix API.</p>

In [None]:
# Importing Geocoding api keys
# NOTE(review): this CSV holds API keys (the commented-out cells below read its
# `api` / `api_key` columns) — make sure it is kept out of version control.

df_api = pd.read_csv("geocoding_api.csv")

In [None]:
# # getting longitude and latitude for neighbourhoods using openweathermap api
# import requests

# def get_lat_long(city):
#     api_key = df_api[df_api["api"]=="openweather_map"]["api_key"][0]
#     limit = 1
#     url = f"http://api.openweathermap.org/geo/1.0/direct?q={city}, NY,US&limit={limit}&appid={api_key}"
#     response = requests.get(url)
#     if response.status_code ==200:
#         data =  response.json()
#         print(f"Geographical data for {city}:")
#         for entry in data:
#             print(f"Name: {entry['name']}, Latitude: {entry['lat']}, Longitude: {entry['lon']}")
#             return [city,(entry["lat"], entry["lon"])]
#     else:
#         print(f"Error: Unable to fetch data (status code:{response.status_code})")
    

In [None]:
# #Getting geocodes for all neighbourhood

# unique_neighbourhood = df["neighbourhood"].unique()
# geo_code = {}
# count = 1
# for city in unique_neighbourhood:
    
#     print(f"getting city {count}: {city}")
#     result = get_lat_long(city)
#     if result != None:
#         geo_code[city] = result[1]
#     else:
#         geo_code[city] = None
#     count +=1

# # storing collected data into csv file
# df_geocode = pd.DataFrame(list(geo_code.items()), columns=['neighbourhood', 'geocode'])
# df_geocode.to_csv('geo_codes.csv', index=False)

##### Opencage forward geocoding api

In [None]:
# # getting geocodes for remaining neighbourhoods using opencagedata api
# from opencage.geocoder import OpenCageGeocode

# from pprint import pprint
# api_key = df_api[df_api["api"]=="opencagedata"]["api_key"][1]
# geocoder = OpenCageGeocode(api_key)

# def opencage(city, county):
#     query = f"{city}, {county}, New York, United states of America"
#     results = geocoder.geocode(query, countrycode="us",limit =4)
#     current = {}
#     count = 1
#     for result in results:
#         if count ==1:
#             current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         else:
#             if current["best"][2] < result["confidence"]:
#                 current["best"] = [result["formatted"], result["geometry"], result["confidence"]]
#         count+=1

#     return current["best"]

In [None]:
# geocode = {}
# for neighbourhood in neighbourhoods:
#     if neighbourhood =="Brooklyn":
#         current_group = brooklyn_neighbourhoods
#     elif neighbourhood =="Manhattan":
#         current_group = manhattan_neighbourhoods
#     elif neighbourhood =="Queens":
#         current_group = queens_neighbourhoods
#     elif neighbourhood =="Staten Island":
#         current_group = staten_neighbourhoods
#     else:
#         current_group = bronx_neighbourhoods
   
#     for city in current_group:
#         print(f"current city: {city}")
#         result = opencage(city, neighbourhood)
#         geocode[city] = result[1]
#         print(f"geocoded: {result[1]}")


In [None]:
# df_opencage_geocoded = pd.DataFrame(list(geocode.items()), columns=["neighbourhood","geometry"])
# df_opencage_geocoded.to_csv("neighbourhood_opencage_geometry.csv", index = False)

##### Feature Engineering - Feature `distance between unit and neighbourhood`

In [None]:

# NOTE(review): `pprint` and `current` only appear inside commented-out cells
# above, so in the visible notebook this cell raises NameError on a fresh kernel.
pprint(current["best"])

In [None]:
# NOTE(review): `results` (and `pprint`) only appear inside commented-out cells
# above, so in the visible notebook this cell raises NameError on a fresh kernel.
pprint(results[0]["formatted"])

In [None]:
# Sample inputs for a distance lookup, taken from the row labelled 0:
# its neighbourhood name as destination, its (latitude, longitude) as origin.
destination = df.loc[0, "neighbourhood"]

origin = (df.loc[0, "latitude"], df.loc[0, "longitude"])
mode = "driving"

In [None]:
# Show the (latitude, longitude) tuple built in the previous cell
print(origin)

In [None]:
# NOTE(review): `get_distance` is not defined or imported in any visible cell —
# confirm where it is meant to come from before running this.
dist = get_distance(origin, destination, mode = mode)

<p>We have latitude and longitude data. Even though it is numeric data, we can't use it directly. Instead, we can create new features such as "distance between subway and the unit", "distance to nearest public transport", "distance to city center" and "distance to nearest airport".</p>

## Machine Learning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

### Random Forest

In [None]:
# Random forest estimator, constructed with no arguments
rfs = RandomForestClassifier()

In [None]:
# Fit the random forest.
# NOTE(review): x_train / y_train are not defined in any visible cell — confirm
# where they are supposed to come from.
rff = rfs.fit(x_train, y_train)

In [None]:
# Predict with the fitted random forest (x_test is not defined in any visible cell)
y_predict = rff.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame; its only column gets the default label 0
y_predic = pd.DataFrame(y_predict)

In [None]:
# Replace values in column 0 using a 1-based position -> `cate` element lookup.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Write the random-forest predictions to CSV, indexed by `id`
# (`test` is not defined in any visible cell).
data = {
    'id': test.id,
    'country': y_predic[0],
}
(
    pd.DataFrame(data)
    .set_index('id')
    .to_csv(r'submission files/submission_RF01.csv')
)

In [None]:
# Read the random-forest submission file back in
best_score = pd.read_csv(r'submission files/submission_RF01.csv')

### Support Vector Machine

In [None]:
# `svm` is the imported `sklearn.svm` module and cannot be called itself; an
# estimator class from it has to be instantiated. SVC is chosen to match the
# classifier estimators imported for the other model sections.
sv = svm.SVC()

In [None]:
# Fit the support-vector estimator `sv`; fitting `rfs` here would train the
# random forest again and `svr` would never be an SVM.
# NOTE(review): x_train_f / y_train are not defined in any visible cell.
svr = sv.fit(x_train_f, y_train)

In [None]:
# Predict with the model held in `svr` (x_test_f is not defined in any visible cell)
y_predict = svr.predict(x_test_f)

In [None]:
# Display the predictions from the previous cell
y_predict

In [None]:
# Write the predictions to CSV with ids 1461..2919 as the index.
# NOTE(review): the id range is hard-coded (1459 rows) and the file is named
# submission_RF02 although this is the SVM section — confirm both.
data = {
    'Id': range(1461, 2920),
    'SalePrice': y_predict,
}
(
    pd.DataFrame(data)
    .set_index('Id')
    .to_csv(r'submission files/submission_RF02.csv')
)

In [None]:
# Load the earlier submission file to compare against
best_score = pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the current ones.
# NOTE(review): submission_RF01.csv is written above with `id`/`country` columns,
# so a `SalePrice` column may not exist in it — confirm.
a = best_score['SalePrice'] - y_predict

In [None]:
#fig,ax1=plt.subplot(1,1)
# Distribution of the differences in `a`. `sns.distplot` is deprecated and has
# been removed from recent seaborn releases; `histplot` with a KDE overlay on a
# density scale is the replacement.
sns.histplot(a, bins=30, kde=True, stat="density")


In [None]:
# Store the current predictions next to the loaded ones
best_score['new'] = y_predict

In [None]:
# Annotated correlation heatmap of the two prediction columns
sns.heatmap(
    best_score[['SalePrice', 'new']].corr(),
    annot=True,
)

### K-Nearest Neighbours

In [None]:
# K-nearest-neighbours estimator, constructed with no arguments
KNN = KNeighborsClassifier()

In [None]:
# Fit the KNN estimator (x_train / y_train are not defined in any visible cell)
knn = KNN.fit(x_train, y_train)

In [None]:
# Predict with the fitted KNN model (x_test is not defined in any visible cell)
y_predict = knn.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame; its only column gets the default label 0
y_predic = pd.DataFrame(y_predict)

In [None]:
# Replace values in column 0 using a 1-based position -> `cate` element lookup.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Write the KNN predictions to CSV, indexed by `id`
# (`test` is not defined in any visible cell).
data = {
    'id': test.id,
    'country': y_predic[0],
}
(
    pd.DataFrame(data)
    .set_index('id')
    .to_csv(r'submission files/submission_KNN01.csv')
)

In [None]:
# Display the predictions from the KNN model
y_predict

In [None]:
# Write the predictions to CSV with ids 1461..2919 as the index.
# NOTE(review): the id range is hard-coded (1459 rows) and the target file is
# submission_RF02.csv, the same file the SVM section writes — confirm both.
data = {
    'Id': range(1461, 2920),
    'SalePrice': y_predict,
}
(
    pd.DataFrame(data)
    .set_index('Id')
    .to_csv(r'submission files/submission_RF02.csv')
)

In [None]:
# Load the earlier submission file to compare against
best_score = pd.read_csv(r'submission files/submission_RF01.csv')

In [None]:
# Element-wise difference between the stored predictions and the current ones.
# NOTE(review): submission_RF01.csv is written above with `id`/`country` columns,
# so a `SalePrice` column may not exist in it — confirm.
a = best_score['SalePrice'] - y_predict

In [None]:
#fig,ax1=plt.subplot(1,1)
# Distribution of the differences in `a`. `sns.distplot` is deprecated and has
# been removed from recent seaborn releases; `histplot` with a KDE overlay on a
# density scale is the replacement.
sns.histplot(a, bins=30, kde=True, stat="density")


In [None]:
# Store the current predictions next to the loaded ones
best_score['new'] = y_predict

In [None]:
# Annotated correlation heatmap of the two prediction columns
sns.heatmap(
    best_score[['SalePrice', 'new']].corr(),
    annot=True,
)

### XGBoostClassifier

In [None]:
# XGBoost classifier, constructed with no arguments
xg = XGBClassifier()

In [None]:
# Fit the XGBoost classifier (x_train / y_train are not defined in any visible cell)
xgf = xg.fit(x_train, y_train)

In [None]:
# Predict with the fitted XGBoost model (x_test is not defined in any visible cell)
y_predict = xgf.predict(x_test)

In [None]:
# Wrap the predictions in a DataFrame; its only column gets the default label 0
y_predic = pd.DataFrame(y_predict)

In [None]:
# Replace values in column 0 using a 1-based position -> `cate` element lookup.
# NOTE(review): `cate` is not defined in any visible cell.
reverse_map = {0: dict(enumerate(cate, start=1))}
y_predic.replace(reverse_map, inplace=True)

In [None]:
# Write the XGBoost predictions to CSV, indexed by `id`
# (`test` is not defined in any visible cell).
data = {
    'id': test.id,
    'country': y_predic[0],
}
(
    pd.DataFrame(data)
    .set_index('id')
    .to_csv(r'submission files/submission_XG01.csv')
)