In [83]:
import numpy as np
import pandas as pd

## Helper functions for data extraction

In [84]:
from geopy.distance import geodesic

def distance_from_center(latitude, longitude, center_coordinates):
    """
    Geodesic distance from a listing to a reference point.

    Parameters:
        latitude (float): Latitude of the listing.
        longitude (float): Longitude of the listing.
        center_coordinates (tuple): (latitude, longitude) of the reference point.

    Returns:
        float: The ``kilometers`` value of geopy's ``geodesic`` distance
        between the two points.
    """
    # geodesic() takes two (latitude, longitude) pairs.
    separation = geodesic((latitude, longitude), center_coordinates)
    return separation.kilometers

def neighborhood_density(neighbourhood, listings_per_neighbourhood):
    """
    Look up how many listings are recorded for a neighbourhood.

    Despite the name, no normalisation (for example by area) is performed:
    the value stored in the mapping is returned as-is.

    Parameters:
        neighbourhood (str): Name of the neighbourhood.
        listings_per_neighbourhood (dict): Mapping from neighbourhood name to
            its number of listings.

    Returns:
        The value stored for ``neighbourhood``, or 0 when the name is not a
        key of the mapping.
    """
    listing_count = listings_per_neighbourhood.get(neighbourhood, 0)
    return listing_count

def room_type_encoded(room_type):
    """
    Map a room type label to its integer code.

    Codes: 'Shared room' -> 0, 'Private room' -> 1, 'Entire home/apt' -> 2.

    Parameters:
        room_type (str): Room type label.

    Returns:
        int: The code for the label, or -1 when the label is not one of the
        three known room types.
    """
    # The position in this tuple is the code assigned to each label.
    ordered_labels = ('Shared room', 'Private room', 'Entire home/apt')
    codes = {label: code for code, label in enumerate(ordered_labels)}
    return codes.get(room_type, -1)

def neighbourhood_encoded(neighbourhood):
    """
    Map one of five area names to its integer code.

    Codes: 'Staten Island' -> 0, 'Bronx' -> 1, 'Queens' -> 2,
    'Brooklyn' -> 3, 'Manhattan' -> 4.

    Parameters:
        neighbourhood (str): Area name to encode.

    Returns:
        int: The code for the name, or -1 when the name is not one of the
        five listed above.
    """
    # The position in this tuple is the code assigned to each name.
    ordered_names = ('Staten Island', 'Bronx', 'Queens', 'Brooklyn', 'Manhattan')
    codes = dict(zip(ordered_names, range(len(ordered_names))))
    return codes.get(neighbourhood, -1)

def neighbourhood_popularity_score(neighbourhood_reviews):
    """
    Popularity score of a neighbourhood, taken directly from its review total.

    Parameters:
        neighbourhood_reviews (int): Total number of reviews in the neighbourhood.

    Returns:
        The argument unchanged; no scaling or normalisation is applied.
    """
    # More reviews is treated as more popular, so the total itself is the score.
    popularity = neighbourhood_reviews
    return popularity

def booking_density(availability_365, minimum_nights):
    """
    Ratio of available days to the minimum stay length of a listing.

    Parameters:
        availability_365 (int): Number of days the listing is available in a year.
        minimum_nights (int): Minimum number of nights required for booking.

    Returns:
        float: ``availability_365 / minimum_nights``.

    Note:
        ``minimum_nights`` is not checked for zero before dividing.
    """
    ratio = availability_365 / minimum_nights
    return ratio

def availability_ratio(availability_365):
    """
    Fraction of a 365-day year during which a listing is available.

    Parameters:
        availability_365 (int): Number of days the listing is available in a year.

    Returns:
        float: ``availability_365 / 365``.
    """
    days_per_year = 365
    fraction = availability_365 / days_per_year
    return fraction

## Data extraction using pandas

In [85]:
# Load the raw listings; the path is resolved relative to the working directory.
airbnb = pd.read_csv('AB_NYC_2019.csv')

In [86]:
# Display the raw listings frame exactly as loaded from the CSV.
airbnb

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [87]:
# Inspect the dtype pandas inferred for each column on load.
airbnb.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

## Data cleaning

- dropping NaN values
- deleting duplicates

In [88]:
# Start from a copy so the raw `airbnb` frame is left untouched, then remove
# exact duplicate rows and every row that has a missing value in any column.
cleaned_airbnb = airbnb.copy().drop_duplicates().dropna()

# last_review has object dtype after loading; convert it to datetime values.
cleaned_airbnb['last_review'] = pd.to_datetime(cleaned_airbnb['last_review'])
cleaned_airbnb

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48782,36425863,Lovely Privet Bedroom with Privet Restroom,83554966,Rusaa,Manhattan,Upper East Side,40.78099,-73.95366,Private room,129,1,1,2019-07-07,1.00,1,147
48790,36427429,No.2 with queen size bed,257683179,H Ai,Queens,Flushing,40.75104,-73.81459,Private room,45,1,1,2019-07-07,1.00,6,339
48799,36438336,Seas The Moment,211644523,Ben,Staten Island,Great Kills,40.54179,-74.14275,Private room,235,1,1,2019-07-07,1.00,1,87
48805,36442252,1B-1B apartment near by Metro,273841667,Blaine,Bronx,Mott Haven,40.80787,-73.92400,Entire home/apt,100,1,2,2019-07-07,2.00,1,40


## Feature engineering

In [89]:
# Reference point for the distance feature, as (latitude, longitude).
nyc_center_coordinates = (40.7128, -74.0060)  # Latitude and longitude of New York City center

# distance_from_center handles one coordinate pair per call, so walk the two
# columns in lockstep instead of building a Series per row with apply(axis=1).
cleaned_airbnb['distance_from_center'] = [
    distance_from_center(lat, lon, nyc_center_coordinates)
    for lat, lon in zip(cleaned_airbnb['latitude'], cleaned_airbnb['longitude'])
]

# Categorical columns -> integer codes; labels missing from a helper's mapping become -1.
cleaned_airbnb['room_type_encoded'] = cleaned_airbnb['room_type'].map(room_type_encoded)

cleaned_airbnb['neighbourhood_group_encoded'] = cleaned_airbnb['neighbourhood_group'].map(neighbourhood_encoded)

# Both ratio helpers are plain arithmetic on their arguments, so they are
# called once on whole columns rather than once per element.
cleaned_airbnb['booking_density'] = booking_density(cleaned_airbnb['availability_365'], cleaned_airbnb['minimum_nights'])

cleaned_airbnb['availability_ratio'] = availability_ratio(cleaned_airbnb['availability_365'])


In [90]:
# Summarise the frame after feature engineering: index range, columns, non-null counts and dtypes.
cleaned_airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38821 entries, 0 to 48852
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              38821 non-null  int64         
 1   name                            38821 non-null  object        
 2   host_id                         38821 non-null  int64         
 3   host_name                       38821 non-null  object        
 4   neighbourhood_group             38821 non-null  object        
 5   neighbourhood                   38821 non-null  object        
 6   latitude                        38821 non-null  float64       
 7   longitude                       38821 non-null  float64       
 8   room_type                       38821 non-null  object        
 9   price                           38821 non-null  int64         
 10  minimum_nights                  38821 non-null  int64         
 11  number_

In [91]:
# Remove the identifier, name and review-date columns.
# Re-running this cell once the columns are gone raises KeyError.
cleaned_airbnb = cleaned_airbnb.drop(columns=['id', 'host_id', 'name', 'host_name', 'last_review'])

In [92]:
# Re-check the schema now that the identifier, name and review-date columns are dropped.
cleaned_airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38821 entries, 0 to 48852
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             38821 non-null  object 
 1   neighbourhood                   38821 non-null  object 
 2   latitude                        38821 non-null  float64
 3   longitude                       38821 non-null  float64
 4   room_type                       38821 non-null  object 
 5   price                           38821 non-null  int64  
 6   minimum_nights                  38821 non-null  int64  
 7   number_of_reviews               38821 non-null  int64  
 8   reviews_per_month               38821 non-null  float64
 9   calculated_host_listings_count  38821 non-null  int64  
 10  availability_365                38821 non-null  int64  
 11  distance_from_center            38821 non-null  float64
 12  room_type_encoded               38821

In [93]:
# Summary statistics (count, mean, std, min, quartiles, max) for the price column only.
cleaned_airbnb["price"].describe()

count    38821.000000
mean       142.332526
std        196.994756
min          0.000000
25%         69.000000
50%        101.000000
75%        170.000000
max      10000.000000
Name: price, dtype: float64

In [94]:
# Price outlier filter: keep rows whose price lies within 1.5 * IQR of the quartiles.
Q1, Q3 = (cleaned_airbnb['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# between() is inclusive on both ends by default: lower_bound <= price <= upper_bound.
cleaner_airbnb = cleaned_airbnb[cleaned_airbnb['price'].between(lower_bound, upper_bound)]

In [95]:
# Display the rows that remain after the price outlier filter.
cleaner_airbnb

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,distance_from_center,room_type_encoded,neighbourhood_group_encoded,booking_density,availability_ratio
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365,7.789889,1,3,365.0,1.000000
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355,4.906576,2,4,355.0,0.972603
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194,4.970672,2,3,194.0,0.531507
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.10,1,0,10.863385,2,4,0.0,0.000000
5,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,0.59,1,129,4.674671,2,4,43.0,0.353425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48782,Manhattan,Upper East Side,40.78099,-73.95366,Private room,129,1,1,1.00,1,147,8.768238,1,4,147.0,0.402740
48790,Queens,Flushing,40.75104,-73.81459,Private room,45,1,1,1.00,6,339,16.717709,1,2,339.0,0.928767
48799,Staten Island,Great Kills,40.54179,-74.14275,Private room,235,1,1,1.00,1,87,22.237122,1,0,87.0,0.238356
48805,Bronx,Mott Haven,40.80787,-73.92400,Entire home/apt,100,1,2,2.00,1,40,12.625442,2,1,40.0,0.109589


In [96]:
# Persist the filtered, feature-engineered frame. No `index` argument is passed, so
# pandas' default applies and the row labels are written as an unnamed first column.
# NOTE(review): confirm downstream readers expect that column (e.g. read with index_col=0).
cleaner_airbnb.to_csv('newairbnb.csv')