In [1]:
import pandas as pd
import numpy as np
from math import *

### Read in the data

In [2]:
df = pd.read_csv('new_york_hotels.csv', encoding='cp1252')

In [3]:
df.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [4]:
type(df["name"])

pandas.core.series.Series

In [5]:
df.loc[4:10,["name", "city"]]

Unnamed: 0,name,city
4,CrestHill Suites SUNY University Albany,Albany
5,The Desmond Hotel Albany,Albany
6,Ramada Plaza Albany,Albany
7,Hampton Inn & Suites Albany-Downtown,Albany
8,Albany Marriott,Albany
9,Best Western Sovereign Hotel - Albany,Albany
10,Express Inn & Suites,Albany


In [6]:
df.loc[df["star_rating"] >= 4, :]

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
114,482259,Topping Rose House,One Bridgehampton - Sag Harbor Turnpike,Bridgehampton,NY,11932,40.93780,-72.30069,4.5,995.3900,295.3900
129,339951,Sheraton Brooklyn New York Hotel,228 Duffield Street,Brooklyn,NY,11201,40.69160,-73.98436,4.0,409.1485,259.0916
134,406834,McCarren Hotel & Pool,160 N 12th St,Brooklyn,NY,11249,40.72100,-73.95543,4.0,575.4100,330.4100
142,403899,The Box House Hotel,77 Box Street,Brooklyn,NY,11222,40.73749,-73.95329,4.0,379.4200,249.4200
154,136344,New York Marriott at the Brooklyn Bridge,333 Adams St,Brooklyn,NY,11201,40.69365,-73.98870,4.0,489.0700,212.0400
...,...,...,...,...,...,...,...,...,...,...,...
1592,333528,"Viana Hotel & Spa, BW Premier Collection",3998 Brush Hollow Rd,Westbury,NY,11590,40.77629,-73.55967,4.0,509.9900,133.9500
1600,324657,"The Ritz-Carlton New York, Westchester",3 Renaissance Square,White Plains,NY,10601,41.03257,-73.76664,5.0,479.4042,349.2972
1601,514006,Furnished Quarters Bank Street Commons,15/25 Bank Street,White Plains,NY,10606,41.03050,-73.77419,4.0,180.0000,179.0000
1602,566642,Global Luxury Apartments in White Plains,15 Bank Street,White Plains,NY,10606,41.03150,-73.77401,4.0,175.2200,125.1500


In [7]:
df.to_csv()

',ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate\n0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216\n1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0\n2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16\n3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597\n4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39\n5,125200,The Desmond Hotel Albany,660 Albany Shaker Rd,Albany,NY,12211,42.72874,-73.79807,3.5,189.0266,153.0644\n6,109728,Ramada Plaza Albany,3 Watervliet Avenue Ext,Albany,NY,12206,42.68031,-73.78444,3.0,158.6321,89.036\n7,235037,Hampton Inn & Suites Albany-Downtown,25 Chapel St,Albany,NY,12210,

In [8]:
df.index

RangeIndex(start=0, stop=1631, step=1)

In [9]:
df.columns

Index(['ean_hotel_id', 'name', 'address1', 'city', 'state_province',
       'postal_code', 'latitude', 'longitude', 'star_rating', 'high_rate',
       'low_rate'],
      dtype='object')

## Benchmarking example

#### Define the normalization function

In [10]:
def normalize(df, pd_series):
    pd_series = pd_series.astype(float)

    # Find upper and lower bound for outliers
    avg = np.mean(pd_series)
    sd  = np.std(pd_series)
    lower_bound = avg - 2*sd
    upper_bound = avg + 2*sd

    # Collapse in the outliers
    df.loc[pd_series < lower_bound , "cutoff_rate" ] = lower_bound
    df.loc[pd_series > upper_bound , "cutoff_rate" ] = upper_bound

    # Finally, take the log
    normalized_price = np.log(df["cutoff_rate"].astype(float))
    
    return normalized_price

In [11]:
normalize(df, df['high_rate'])

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
1626   NaN
1627   NaN
1628   NaN
1629   NaN
1630   NaN
Name: cutoff_rate, Length: 1631, dtype: float64

In [12]:
# ~ reverses the bool variable
df.loc[~df["cutoff_rate"].isna()]

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate,cutoff_rate
837,198937,The Kimberly Hotel & Suites,145 E 50th St,New York,NY,10022,40.75614,-73.97151,4.0,1650.2533,188.286,1281.343206
901,383834,Dream Downtown,355 W 16th St,New York,NY,10011,40.74195,-74.00369,4.5,5000.3501,299.36,1281.343206
905,325735,Trump Soho New York,246 Spring Street,New York,NY,10013,40.72563,-74.00558,5.0,1905.3199,345.32,1281.343206
939,374895,Hotel 48LEX New York,517 Lexington Avenue,New York,NY,10017,40.75525,-73.9732,4.5,1539.38,339.06,1281.343206
972,106394,JW Marriott Essex House New York,160 Central Park S,New York,NY,10019,40.76652,-73.97825,4.5,1499.02,314.02,1281.343206
973,117363,The London NYC,151 W 54th St,New York,NY,10019,40.76328,-73.98065,4.5,2579.1799,299.06,1281.343206
1012,129023,"The Carlyle, A Rosewood Hotel",35 E 76th St,New York,NY,10021,40.77461,-73.96338,5.0,4100.04,730.03,1281.343206
1023,219368,The Surrey,20 E 76th St,New York,NY,10021,40.77449,-73.96387,5.0,1945.49,595.0,1281.343206
1032,118630,The Lowell,28 East 63rd Street,New York,NY,10065,40.76587,-73.9691,5.0,2085.46,795.46,1281.343206
1045,141417,The Peninsula New York,700 Fifth Avenue at 55th street,New York,NY,10019,40.76165,-73.9749,5.0,3145.22,795.21,1281.343206


#### Timing the normalization function

In [13]:
%timeit df['high_rate_normalized'] = normalize(df, df['high_rate'])

1.68 ms ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### Profiling the normalization function

In [14]:
%load_ext line_profiler

In [15]:
%lprun -f normalize df['high_rate_normalized'] =\
        normalize(df, df['high_rate'])

## Haversine definition

In [16]:
def haversine(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

## Iterrows Haversine

In [17]:
%%timeit
# Haversine applied on rows via iteration
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

235 ms ± 7.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Apply Haversine on rows

### Timing "apply"

In [18]:
%timeit df['distance'] =\
df.apply(lambda row: haversine(40.671, -73.985,\
                               row['latitude'], row['longitude']), axis=1)

81.5 ms ± 2.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Profiling "apply"

In [19]:
# Haversine applied on rows
%lprun -f haversine \
df.apply(lambda row: haversine(40.671, -73.985,\
                               row['latitude'], row['longitude']), axis=1)

## Vectorized implementation of Haversine applied on Pandas series

#### Timing vectorized implementation

In [20]:
# Vectorized implementation of Haversine applied on Pandas series
%timeit df['distance'] = haversine(40.671, -73.985,\
                                   df['latitude'], df['longitude'])

2.25 ms ± 90.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Profiling vectorized implementation

In [21]:
# Vectorized implementation profile
%lprun -f haversine haversine(40.671, -73.985,\
                              df['latitude'], df['longitude'])

## Vectorized implementation of Haversine applied on NumPy arrays

#### Timing vectorized implementation

In [22]:
# Vectorized implementation of Haversine applied on NumPy arrays
%timeit df['distance'] = haversine(40.671, -73.985,\
                         df['latitude'].values, df['longitude'].values)

303 µs ± 7.56 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%%timeit
# Convert pandas arrays to NumPy ndarrays
np_lat = df['latitude'].values
np_lon = df['longitude'].values

9.58 µs ± 167 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


#### Profiling vectorized implementation

In [24]:
%lprun -f haversine df['distance'] = haversine(40.671, -73.985,\
                        df['latitude'].values, df['longitude'].values)

## Cythonize that loop

#### Load the cython extension

In [25]:
%load_ext cython

#### Run unaltered Haversine through Cython

In [26]:
%%cython -a

# Haversine cythonized (no other edits)
import numpy as np
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    miles_constant = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

#### Time it

In [27]:
%timeit df['distance'] =\
       df.apply(lambda row: haversine_cy(40.671, -73.985,\
                row['latitude'], row['longitude']), axis=1)

79.7 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Redefine Haversine with data types and C libraries

In [28]:
%%cython -a
# Haversine cythonized
from libc.math cimport sin, cos, acos, asin, sqrt

cdef deg2rad_cy(float deg):
    cdef float rad
    rad = 0.01745329252*deg
    return rad
    
cpdef haversine_cy_dtyped(float lat1, float lon1, float lat2, float lon2):
    cdef: 
        float dlon
        float dlat
        float a
        float c
        float mi
    
    lat1, lon1, lat2, lon2 = map(deg2rad_cy, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    mi = 3959 * c
    return mi


#### Time it

In [29]:
%timeit df['distance'] =\
df.apply(lambda row: haversine_cy_dtyped(40.671, -73.985,\
                              row['latitude'], row['longitude']), axis=1)

35.3 ms ± 622 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
