In [1]:
import re
import pandas as pd
import numpy as np
import random
import geopandas as gpd
import openpyxl
import os
random.seed(10)
pd.options.display.float_format = '{:,.2f}'.format



In [2]:
# Read in taxi_zones lookup table
zones = pd.read_csv("../data/raw/external_data_and_taxi_zones/taxi_zone_lookup.csv")
zones = zones.drop([263, 264]) # drop the unknown zones

# Preprocess property sales data

In [3]:
relative_directory = '../data/raw/external_data_and_taxi_zones/'

all_files = os.listdir("../data/raw/external_data_and_taxi_zones/")    
property_sale_files = list(filter(lambda x: x.endswith('.xlsx'), all_files))

header = ['BOROUGH', 'NEIGHBORHOOD','BUILDING CLASS CATEGORY', 
          'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 
          'EASE-MENT', 'BUILDING CLASS AT PRESENT', 'ADDRESS',
          'APARTMENT NUMBER', 'ZIP CODE', 'RESIDENTIAL UNITS',
          'COMMERCIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET',
          'GROSS SQUARE FEET', 'YEAR BUILT', 'TAX CLASS AT TIME OF SALE',
          'BUILDING CLASS AT TIME OF SALE', 'SALE PRICE', 'SALE DATE']


file_names = [relative_directory + file for file in property_sale_files]


def read_xlxs_for_mapping(data):
    return pd.read_excel(data, 
                        names = header, 
                        parse_dates = ['SALE DATE', ],
                        engine = 'openpyxl')

property_sales = pd.concat(map(read_xlxs_for_mapping, file_names))

## Clean the data

In [4]:
def remove_outliers(data, columns):
    '''
    remove outliers from data that is 1.5 iqr away from q1 or q3
    '''
    new_data = data.copy()
    q1 = np.array([np.quantile(data[column], 0.25) for column in columns])
    q3 = np.array([np.quantile(data[column], 0.75) for column in columns])
    iqr = q3 - q1
    for i in range(len(columns)):
        column = columns[i]
        new_data = new_data[(new_data[column] > q1[i] - 3 * iqr[i]) & (new_data[column] < q3[i] + 3 * iqr[i])]
    return new_data

In [5]:
# Check data types
property_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206017 entries, 0 to 21716
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   BOROUGH                         152654 non-null  object
 1   NEIGHBORHOOD                    152614 non-null  object
 2   BUILDING CLASS CATEGORY         152614 non-null  object
 3   TAX CLASS AT PRESENT            152383 non-null  object
 4   BLOCK                           152614 non-null  object
 5   LOT                             152614 non-null  object
 6   EASE-MENT                       10 non-null      object
 7   BUILDING CLASS AT PRESENT       152383 non-null  object
 8   ADDRESS                         152614 non-null  object
 9   APARTMENT NUMBER                33410 non-null   object
 10  ZIP CODE                        152589 non-null  object
 11  RESIDENTIAL UNITS               122566 non-null  object
 12  COMMERCIAL UNITS               

In [6]:
# Remove irrelevant rows
property_sales = property_sales[property_sales['BOROUGH'].isin(['1', '2', '3', '4', '5'])]

# Drop rows with NaN valus in required fields
property_sales.dropna(how='any', subset=['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS AT PRESENT',
                                         'TOTAL UNITS', 'GROSS SQUARE FEET', 'BUILDING CLASS AT TIME OF SALE', 
                                         'SALE PRICE', 'SALE DATE'], inplace=True)

# Change data type
property_sales['BOROUGH'] = property_sales['BOROUGH'].astype(int)
property_sales['SALE DATE'] = pd.to_datetime(property_sales['SALE DATE'])
property_sales['SALE PRICE'] = pd.to_numeric(property_sales['SALE PRICE'])
property_sales['GROSS SQUARE FEET'] = pd.to_numeric(property_sales['GROSS SQUARE FEET'])
property_sales['TOTAL UNITS'] = pd.to_numeric(property_sales['TOTAL UNITS'])


# According to property sales data, 1 for man, 2 for bronx, 3 for brooklyn, 4 for queens, 5 for staten island
borough_dict = {4: 'Queens', 2: 'Bronx', 1: 'Manhattan', 5: 'Staten Island', 3: 'Brooklyn'}
property_sales['BOROUGH'] = property_sales['BOROUGH'].apply(lambda x: borough_dict[x])




# Filter data that fits our analysis
# Type ABCD are family dwellings and apartments, H is hotel
condition = (property_sales['SALE DATE'] > '01-01-2019') & (property_sales['SALE DATE'] <= '29-02-2020') &\
            (property_sales['SALE PRICE'] > 0) &\
            (property_sales['GROSS SQUARE FEET'] > 0) &\
            (property_sales['TOTAL UNITS'] > 0) &\
            (property_sales['BUILDING CLASS AT PRESENT'] == property_sales['BUILDING CLASS AT TIME OF SALE']) &\
            (property_sales['BUILDING CLASS AT TIME OF SALE'].str.contains('^[ABCDH]', regex=True))


property_sales = property_sales.loc[condition]


# Drop unnecessary columns
property_sales.drop(columns = ['BUILDING CLASS CATEGORY', 'TAX CLASS AT PRESENT', 'BLOCK', 
                               'LOT', 'EASE-MENT', 'BUILDING CLASS AT PRESENT', 
                               'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE', 
                               'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 
                               'LAND SQUARE FEET', 'YEAR BUILT', 'TAX CLASS AT TIME OF SALE'], inplace=True)


# Add 'PRICE PER UNIT' and 'PRICE PER SQUARE FEET' as features

property_sales['PRICE PER UNIT'] = property_sales['SALE PRICE'] / property_sales['TOTAL UNITS']

property_sales['PRICE PER SQUARE FEET'] = property_sales['SALE PRICE'] / property_sales['GROSS SQUARE FEET']

# Assume the numeric data are normally distributed, remove the outliers
property_sales = remove_outliers(property_sales, ['SALE PRICE', 'GROSS SQUARE FEET', 'TOTAL UNITS'])

In [7]:
# Aggregate results

aggregated_sales_2019 = property_sales[(property_sales['SALE DATE'] > '01-01-2019') & \
                                       (property_sales['SALE DATE'] <= '31-12-2019')] \
                        .groupby(['NEIGHBORHOOD'], as_index = False).mean()

aggregated_sales_2020 = property_sales[(property_sales['SALE DATE'] > '01-01-2020') & \
                                       (property_sales['SALE DATE'] <= '29-02-2020')] \
                        .groupby(['NEIGHBORHOOD'], as_index = False).mean()


## Data linkage

In [8]:
# Link taxi zones data to property sales data based on their name of locations
# for taxi zones data is the 'Zone', for property sales data is the 'Neighborhood'

def t(name):
    """
    return a transformed zone name that is splitted into several parts
    """
    name = name.lower()
    name = re.sub('[\/\-0-9()]+', ' ', name)
    names = name.split()
    return names



def link(name1, name_list_2, threshold_confidence):
    """
    return a possible match of name1 from name_list_2
    """
    confident_pairs = [[(name1, name2),
                        len(set(t(name1)).intersection(t(name2))) 
                        / len(t(name1))
                        ] \
                        for name2 in name_list_2]
    max_confidence = max([confidence for pair, confidence in confident_pairs])
    most_confident_pairs = [pair for pair, confidence in confident_pairs if confidence == max_confidence]
    number_of_most_confident_pairs = len(most_confident_pairs)
    if max_confidence >= threshold_confidence:
        # we randomly choose one pair, because if two zones have similar names they are likely 
        # to have similar locations
        return most_confident_pairs[random.randint(0, len(most_confident_pairs) - 1)][1]



# Find the corresponding neighborhood to each taxi zone
zones_dict = {}
for zone in zones['Zone'].unique():
    zones_dict[zone] = link(zone, 
                            property_sales['NEIGHBORHOOD'].unique(), 
                            threshold_confidence = 0.25) # set a low threshold confidence to get high recall



zones['Neighborhood'] = zones['Zone'].apply(lambda x: zones_dict[x])
f'Data linkage has {len([i for i in zones_dict.values() if i != None])} matches'

'Data linkage has 235 matches'

In [9]:
# Some errors still exist but not a big problem
zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone,Neighborhood
0,1,EWR,Newark Airport,EWR,AIRPORT LA GUARDIA
1,2,Queens,Jamaica Bay,Boro Zone,JAMAICA BAY
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,PELHAM GARDENS
3,4,Manhattan,Alphabet City,Yellow Zone,ALPHABET CITY
4,5,Staten Island,Arden Heights,Boro Zone,ARDEN HEIGHTS


In [10]:
# Join the taxi zones with property sales
zones_2019 = zones.merge(aggregated_sales_2019, left_on = ['Neighborhood'], right_on = ['NEIGHBORHOOD'], how = 'left')
zones_2020 = zones.merge(aggregated_sales_2020, left_on = ['Neighborhood'], right_on = ['NEIGHBORHOOD'], how = 'left')

# Fill na with Borough mean if the Neighborhood is not found
for i in range(6, 11):
    col = zones_2019.columns[i]
    zones_2019[col] = zones_2019[col].fillna(zones_2019.groupby('Borough')[col].transform('mean'))
    zones_2020[col] = zones_2020[col].fillna(zones_2020.groupby('Borough')[col].transform('mean'))
    
    
selected_columns = ['LocationID', 'TOTAL UNITS', 
                    'GROSS SQUARE FEET', 'SALE PRICE', 
                    'PRICE PER UNIT', 'PRICE PER SQUARE FEET']

renamed_columns = {'TOTAL UNITS': 'Total_units', 
                   'GROSS SQUARE FEET': 'Gross_square_feet', 
                   'SALE PRICE': 'Sale_price',
                   'PRICE PER UNIT': 'Price_per_unit', 
                   'PRICE PER SQUARE FEET': 'Price_per_square_feet'}
    
zones_2019 = zones_2019[selected_columns].rename(columns = renamed_columns)
zones_2020 = zones_2020[selected_columns].rename(columns = renamed_columns)

# Preprocess population data

In [11]:
# read in population by neigborhood data and its shape file
population = pd.read_csv(relative_directory + 'nyc_population_by_neighborhood.csv')
population_sf = gpd.read_file("../data/raw/external_data_and_taxi_zones/nynta2010_22b/nynta2010.shp")

# read in taxi zones shape file
zones_sf = gpd.read_file("../data/raw/external_data_and_taxi_zones/taxi_zones/taxi_zones.shp")
zones_sf['geometry'] = zones_sf['geometry'].to_crs(2830) # 2830 is the EPSG code for New York
zones_gdf = gpd.GeoDataFrame(
    pd.merge(zones, zones_sf, on='LocationID', how='inner')
)
zones_gdf = zones_gdf.drop_duplicates('LocationID') # Drop duplicated id

# Convert the geometry shape to to latitude and longitude
population_sf['geometry'] = population_sf['geometry'].to_crs(2830)

# we will use only 2010 data
population = population[population['Year'] == 2010]

# Merge
population_gdf = gpd.GeoDataFrame(
    pd.merge(population, population_sf, left_on = 'NTA Code', right_on = 'NTACode', how='inner')
)

## Inference
Since the metadata does not specify the unit of the areas, but we know that the area of New York is 783.8 km2. <br>
By trying out a few units, we can deduce that the internal unit is square foot. The size of New York in square feet is about 8.43675e+9.

In [12]:
f"New York is {population_gdf['Shape_Area'].sum():.6} square feet large"

'New York is 8.42299e+09 square feet large'

# Preprocess population data (continue.)

In [13]:
population_gdf['Shape_Area'] = population_gdf['Shape_Area'] / 27878400 # change square feet to square miles
population_gdf = population_gdf[['NTA Code', 'Population', 'Shape_Area', 'geometry']]

In [14]:
# Find the interceptions of all area between neighborhood and service zones
merged = gpd.overlay(zones_gdf, population_gdf, how = 'intersection', keep_geom_type = True)
merged = merged[['LocationID', 'NTA Code', 'Shape_Area_2', 
                 'Population', 'geometry' 
                ]].rename(columns = {'Shape_Area_2': 'Area_in_square_miles', 'NTA Code': 'NTA_Code'})

In [15]:
# Assume that population in each NTA are evenly distributed

# Merge again with the population_sf to calculate the proportion of intersection in NTA
merged = pd.merge(merged, population_sf, 
                  left_on = 'NTA_Code', right_on = 'NTACode', 
                  how='inner', 
                  suffixes=('_merged', '_population'))

merged['area_proportion'] = merged['geometry_merged'].area / merged['geometry_population'].area
merged['Partial_Population'] = merged['Population'] * merged['area_proportion']
merged['Population_By_LocationID'] = merged.groupby('LocationID')['Partial_Population'].transform('sum')

In [16]:
# Finalise the preprocessing for population data
zones_population = pd.merge(zones_gdf, merged, on = 'LocationID', how = 'left')

zones_population['Density_per_square_metre'] =  zones_population['Population_By_LocationID'] \
                                                / zones_population['geometry'].area
zones_population = zones_population[['LocationID', 'Population_By_LocationID', 'Density_per_square_metre']]
zones_population.drop_duplicates(inplace = True)
zones_population.reset_index(drop = True, inplace = True)

In [17]:
# Combine population infomation with taxi zones and property sales
new_zones_2019 = pd.merge(zones_2019, zones_population, on = 'LocationID', how = 'inner')
new_zones_2020 = pd.merge(zones_2020, zones_population, on = 'LocationID', how = 'inner')

# Write out the files
new_zones_2019.to_csv("../data/curated/new_zones_2019.csv", index = False)
new_zones_2020.to_csv("../data/curated/new_zones_2020.csv", index = False)

In [18]:
# View the processed data
new_zones_2019

Unnamed: 0,LocationID,Total_units,Gross_square_feet,Sale_price,Price_per_unit,Price_per_square_feet,Population_By_LocationID,Density_per_square_metre
0,1,1.40,1459.20,831000.00,658500.00,599.66,,
1,2,1.57,1728.00,473214.29,303035.71,294.48,176.83,0.00
2,3,1.36,1950.72,582088.33,466184.68,307.00,28902.34,0.01
3,4,4.00,4154.00,600000.00,150000.00,144.44,25123.60,0.03
4,5,1.13,1565.75,454392.92,411766.36,293.74,25233.23,0.01
...,...,...,...,...,...,...,...,...
255,259,1.70,1884.46,476704.42,306228.19,270.59,42466.74,0.01
256,260,1.84,1857.74,852615.26,537847.56,493.52,45107.94,0.01
257,261,2.08,2664.27,759143.49,387088.11,328.53,7241.92,0.02
258,262,1.72,1804.43,734973.24,484928.18,421.67,39675.98,0.06


# Preprocess the weather data

## about how to get this data
1. Open https://www.visualcrossing.com/weather/weather-data-services
2. Create a free acount with 1,000 rows queries available
3. Summit the query for New York City weather from January 1st to December 31st of 2019
4. Summit the query for New York City weather from January 1st to February 29th of 2020

In [19]:
# Read the weather data
weather_2019 = pd.read_csv("../data/raw/external_data_and_taxi_zones/nyc_weather_2019_Jan_to_Dec.csv",
                           parse_dates = ['datetime', ])
weather_2020 = pd.read_csv("../data/raw/external_data_and_taxi_zones/nyc_weather_2020_Jan_to_Feb.csv",
                           parse_dates = ['datetime', ])

# Using feelslike temperature is more suitable in the case of tip amount analysis
selected_columns = ['month', 'day_of_month', 'feelslike', 
                    'feelslikemax', 'feelslikemin', 'feelslike_temp_diff',
                    'precip', 'precipcover', 'snow', 
                    'snowdepth', 'windspeed', 'cloudcover', 'visibility'
                   ]

def transform_weather_data(weather):
    weather['day_of_month'] = weather['datetime'].dt.day
    weather['month'] = weather['datetime'].dt.month
    weather['feelslike_temp_diff'] = weather['feelslikemax'] - weather['feelslikemin']
    weather = weather[selected_columns]
    return weather

weather_2019 = transform_weather_data(weather_2019)
weather_2020 = transform_weather_data(weather_2020)


# Write out the files
weather_2019.to_csv("../data/curated/weather_2019.csv", index = False)
weather_2020.to_csv("../data/curated/weather_2020.csv", index = False)

In [20]:
# View the processed data
weather_2019

Unnamed: 0,month,day_of_month,feelslike,feelslikemax,feelslikemin,feelslike_temp_diff,precip,precipcover,snow,snowdepth,windspeed,cloudcover,visibility
0,1,1,8.60,14.60,1.20,13.40,7.57,29.17,0.00,0.00,39.90,68.70,13.70
1,1,2,-0.20,3.80,-2.30,6.10,0.00,0.00,0.00,0.00,21.60,48.20,16.00
2,1,3,2.20,6.40,-0.90,7.30,0.00,0.00,0.00,0.00,33.20,66.10,15.90
3,1,4,1.90,7.70,-2.10,9.80,0.00,0.00,0.00,0.00,26.20,29.90,15.90
4,1,5,3.30,8.10,1.80,6.30,11.93,75.00,0.00,0.00,29.20,78.50,8.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,12,27,9.40,12.20,5.70,6.50,0.00,0.00,0.00,0.00,19.60,70.80,15.70
361,12,28,7.30,10.30,5.60,4.70,0.00,0.00,0.00,0.00,16.80,27.70,14.10
362,12,29,3.40,6.20,1.00,5.20,5.92,29.17,0.00,0.00,14.70,63.50,13.00
363,12,30,0.00,2.10,-1.80,3.90,17.08,87.50,0.00,0.00,38.90,95.80,6.10
