In [1]:
import re
import pandas as pd
import random
random.seed(10)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Read in taxi_zones lookup table
zones = pd.read_csv("../data/raw/external_data_and_taxi_zones/taxi_zone_lookup.csv")
zones = zones.drop([263, 264]) # drop the unknown zones

# Preprocess property sales data

## about how to access this data
1. download 2019&2020 property sales data (in xlsx format) for all 5 boroughs from "https://www1.nyc.gov/site/finance/taxes/property-annualized-sales-update.page"
2. for macbook, open a data file in Excel, save as '.csv' format
3. open the csv files with text edit, delete the first few rows that are not data manually
4. repeat for all 10 files
5. move the files under this directory
6. rename '2019_statenisland' as '2019_staten_island'

In [3]:
relative_directory = '../data/raw/external_data_and_taxi_zones/'

header = ['BOROUGH', 'NEIGHBORHOOD','BUILDING CLASS CATEGORY', 
          'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 
          'EASE-MENT', 'BUILDING CLASS AT PRESENT', 'ADDRESS',
          'APARTMENT NUMBER', 'ZIP CODE', 'RESIDENTIAL UNITS',
          'COMMERCIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET',
          'GROSS SQUARE FEET', 'YEAR BUILT', 'TAX CLASS AT TIME OF SALE',
          'BUILDING CLASS AT TIME OF SALE', 'SALE PRICE', 'SALE DATE']

boroughs = ['bronx', 'brooklyn', 'manhattan', 'queens', 'staten_island']

years = ['2019', '2020']

file_names = [relative_directory + year + '_' + borough + '.csv' for borough in boroughs \
                                                                      for year in years]

# pass the parameters to pd.read_csv in advanced
def read_csv_for_mapping(data):
    return pd.read_csv(data, 
                       names = header, 
                       parse_dates = ['SALE DATE', ], 
                       dayfirst = True)

property_sales = pd.concat(map(read_csv_for_mapping, file_names))

## Clean the data

In [4]:
# Check data types
property_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205962 entries, 0 to 21709
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   BOROUGH                         152604 non-null  float64       
 1   NEIGHBORHOOD                    152604 non-null  object        
 2   BUILDING CLASS CATEGORY         152604 non-null  object        
 3   TAX CLASS AT PRESENT            152373 non-null  object        
 4   BLOCK                           152604 non-null  float64       
 5   LOT                             152604 non-null  float64       
 6   EASE-MENT                       0 non-null       float64       
 7   BUILDING CLASS AT PRESENT       152373 non-null  object        
 8   ADDRESS                         152604 non-null  object        
 9   APARTMENT NUMBER                33400 non-null   object        
 10  ZIP CODE                        152579 non-null  float64 

In [5]:
# Drop rows with NaN valus in required fields
property_sales.dropna(how='any', subset=['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS AT PRESENT',
                                         'TOTAL UNITS', 'GROSS SQUARE FEET', 'BUILDING CLASS AT TIME OF SALE', 
                                         'SALE PRICE', 'SALE DATE'], inplace=True)


# Cast numeric data type to price, size and number of units
# Note that Column 'TOTAL UNITS' contains both float and str, cast it to str first
property_sales['SALE PRICE'] = property_sales['SALE PRICE'].str.replace(',', '').astype(float)
property_sales['GROSS SQUARE FEET'] = property_sales['GROSS SQUARE FEET'].str.replace(',', '').astype(float)
property_sales['TOTAL UNITS'] = property_sales['TOTAL UNITS'].astype(str).str.replace(',', '').astype(float) 



# Filter data that fits our analysis
# Type ABCD are family dwellings and apartments, H is hotel
condition = (property_sales['SALE DATE'] > '01-01-2019') & (property_sales['SALE DATE'] <= '29-02-2020') &\
            (property_sales['SALE PRICE'] > 0) &\
            (property_sales['GROSS SQUARE FEET'] > 0) &\
            (property_sales['TOTAL UNITS'] > 0) &\
            (property_sales['BUILDING CLASS AT PRESENT'] == property_sales['BUILDING CLASS AT TIME OF SALE']) &\
            (property_sales['BUILDING CLASS AT TIME OF SALE'].str.contains('^[ABCDH]', regex=True))


property_sales = property_sales.loc[condition]


# Drop unnecessary columns
property_sales.drop(columns = ['BUILDING CLASS CATEGORY', 'TAX CLASS AT PRESENT', 'BLOCK', 
                               'LOT', 'EASE-MENT', 'BUILDING CLASS AT PRESENT', 
                               'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE', 
                               'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 
                               'LAND SQUARE FEET', 'YEAR BUILT', 'TAX CLASS AT TIME OF SALE'], inplace=True)


# Add 'PRICE PER UNIT' and 'PRICE PER SQUARE FEET' as features

property_sales['PRICE PER UNIT'] = property_sales['SALE PRICE'] / property_sales['TOTAL UNITS']

property_sales['PRICE PER SQUARE FEET'] = property_sales['SALE PRICE'] / property_sales['GROSS SQUARE FEET']



In [12]:
# Aggregate results

borough_dict = {'Queens': 4, 'Bronx': 2, 'Manhattan': 1, 'Staten Island': 5, 'Brooklyn': 3}

aggregated_sales_2019 = property_sales[(property_sales['SALE DATE'] > '01-01-2019') & \
                                       (property_sales['SALE DATE'] <= '31-12-2019')] \
                        .groupby(['NEIGHBORHOOD'], axis=1).mean()

aggregated_sales_2020 = property_sales[(property_sales['SALE DATE'] > '01-01-2020') & \
                                       (property_sales['SALE DATE'] <= '29-02-2020')] \
                        .groupby(['NEIGHBORHOOD']).mean()


ValueError: cannot reindex from a duplicate axis

In [11]:
# Data linkage

def t(name):
    """
    return a transformed zone name that is splitted into several parts
    """
    name = name.lower()
    name = re.sub('[\/\-0-9()]+', ' ', name)
    names = name.split()
    return names



def link(name1, name_list_2, threshold_confidence):
    """
    return a possible match of name1 from name_list_2
    """
    confident_pairs = [[(name1, name2),
                        len(set(t(name1)).intersection(t(name2))) 
                        / len(t(name1))
                        ] \
                        for name2 in name_list_2]
    max_confidence = max([confidence for pair, confidence in confident_pairs])
    most_confident_pairs = [pair for pair, confidence in confident_pairs if confidence == max_confidence]
    number_of_most_confident_pairs = len(most_confident_pairs)
    if max_confidence >= threshold_confidence:
        # we randomly choose one pair 
        return most_confident_pairs[random.randint(0, len(most_confident_pairs) - 1)][1]



# Find the corresponding neighborhood to each taxi zone
zones_dict = {}
for zone in zones['Zone'].unique():
    zones_dict[zone] = link(zone, 
                            property_sales['NEIGHBORHOOD'].unique(), 
                            threshold_confidence = 0.25)


    
f'Data linkage has {len([i for i in zones_dict.values() if i != None])} matches'

'Data linkage has 240 matches'

In [8]:
zones['Neighborhood'] = zones['Zone'].apply(lambda x: zones_dict[x])
zones_2019 = zones.merge(aggregated_sales_2019, left_on = 'Neighborhood', right_on = 'NEIGHBORHOOD', how = 'left')

In [9]:
# According to property sales data, 1 for man, 2 for bronx, 3 for brooklyn, 4 for queens, 5 for staten island
borough_dict = {'EWR': None, 'Queens': 4, 'Bronx': 2, 'Manhattan': 1, 'Staten Island': 5, 'Brooklyn': 3}
zones_2019[zones_2019['Borough'].apply(lambda x: borough_dict[x]) != zones_2019['BOROUGH']]


Unnamed: 0,LocationID,Borough,Zone,service_zone,Neighborhood,BOROUGH,TOTAL UNITS,GROSS SQUARE FEET,SALE PRICE,PRICE PER UNIT,PRICE PER SQUARE FEET
0,1,EWR,Newark Airport,EWR,AIRPORT LA GUARDIA,4.00,1.40,1459.20,831000.00,658500.00,599.66
7,8,Queens,Astoria Park,Boro Zone,SUNSET PARK,3.00,3.51,3162.25,1315530.56,562807.59,543.60
8,9,Queens,Auburndale,Boro Zone,,,,,,,
9,10,Queens,Baisley Park,Boro Zone,MARINE PARK,3.00,1.26,1513.30,669421.75,574864.96,466.41
11,12,Manhattan,Battery Park,Yellow Zone,ROCKAWAY PARK,4.00,5.21,4431.07,1124656.48,297027.50,298.36
...,...,...,...,...,...,...,...,...,...,...,...
233,234,Manhattan,Union Sq,Yellow Zone,,,,,,,
240,241,Bronx,Van Cortlandt Village,Boro Zone,QUEENS VILLAGE,4.00,1.24,1500.55,522866.19,457697.05,366.72
246,247,Bronx,West Concourse,Boro Zone,MIDTOWN WEST,1.00,3.42,94702.58,47281783.75,14984653.75,540.12
247,248,Bronx,West Farms/Bronx River,Boro Zone,UPPER WEST SIDE (79-96),1.00,23.50,49557.71,6374996.92,1802995.78,644.84


In [10]:
aggregated_sales_2019

Unnamed: 0_level_0,BOROUGH,TOTAL UNITS,GROSS SQUARE FEET,SALE PRICE,PRICE PER UNIT,PRICE PER SQUARE FEET
NEIGHBORHOOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AIRPORT LA GUARDIA,4.00,1.40,1459.20,831000.00,658500.00,599.66
ALPHABET CITY,1.00,14.54,10781.54,2578953.42,253057.64,301.97
ANNADALE,5.00,1.34,2480.31,749613.76,615627.88,343.32
ARDEN HEIGHTS,5.00,1.13,1565.75,454392.92,411766.36,293.74
ARROCHAR,5.00,1.25,1844.60,670435.35,584453.68,395.66
...,...,...,...,...,...,...
WOODHAVEN,4.00,1.64,1714.44,603571.52,407833.51,362.20
WOODLAWN,2.00,1.67,2179.31,544539.64,368575.03,264.41
WOODROW,5.00,1.36,1828.48,620101.77,492298.60,354.12
WOODSIDE,4.00,4.01,3576.37,1876887.25,555582.67,523.79
