In [1]:
import os
import sys
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [3]:
# Raw collision CSV load, currently disabled.
# NOTE(review): the next two cells read `collision`, so on a fresh kernel this
# variable has to be created some other way before they can run — confirm intent.
#collision = pd.read_csv(".\\data\\NYC_collision.csv")

In [12]:
# Ten evenly spaced integer positions from 0 to len(collision): the edges of nine chunks.
split_int = np.linspace(start = 0, stop = len(collision), num = 10).astype(int)

In [19]:
def _zip_to_number(v):
    """Convert one raw ZIP CODE cell to an int, or NaN when it is blank or missing."""
    if isinstance(v, str):
        # Whitespace-only strings stand for "no ZIP code".
        return np.nan if v.isspace() else int(v)
    # Missing values pass through untouched; anything else is coerced to int.
    return v if pd.isnull(v) else int(v)


# Write the rows between each pair of consecutive `split_int` edges to their own
# feather file (files are numbered from 1).
for s in range(len(split_int) - 1):

    # Positional, half-open slice [start, stop). The label-based `.loc[start:stop]`
    # also includes `stop`, so every interior boundary row would be written twice
    # (once at the end of one file and once at the start of the next).
    data_copy = collision.iloc[split_int[s]:split_int[s+1]].reset_index(drop = True).copy()

    data_copy["ZIP CODE"] = data_copy["ZIP CODE"].apply(_zip_to_number)

    data_copy.to_feather(f".\\data\\NYC_collision_{s+1}.feather")

## EDA & Basic Cleaning

In [2]:
# ---------------------------------------------------------------------------
# Load the house-sale records.
prices = pd.read_csv(".\\data\\NYC_house_price.csv")
# ---------------------------------------------------------------------------

# Remove columns that are not used afterwards.
prices = prices.drop(columns = ["Unnamed: 0.1", "Unnamed: 0", "EASE-MENT",
                                "BUILDING CLASS AT PRESENT"])

# Parse the sale date, then derive the year and a three-letter month label from it.
month_labels = {1: "JAN", 2: "FEB", 3: "MAR", 4: "APR", 5: "MAY", 6: "JUN",
                7: "JUL", 8: "AUG", 9: "SEP", 10: "OCT", 11: "NOV", 12: "DEC"}
sale_date = pd.to_datetime(prices["SALE DATE"], format = "%Y-%m-%d")
prices["SALE DATE"] = sale_date
prices["SALE YEAR"] = sale_date.dt.year
prices["SALE MONTH"] = sale_date.dt.month.map(month_labels)

# The ' -  ' placeholder becomes NaN, then the column is cast to float32.
for pcol in ("SALE PRICE", "GROSS SQUARE FEET", "LAND SQUARE FEET"):
    raw_values = prices[pcol]
    prices[pcol] = raw_values.mask(raw_values == ' -  ').astype(np.float32)

prices["ZIP CODE"] = prices["ZIP CODE"].astype(str)

# Borough codes 1-5 are replaced by borough names.
borough_names = {1: "MANHATTAN", 2: "BRONX", 3: "BROOKLYN",
                 4: "QUEENS", 5: "STATEN ISLAND"}
prices["BOROUGH"] = prices["BOROUGH"].map(borough_names)

prices["BUILDING CLASS CATEGORY"] = prices["BUILDING CLASS CATEGORY"].str.strip()

# A missing land area is set to 0, except for the "44 CONDO PARKING" category,
# where it is left missing.
land_missing = prices["LAND SQUARE FEET"].isna()
not_condo_parking = prices["BUILDING CLASS CATEGORY"].ne("44 CONDO PARKING")
prices.loc[land_missing & not_condo_parking, "LAND SQUARE FEET"] = 0

# Express the sale price in millions.
prices["SALE PRICE"] = prices["SALE PRICE"].div(1e6)

In [3]:
# Analysis perimeter: a sale is kept only when it passes all three checks below.

# 1) exclude rows where the gross area is missing AND there are zero total units
area_missing_no_units = prices["GROSS SQUARE FEET"].isna() & prices["TOTAL UNITS"].eq(0)

# 2) the construction year must be known (a value of 0 is treated as missing)
year_built_known = prices["YEAR BUILT"].ne(0)

# 3) any remaining missing land / gross area is dropped outright
both_areas_known = prices[["LAND SQUARE FEET", "GROSS SQUARE FEET"]].notna().all(axis = 1)

prices = prices[~area_missing_no_units & year_built_known & both_areas_known]

In [4]:
# Reproducible look at five randomly chosen rows of the cleaned sales table.
prices.sample(n = 5, random_state = 101)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE YEAR,SALE MONTH
43218,BROOKLYN,OCEAN PARKWAY-NORTH,01 ONE FAMILY DWELLINGS,1,6503,130,346 WALSH COURT,,11230,1,0,1,2239.0,1120.0,1920,1,A1,0.805,2016-09-27,2016,SEP
74320,QUEENS,ST. ALBANS,01 ONE FAMILY DWELLINGS,1,12640,48,205-06 118TH AVENUE,,11412,1,0,1,1808.0,1152.0,1930,1,A5,0.4342,2017-03-09,2017,MAR
36358,BROOKLYN,EAST NEW YORK,02 TWO FAMILY DWELLINGS,1,3738,5,271 PENNSYLVANIA AVENUE,,11207,2,0,2,1620.0,2000.0,1910,1,B1,0.779899,2016-09-29,2016,SEP
1970,MANHATTAN,FASHION,07 RENTALS - WALKUP APARTMENTS,2,778,28,221 WEST 28TH STREET,,10001,22,2,24,2460.0,11538.0,1920,2,C4,11.0,2017-02-06,2017,FEB
74946,QUEENS,WHITESTONE,01 ONE FAMILY DWELLINGS,1,4625,66,15-30 146TH STREET,,11357,1,0,1,3600.0,1578.0,1940,1,A1,0.6,2017-04-06,2017,APR


In [5]:
####################################################
airbnb = pd.read_csv(".\\data\\NYC_Airbnb_2017.csv")
####################################################


def _most_frequent(values):
    """Return the most frequent value of a Series (the smallest one on ties), NaN if none.

    Uses `Series.mode`, which returns the modes in sorted order and accepts
    non-numeric data. `scipy.stats.mode(v).mode[0]` is avoided because recent
    SciPy releases return a scalar mode for 1-D input (so `[0]` fails) and
    reject non-numeric input.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Drop columns that are not used below.
airbnb = airbnb.drop(["id", "name", "last_review", "host_name",
                      "neighbourhood", "host_id"], axis = 1)
# A missing reviews-per-month value is replaced by 0.
airbnb["reviews_per_month"] = airbnb["reviews_per_month"].fillna(0)

# Geographical data
# Copy of the raw per-listing coordinates, taken before the columns are changed below.
boroughs_geographic = airbnb[["neighbourhood_group", "latitude",
                              "longitude"]].copy()

airbnb["neighbourhood_group"] = airbnb["neighbourhood_group"].str.upper()
# Bin the coordinates into 0.1-degree intervals (latitude edges 40.3 ... 41.1,
# longitude edges -74.3 ... -73.6). The binned columns have no entry in the
# aggregation dict below, so they do not appear in the pivoted result.
airbnb["latitude"] = pd.cut(airbnb["latitude"], list(np.arange(403, 412, 1)/10))
airbnb["longitude"] = pd.cut(airbnb["longitude"], list((-np.arange(736, 744, 1)/10)[::-1]))

#### replace zero with mean
#airbnb.loc[airbnb["availability_365"] == 0,
#           "availability_365"] = int(airbnb.loc[airbnb["availability_365"] > 0,
#                                                "availability_365"].mean())

# One row per borough: most frequent room type plus the mean of every numeric column.
# The string "mean" is used instead of np.mean, which newer pandas warns about.
airbnb = airbnb.pivot_table(index = "neighbourhood_group",
                            values = list(airbnb.columns[3:]),
                            aggfunc = {'room_type': _most_frequent,
                                       'price': "mean",
                                       'minimum_nights': "mean",
                                       'number_of_reviews': "mean",
                                       'reviews_per_month': "mean",
                                       'calculated_host_listings_count': "mean",
                                       'availability_365': "mean"}).reset_index()

airbnb

Unnamed: 0,neighbourhood_group,availability_365,calculated_host_listings_count,minimum_nights,number_of_reviews,price,reviews_per_month,room_type
0,BRONX,197.254971,1.88655,2.863158,18.138012,87.11345,1.353415,Private room
1,BROOKLYN,121.269844,1.76684,3.988037,17.853909,118.199901,1.033828,Private room
2,MANHATTAN,115.039239,1.536606,4.246006,18.611664,183.309904,1.080067,Entire home/apt
3,QUEENS,168.468922,2.394571,3.326121,19.971479,98.099921,1.48831,Private room
4,STATEN ISLAND,225.929712,2.370607,2.801917,23.166134,128.153355,1.332109,Private room


In [12]:
#####################################################
# Load every feather chunk of the collision data and stack them into one frame.
# Only names that contain "NYC_collision" AND end in ".feather" are read: the
# substring test alone would also match a file such as "NYC_collision.csv" in the
# same folder, which read_feather cannot parse. The names are sorted because
# os.listdir returns entries in arbitrary order.
collision_files = sorted(f for f in os.listdir(".\\data")
                         if "NYC_collision" in f and f.endswith(".feather"))
# ignore_index gives the stacked frame a unique 0..n-1 index instead of repeating
# each chunk's own index labels.
collision = pd.concat([pd.read_feather(f".\\data\\{f}") for f in collision_files],
                      ignore_index = True)
# Replace any None cell by NaN so that missing values are represented uniformly.
collision = collision.applymap(lambda v: np.nan if v is None else v)
#####################################################

def _date_part(v):
    """Return the text before the first "T" of a string; any non-string is returned as-is."""
    return v.split("T")[0] if isinstance(v, str) else v


# Columns that are removed before any further processing.
unused_columns = ["LOCATION", "COLLISION_ID", "ON STREET NAME",
                  "CROSS STREET NAME", "OFF STREET NAME", "ACCIDENT TIME",
                  "CONTRIBUTING FACTOR VEHICLE 1", "CONTRIBUTING FACTOR VEHICLE 2",
                  "CONTRIBUTING FACTOR VEHICLE 3", "CONTRIBUTING FACTOR VEHICLE 4",
                  "CONTRIBUTING FACTOR VEHICLE 5", "ZIP CODE"]
collision = collision.drop(columns = unused_columns)

# A missing killed / injured count is treated as zero.
for casualty_col in ("NUMBER OF PERSONS KILLED", "NUMBER OF PERSONS INJURED"):
    collision[casualty_col] = collision[casualty_col].fillna(0)

# Keep only the calendar-date part of each timestamp string and parse it.
accident_day = collision["ACCIDENT DATE"].apply(_date_part)
collision["ACCIDENT DATE"] = pd.to_datetime(accident_day, format = "%Y-%m-%d")

# Only the presence of a vehicle matters, not its type: count per collision how
# many of the five vehicle-type slots are filled, then discard the slots.
vehicle_type_cols = ["VEHICLE TYPE CODE 1", "VEHICLE TYPE CODE 2",
                     "VEHICLE TYPE CODE 3", "VEHICLE TYPE CODE 4",
                     "VEHICLE TYPE CODE 5"]

collision["VEHICLES INVOLVED"] = collision[vehicle_type_cols].notna().sum(axis = 1)
collision = collision.drop(columns = vehicle_type_cols)



# FILL SOME VALUES WITH THE CLOSEST BOROUGH ACCORDING TO AVERAGE COORDINATES FROM AIRBNB
# One row per borough: [name, mean latitude, mean longitude] of the Airbnb listings.
boroughs_geographic_pivot = boroughs_geographic.pivot_table(index = "neighbourhood_group",
                                                            values = ["latitude", "longitude"],
                                                            aggfunc = "mean").reset_index()

# Fix the column order explicitly before converting to a plain (object) array.
boroughs_geographic_pivot = boroughs_geographic_pivot[["neighbourhood_group",
                                                       "latitude", "longitude"]].values

# Split the object array into typed pieces; casting in place would be a no-op
# because an object array keeps its dtype on assignment.
borough_labels = np.array([str(name).upper() for name in boroughs_geographic_pivot[:, 0]],
                          dtype = object)
centroid_lat = boroughs_geographic_pivot[:, 1].astype(np.float64)
centroid_lon = boroughs_geographic_pivot[:, 2].astype(np.float64)

# Rows without a borough label but with usable coordinates.
borough_unknown = (collision["BOROUGH"].isnull() &
                   collision["LONGITUDE"].notna() &
                   collision["LATITUDE"].notna())

# Nothing to fill when no row qualifies.
if borough_unknown.any():
    point_lat = collision.loc[borough_unknown, "LATITUDE"].to_numpy(dtype = np.float64)
    point_lon = collision.loc[borough_unknown, "LONGITUDE"].to_numpy(dtype = np.float64)

    # (rows x boroughs) matrix of squared coordinate distances, built by broadcasting
    # instead of a Python-level loop over rows. The square root is omitted because it
    # does not change which borough is closest.
    squared_distance = (np.square(point_lat[:, None] - centroid_lat[None, :]) +
                        np.square(point_lon[:, None] - centroid_lon[None, :]))

    # Label each row with the upper-cased name of its nearest borough centroid.
    collision.loc[borough_unknown, "BOROUGH"] = borough_labels[squared_distance.argmin(axis = 1)]

# Keep only collisions with a known borough, then remove the coordinate columns.
borough_known = collision["BOROUGH"].notna()
collision = collision.loc[borough_known].drop(columns = ["LATITUDE", "LONGITUDE"])

NameError: name 'boroughs_geographic' is not defined

In [21]:
# Prototype of a custom "merge" onto `prices`, keyed on date and borough:
# average collision figures for the borough of the sale with index label 0,
# using only accidents dated before that sale.
first_sale_date = prices.loc[0, "SALE DATE"]
first_sale_borough = prices.loc[0, "BOROUGH"]

collision_metrics = ["NUMBER OF PERSONS INJURED",
                     "NUMBER OF PERSONS KILLED", "NUMBER OF PEDESTRIANS INJURED",
                     "NUMBER OF PEDESTRIANS KILLED", "NUMBER OF CYCLIST INJURED",
                     "NUMBER OF CYCLIST KILLED", "NUMBER OF MOTORIST INJURED",
                     "NUMBER OF MOTORIST KILLED", "VEHICLES INVOLVED"]

earlier_same_borough = collision[(collision["ACCIDENT DATE"] < first_sale_date) &
                                 (collision["BOROUGH"] == first_sale_borough)]

# NaN-ignoring mean of every metric, one row per borough.
earlier_same_borough.pivot_table(index = "BOROUGH",
                                 values = collision_metrics,
                                 aggfunc = {metric: np.nanmean for metric in collision_metrics})

Unnamed: 0_level_0,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,VEHICLES INVOLVED
BOROUGH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MANHATTAN,0.027924,0.000101,0.092274,0.000153,0.057536,0.000625,0.177263,0.000879,1.940395


In [20]:
# NaN-ignoring mean of every casualty / vehicle metric, one row per borough.
borough_metrics = ["NUMBER OF PERSONS INJURED",
                   "NUMBER OF PERSONS KILLED", "NUMBER OF PEDESTRIANS INJURED",
                   "NUMBER OF PEDESTRIANS KILLED", "NUMBER OF CYCLIST INJURED",
                   "NUMBER OF CYCLIST KILLED", "NUMBER OF MOTORIST INJURED",
                   "NUMBER OF MOTORIST KILLED", "VEHICLES INVOLVED"]

collision.pivot_table(index = "BOROUGH",
                      values = borough_metrics,
                      aggfunc = dict.fromkeys(borough_metrics, np.nanmean))

Unnamed: 0_level_0,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,VEHICLES INVOLVED
BOROUGH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BRONX,0.014385,6.5e-05,0.226212,0.00057,0.058504,0.000687,0.299055,0.001318,1.912973
BROOKLYN,0.029884,0.000137,0.206987,0.000395,0.060318,0.000634,0.297021,0.001172,1.939377
MANHATTAN,0.029397,0.000108,0.09676,0.000167,0.056405,0.000608,0.182266,0.000883,1.912978
QUEENS,0.014832,7.4e-05,0.216512,0.000596,0.043409,0.000641,0.27473,0.001311,1.981189
STATEN ISLAND,0.005543,3.9e-05,0.223357,0.000748,0.033376,0.000593,0.262237,0.001379,1.947661
