In [1]:
import warnings

import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/Gaukhar-ai/for_my_Thinkful_work/master/capstone/NY_real_estate/nyc-rolling-sales.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [4]:
df.dtypes

Unnamed: 0                         int64
BOROUGH                            int64
NEIGHBORHOOD                      object
BUILDING CLASS CATEGORY           object
TAX CLASS AT PRESENT              object
BLOCK                              int64
LOT                                int64
EASE-MENT                         object
BUILDING CLASS AT PRESENT         object
ADDRESS                           object
APARTMENT NUMBER                  object
ZIP CODE                           int64
RESIDENTIAL UNITS                  int64
COMMERCIAL UNITS                   int64
TOTAL UNITS                        int64
LAND SQUARE FEET                  object
GROSS SQUARE FEET                 object
YEAR BUILT                         int64
TAX CLASS AT TIME OF SALE          int64
BUILDING CLASS AT TIME OF SALE    object
SALE PRICE                        object
SALE DATE                         object
dtype: object

In [5]:
drop_cols = ['TAX CLASS AT PRESENT', 'Unnamed: 0', 'NEIGHBORHOOD', 'BOROUGH', 'BLOCK', 
             'BUILDING CLASS AT PRESENT', 'LOT', 'TOTAL UNITS']
df.drop(columns = drop_cols, inplace=True) 

In [6]:
df['ADDRESS'] = df['ADDRESS'] + ' NYC'

In [7]:
num_cols = []

bin_cols = []

cat_cols = ['BUILDING CLASS CATEGORY']
drop_cats = []


In [8]:
df.head()

Unnamed: 0,BUILDING CLASS CATEGORY,EASE-MENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,07 RENTALS - WALKUP APARTMENTS,,153 AVENUE B NYC,,10009,5,0,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,07 RENTALS - WALKUP APARTMENTS,,234 EAST 4TH STREET NYC,,10009,28,3,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,07 RENTALS - WALKUP APARTMENTS,,197 EAST 3RD STREET NYC,,10009,16,1,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,07 RENTALS - WALKUP APARTMENTS,,154 EAST 7TH STREET NYC,,10009,10,0,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,07 RENTALS - WALKUP APARTMENTS,,301 EAST 10TH STREET NYC,,10009,6,0,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [9]:
df['ZIP CODE'].value_counts()

10314    1687
11354    1384
11201    1324
11235    1312
11234    1165
         ... 
10105       5
10281       1
10167       1
10803       1
11430       1
Name: ZIP CODE, Length: 186, dtype: int64

In [10]:
df['SALE PRICE'] = pd.to_numeric(df['SALE PRICE'], errors='coerce')


In [11]:
df['SALE PRICE'].dtypes

dtype('float64')

In [12]:
df["BUILDING CLASS CATEGORY"].value_counts()

01 ONE FAMILY DWELLINGS                         18235
02 TWO FAMILY DWELLINGS                         15828
13 CONDOS - ELEVATOR APARTMENTS                 12989
10 COOPS - ELEVATOR APARTMENTS                  12902
03 THREE FAMILY DWELLINGS                        4384
07 RENTALS - WALKUP APARTMENTS                   3466
09 COOPS - WALKUP APARTMENTS                     2767
04 TAX CLASS 1 CONDOS                            1656
44 CONDO PARKING                                 1441
15 CONDOS - 2-10 UNIT RESIDENTIAL                1281
05 TAX CLASS 1 VACANT LAND                       1248
17 CONDO COOPS                                   1201
22 STORE BUILDINGS                                935
12 CONDOS - WALKUP APARTMENTS                     926
14 RENTALS - 4-10 UNIT                            671
29 COMMERCIAL GARAGES                             587
43 CONDO OFFICE BUILDINGS                         475
31 COMMERCIAL VACANT LAND                         463
08 RENTALS - ELEVATOR APARTM

In [13]:
building_cat = pd.get_dummies(df, columns=["BUILDING CLASS CATEGORY"], drop_first=True)


In [14]:
df.isnull().sum()

BUILDING CLASS CATEGORY               0
EASE-MENT                             0
ADDRESS                               0
APARTMENT NUMBER                      0
ZIP CODE                              0
RESIDENTIAL UNITS                     0
COMMERCIAL UNITS                      0
LAND SQUARE FEET                      0
GROSS SQUARE FEET                     0
YEAR BUILT                            0
TAX CLASS AT TIME OF SALE             0
BUILDING CLASS AT TIME OF SALE        0
SALE PRICE                        14561
SALE DATE                             0
dtype: int64

In [15]:
df.head()

Unnamed: 0,BUILDING CLASS CATEGORY,EASE-MENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,07 RENTALS - WALKUP APARTMENTS,,153 AVENUE B NYC,,10009,5,0,1633,6440,1900,2,C2,6625000.0,2017-07-19 00:00:00
1,07 RENTALS - WALKUP APARTMENTS,,234 EAST 4TH STREET NYC,,10009,28,3,4616,18690,1900,2,C7,,2016-12-14 00:00:00
2,07 RENTALS - WALKUP APARTMENTS,,197 EAST 3RD STREET NYC,,10009,16,1,2212,7803,1900,2,C7,,2016-12-09 00:00:00
3,07 RENTALS - WALKUP APARTMENTS,,154 EAST 7TH STREET NYC,,10009,10,0,2272,6794,1913,2,C4,3936272.0,2016-09-23 00:00:00
4,07 RENTALS - WALKUP APARTMENTS,,301 EAST 10TH STREET NYC,,10009,6,0,2369,4615,1900,2,C2,8000000.0,2016-11-17 00:00:00


In [16]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="NY")
result=df['ADDRESS'].head().apply(geolocator.geocode)

In [17]:
print(result[0].latitude)

40.726572950000005


In [18]:
result[0]

Location(153, Avenue B, Alphabet City, Manhattan Community Board 3, Manhattan, New York County, New York, 10009, United States of America, (40.726572950000005, -73.97987037365662, 0.0))

In [19]:
from functools import partial

In [20]:
def get_latitude(geoloc):
    return geoloc.latitude
def get_longitude(geoloc):
    return geoloc.longitude

In [21]:
result.apply(get_latitude)

0    40.726573
1    40.723315
2    40.722933
3    40.725413
4    40.727782
Name: ADDRESS, dtype: float64

In [22]:
result.apply(get_longitude)

0   -73.979870
1   -73.983137
2   -73.983285
3   -73.982441
4   -73.981660
Name: ADDRESS, dtype: float64

In [23]:
df['LAT'] = result.apply(get_latitude)
df['LONG'] = result.apply(get_longitude)
df.to_csv('latlong.csv', index = False)

In [24]:
df.head()

Unnamed: 0,BUILDING CLASS CATEGORY,EASE-MENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,LAT,LONG
0,07 RENTALS - WALKUP APARTMENTS,,153 AVENUE B NYC,,10009,5,0,1633,6440,1900,2,C2,6625000.0,2017-07-19 00:00:00,40.726573,-73.97987
1,07 RENTALS - WALKUP APARTMENTS,,234 EAST 4TH STREET NYC,,10009,28,3,4616,18690,1900,2,C7,,2016-12-14 00:00:00,40.723315,-73.983137
2,07 RENTALS - WALKUP APARTMENTS,,197 EAST 3RD STREET NYC,,10009,16,1,2212,7803,1900,2,C7,,2016-12-09 00:00:00,40.722933,-73.983285
3,07 RENTALS - WALKUP APARTMENTS,,154 EAST 7TH STREET NYC,,10009,10,0,2272,6794,1913,2,C4,3936272.0,2016-09-23 00:00:00,40.725413,-73.982441
4,07 RENTALS - WALKUP APARTMENTS,,301 EAST 10TH STREET NYC,,10009,6,0,2369,4615,1900,2,C2,8000000.0,2016-11-17 00:00:00,40.727782,-73.98166


In [25]:
df["SALE PRICE"] = df["SALE PRICE"].replace(" - ", np.nan, regex=True)


In [26]:
#

df.ADDRESS[ : 10]

In [27]:
df[0:11]


Unnamed: 0,BUILDING CLASS CATEGORY,EASE-MENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,LAT,LONG
0,07 RENTALS - WALKUP APARTMENTS,,153 AVENUE B NYC,,10009,5,0,1633,6440,1900,2,C2,6625000.0,2017-07-19 00:00:00,40.726573,-73.97987
1,07 RENTALS - WALKUP APARTMENTS,,234 EAST 4TH STREET NYC,,10009,28,3,4616,18690,1900,2,C7,,2016-12-14 00:00:00,40.723315,-73.983137
2,07 RENTALS - WALKUP APARTMENTS,,197 EAST 3RD STREET NYC,,10009,16,1,2212,7803,1900,2,C7,,2016-12-09 00:00:00,40.722933,-73.983285
3,07 RENTALS - WALKUP APARTMENTS,,154 EAST 7TH STREET NYC,,10009,10,0,2272,6794,1913,2,C4,3936272.0,2016-09-23 00:00:00,40.725413,-73.982441
4,07 RENTALS - WALKUP APARTMENTS,,301 EAST 10TH STREET NYC,,10009,6,0,2369,4615,1900,2,C2,8000000.0,2016-11-17 00:00:00,40.727782,-73.98166
5,07 RENTALS - WALKUP APARTMENTS,,516 EAST 12TH STREET NYC,,10009,20,0,2581,9730,1900,2,C4,,2017-07-20 00:00:00,,
6,07 RENTALS - WALKUP APARTMENTS,,210 AVENUE B NYC,,10009,8,0,1750,4226,1920,2,C4,3192840.0,2016-09-23 00:00:00,,
7,07 RENTALS - WALKUP APARTMENTS,,520 EAST 14TH STREET NYC,,10009,44,2,5163,21007,1900,2,C7,,2017-07-20 00:00:00,,
8,08 RENTALS - ELEVATOR APARTMENTS,,141 AVENUE D NYC,,10009,15,0,1534,9198,1920,2,D5,,2017-06-20 00:00:00,,
9,08 RENTALS - ELEVATOR APARTMENTS,,629 EAST 5TH STREET NYC,,10009,24,0,4489,18523,1920,2,D9,16232000.0,2016-11-07 00:00:00,,


In [28]:
  for address in df['ADDRESS']:
          g = geolocator.geocode(address)
          print(g.address)
          print((g.latitude, g.longitude))
          LatLong.append((g.latitude, g.longitude))
        


153, Avenue B, Alphabet City, Manhattan Community Board 3, Manhattan, New York County, New York, 10009, United States of America
(40.726572950000005, -73.97987037365662)


NameError: name 'LatLong' is not defined

In [None]:
print(len(df))


RESULT = df['ADDRESS'].apply(geolocator.geocode)

In [None]:
df['LONG']=RESULT.apply(get_longitude)

In [None]:
df['LAT']=RESULT.apply(get_latitude)

In [None]:
df['ADDRESS'].apply(geopy)

In [None]:
boros = geopandas.read_file(geopandas.datasets.get_path("nybb"))

or can do for loop to apply to everything one by one. 
sale date convert to 'epoch'

In [None]:
for i in range(0, len(df)):
    address = df['ADDRESS'(0)]