In [62]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('Manhattan12.csv')

In [3]:
data.isnull().sum()

Manhattan Rolling Sales File.  All Sales From August 2012 - August 2013.     0
Unnamed: 1                                                                   3
Unnamed: 2                                                                   3
Unnamed: 3                                                                   3
Unnamed: 4                                                                   3
Unnamed: 5                                                                   3
Unnamed: 6                                                                   3
Unnamed: 7                                                                   3
Unnamed: 8                                                                   3
Unnamed: 9                                                                   3
Unnamed: 10                                                                  3
Unnamed: 11                                                                  3
Unnamed: 12                                         

In [4]:
data.head()

Unnamed: 0,Manhattan Rolling Sales File. All Sales From August 2012 - August 2013.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,Sales File as of 08/30/2013 Coop Sales Files ...,,,,,,,,,,...,,,,,,,,,,
1,"Neighborhood Name 09/06/13, Descriptive Data i...",,,,,,,,,,...,,,,,,,,,,
2,Building Class Category is based on Building C...,,,,,,,,,,...,,,,,,,,,,
3,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APART\r\nMENT\r\nNUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE\r\nPRICE,SALE DATE
4,1,,13 CONDOS - ELEVATOR APARTMENTS,,738,1306,,,345 WEST 14TH STREET,,...,0,0,0,0,0,0,2,R4,"$2,214,693",20/05/2013


### Showing the shape of the dataset

In [5]:
data.shape

(27399, 21)

### Rename incorrectly formatted column names 

In [6]:
df = data[4:-1]
df.columns = data.iloc[3]
df.columns = df.columns.str.replace('\r\n','')
df.rename(columns={'SALEPRICE': 'SALE PRICE','APARTMENTNUMBER':'APARTMENT NUMBER'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [7]:
df.columns


Index(['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'EASE-MENT',
       'BUILDING CLASS AT PRESENT', 'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE',
       'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE'],
      dtype='object', name=3)

### Dropping the columns 'BOROUGH', 'EASE-MENT', 'APARTMENT NUMBER'

In [8]:
df.drop(['BOROUGH', 'EASE-MENT', 'APARTMENT NUMBER'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
df.tail()

3,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
27393,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2170,26,V1,653 WEST 187 STREET,10033,0,0,0,1582,0,0,4,V1,"$185,000",23/08/2012
27394,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2170,400,V1,44 FAIRVIEW AVENUE,10040,0,0,0,16217,0,0,4,V1,$0,29/04/2013
27395,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180,75,V1,BENNETT AVENUE,10033,0,0,0,4150,0,0,4,V1,"$495,000",26/06/2013
27396,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180,75,V1,BENNETT AVENUE,10033,0,0,0,4150,0,0,4,V1,$0,02/04/2013
27397,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180,76,V1,N/A MAGAW PLACE,10033,0,0,0,760,0,0,4,V1,$0,26/06/2013


### For each numerical column, remove the , the 'dollar' for the sale price, and then convert them to numeric.

In [33]:
num_cols = ['BLOCK', 'LOT', 'ZIP CODE', 'RESIDENTIAL UNITS', 'COMMERCIAL UNITS',
            'TOTAL UNITS', 'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
            'TAX CLASS AT TIME OF SALE', 'SALE PRICE']
df[num_cols] = df[num_cols].replace('[\$,]', '', regex=True)

In [34]:
df[num_cols] = df[num_cols].astype(float)

### Create list of categorical variables and another for the numerical variables

In [20]:
cat_list = list(df.select_dtypes(include=['object']).columns)
num_list = list(df.select_dtypes(include=['int', 'float']).columns)


In [21]:
cat_list

['NEIGHBORHOOD',
 'BUILDING CLASS CATEGORY',
 'TAX CLASS AT PRESENT',
 'BUILDING CLASS AT PRESENT',
 'ADDRESS',
 'BUILDING CLASS AT TIME OF SALE',
 'SALE DATE']

In [22]:
num_list

['BLOCK',
 'LOT',
 'ZIP CODE',
 'RESIDENTIAL UNITS',
 'COMMERCIAL UNITS',
 'TOTAL UNITS',
 'LAND SQUARE FEET',
 'GROSS SQUARE FEET',
 'YEAR BUILT',
 'TAX CLASS AT TIME OF SALE',
 'SALE PRICE']

### Convert SALE DATE to datetime datatype

In [27]:
df['SALE DATE'] = pd.to_datetime(df['SALE DATE'], format='%d/%m/%Y')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SALE DATE'] = pd.to_datetime(df['SALE DATE'], format='%d/%m/%Y')


In [28]:
print(df['SALE DATE'].dtype)

datetime64[ns]


### For each categorical variable, remove the spaces, and then replace the empty string '' by NaN

In [36]:
df[cat_list] = df[cat_list].applymap(lambda x: x.strip() if isinstance(x, str) else x)
df[cat_list] = df[cat_list].replace('', pd.NA)

### Replace the zeros in Prices, Land squares, etc. by NaN

In [41]:
df[num_cols] = df[num_cols].replace(0, pd.NA)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [43]:
df.tail()

3,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
27393,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2170.0,26.0,V1,653 WEST 187 STREET,10033.0,,,,1582.0,,,4.0,V1,185000.0,2012-08-23
27394,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2170.0,400.0,V1,44 FAIRVIEW AVENUE,10040.0,,,,16217.0,,,4.0,V1,,2013-04-29
27395,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180.0,75.0,V1,BENNETT AVENUE,10033.0,,,,4150.0,,,4.0,V1,495000.0,2013-06-26
27396,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180.0,75.0,V1,BENNETT AVENUE,10033.0,,,,4150.0,,,4.0,V1,,2013-04-02
27397,WASHINGTON HEIGHTS UPPER,31 COMMERCIAL VACANT LAND,4,2180.0,76.0,V1,N/A MAGAW PLACE,10033.0,,,,760.0,,,4.0,V1,,2013-06-26


### Show a summary of all missing values as well as the summary statistics

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27394 entries, 4 to 27397
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   NEIGHBORHOOD                    27377 non-null  object        
 1   BUILDING CLASS CATEGORY         23940 non-null  object        
 2   TAX CLASS AT PRESENT            27251 non-null  object        
 3   BLOCK                           27394 non-null  float64       
 4   LOT                             27394 non-null  float64       
 5   BUILDING CLASS AT PRESENT       27251 non-null  object        
 6   ADDRESS                         27394 non-null  object        
 7   ZIP CODE                        27393 non-null  object        
 8   RESIDENTIAL UNITS               11023 non-null  object        
 9   COMMERCIAL UNITS                3433 non-null   object        
 10  TOTAL UNITS                     17691 non-null  object        
 11  LA

In [45]:
df.describe()

3,BLOCK,LOT,TAX CLASS AT TIME OF SALE
count,27394.0,27394.0,27394.0
mean,1109.627656,741.81339,2.488063
std,465.958098,819.427506,0.891692
min,7.0,1.0,1.0
25%,877.0,37.0,2.0
50%,1047.0,1007.0,2.0
75%,1411.0,1233.0,4.0
max,2250.0,9117.0,4.0


### dropping duplicates

In [46]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [48]:
df.shape

(25801, 18)

### dropping rows with NaN

In [49]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [50]:
df.shape

(470, 18)

### identify and remove outliers if any and showing shape

In [57]:
z_scores = np.abs(stats.zscore(df.select_dtypes(include=np.number)))
df = df[(z_scores < 3).all(axis=1)]

In [58]:
df.shape

(467, 18)

### Consider the log of the prices and normalise the data

In [67]:
df['SALE PRICE'] = np.log(df['SALE PRICE'].astype('float64'))