# Analysis of the New York City Property Prices Dataset <a class="tocSkip">

In [None]:
# Loading useful extensions
# NOTE(review): a bare `%autoreload` presumably triggers a single reload at this
# point only; `%autoreload 2` is the usual choice for continuous reloading - confirm intent.
%load_ext autoreload
%autoreload
# nb_black is a third-party extension (presumably auto-formats cells with black);
# it must be installed for this cell to run.
%load_ext nb_black
%matplotlib inline

In [None]:
# Import packages
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): this suppresses every warning for the rest of the notebook,
# so deprecation and pandas warnings raised by later cells are never shown.
warnings.filterwarnings("ignore")

# Load data

In [None]:
# Read the raw sales records; the path is relative to the working directory
df = pd.read_csv("sales_data_2015.csv")

## EDA

The data has 26 dimensions, which are listed below. Some of the attribute names are quite ambiguous, such as the codes for tax and building classes. For more information, have a look at
https://www1.nyc.gov/assets/finance/downloads/pdf/07pdf/glossary_rsf071607.pdf
and
https://www1.nyc.gov/assets/finance/jump/hlpbldgcode.html .

In [None]:
# Number of NaN values per column, columns with the most missing values first
df.isna().sum().sort_values(ascending=False)

In [None]:
# Dimension of the raw data as (rows, columns)
df.shape

In [None]:
# Number of rows that are exact duplicates (across all columns) of an earlier row
df.duplicated().sum()

In [None]:
# NOTE(review): general_data_cleaning is defined in a later cell (section
# "Data Cleaning"). On a fresh kernel this cell raises NameError unless that
# definition cell has been executed first.
df_eda = general_data_cleaning(df)

In [None]:
# Distribution of the cleaned sale prices
sns.distplot(df_eda["price"])
# Optional export of the same figure to disk:
# sns.distplot(df_eda["price"]).get_figure().savefig("output.png", dpi=600)

In [None]:
# Natural log of the cleaned prices; print the skewness of the logged values
# and plot their distribution
sales = np.log(df_eda["price"])
print(sales.skew())
sns.distplot(sales)

## Data Cleaning

In [None]:
def general_data_cleaning(
    df, min_price=100000, max_price=5000000, excluded_tot_unit=2261
):
    """
    Clean the raw sales data: drop unused columns and filter out unrealistic rows.

    Steps, in order:
      1. remove rows whose yr_built, tot_sqft or price is 0,
      2. drop the columns "easmnt", "apt", "Unnamed: 0" and "usable",
      3. drop exact duplicate rows (keeping the last occurrence),
      4. drop rows containing any NaN,
      5. keep only rows with min_price < price < max_price,
      6. keep only rows with tot_unit > 0 and tot_unit != excluded_tot_unit,
      7. reset the index so the result is numbered 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sales data. It is not modified.
    min_price, max_price : number, optional
        Exclusive lower / upper bounds on "price".
    excluded_tot_unit : int, optional
        A single tot_unit value to exclude (presumably a known outlier).

    Returns
    -------
    pandas.DataFrame
        The cleaned data with a fresh, gap-free index.
    """
    # Zero is treated as "missing" for these three columns
    has_no_zero = (df.yr_built != 0) & (df.tot_sqft != 0) & (df.price != 0)
    cleaned = (
        df[has_no_zero]
        # Columns that are mostly NaN or unused
        .drop(["easmnt", "apt", "Unnamed: 0", "usable"], axis=1)
        .drop_duplicates(keep="last")
        .dropna()
    )
    # Remove observations that fall outside the price caps
    price_ok = (cleaned["price"] > min_price) & (cleaned["price"] < max_price)
    units_ok = (cleaned["tot_unit"] > 0) & (cleaned["tot_unit"] != excluded_tot_unit)
    # Reset the index last, after every filter, so the result has no gaps
    return cleaned[price_ok & units_ok].reset_index(drop=True)

In [None]:
def scaling_nummerical_features(df, columns=None, skew_threshold=0.75):
    """
    Log-transform the skewed numerical columns and standardise all of them.

    Every selected column whose sample skewness exceeds `skew_threshold` is
    replaced by log(x + 1); afterwards all selected columns are scaled with
    StandardScaler. Note that "price" is part of the default selection, so the
    price column is transformed and scaled together with the other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numerical columns. It is not modified.
    columns : list of str, optional
        Columns to process. Defaults to the ten numerical columns of the
        sales data set.
    skew_threshold : float, optional
        Columns with skewness above this value are log-transformed.

    Returns
    -------
    pandas.DataFrame
        The processed columns only, with the same index as `df`.
    """
    if columns is None:
        columns = [
            "block",
            "lot",
            "zip",
            "res_unit",
            "com_unit",
            "tot_unit",
            "land_sqft",
            "tot_sqft",
            "yr_built",
            "price",
        ]
    # Work on an explicit copy: assigning into the result of a column
    # selection is what raises pandas' SettingWithCopyWarning.
    df_nummerical = df[list(columns)].copy()
    # Transform the skewed numeric features using log(x + 1)
    skewness = df_nummerical.apply(lambda x: skew(x.dropna().astype(float)))
    skewed = skewness[skewness > skew_threshold].index
    if len(skewed) > 0:
        df_nummerical[skewed] = np.log1p(df_nummerical[skewed])
    # Scale the features and rebuild a frame with the original labels
    scaled = StandardScaler().fit_transform(df_nummerical)
    return pd.DataFrame(
        scaled, index=df_nummerical.index, columns=df_nummerical.columns
    )

In [None]:
def one_hot_encoding(df):
    """
    One-hot encode the categorical features.

    Selects "borough", "bldg_ctgy", "tax_cls_s" and "tax_cls_p", casts
    "borough" and "tax_cls_s" to object so that pd.get_dummies treats them as
    categories, and returns the resulting indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four categorical columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One indicator column per (feature, value) pair, same index as `df`.
    """
    categorical_columns = ["borough", "bldg_ctgy", "tax_cls_s", "tax_cls_p"]
    # astype returns a new frame, so nothing is assigned into a slice of `df`
    # (the original per-column assignment raised SettingWithCopyWarning).
    df_categorical = df[categorical_columns].astype(
        {"borough": object, "tax_cls_s": object}
    )
    # Convert categorical variables into dummy/indicator variables
    return pd.get_dummies(df_categorical)

In [None]:
def regression_data_cleaning(df):
    """
    Prepare the sales data for the regression models.

    Runs the general cleaning, removes columns the models do not use, scales
    the numerical columns and one-hot encodes the categorical ones. The
    "Sale_id" column is kept as the first column of the result so the frame
    can later be joined with other data.
    """
    unused_columns = [
        "bbl_id",
        "address",
        "sale_date",
        "long",
        "lat",
        "year",
        "bldg_cls_p",
        "bldg_cls_s",
    ]
    # General data cleaning first
    cleaned = general_data_cleaning(df)
    # Keep Sale_id aside; it is neither scaled nor encoded
    sale_ids = cleaned[["Sale_id"]]
    features = cleaned.drop(unused_columns, axis=1)
    # Run scaling and one hot encoding on the remaining columns
    numerical_part = scaling_nummerical_features(features)
    categorical_part = one_hot_encoding(features)
    # Column order: Sale_id, scaled numerical columns, indicator columns
    return pd.concat([sale_ids, numerical_part, categorical_part], axis=1)

# Regression

In [None]:
# Build the model-ready frame (Sale_id, scaled numerical and one-hot columns) from the raw data
df_reg = regression_data_cleaning(df)

## Correlations

In [None]:
# Dropping Sale_id without mutating in place; errors="ignore" keeps the cell
# re-runnable (the original `del` raised KeyError on a second execution)
df_reg = df_reg.drop(columns=["Sale_id"], errors="ignore")
# Calculating pairwise correlations of all remaining columns
corr = df_reg.corr()
# axis=None applies one colour scale across the whole table
corr.style.background_gradient(cmap="coolwarm", axis=None)

## Split into Training and Testing Data

In [None]:
# Select target and features without mutating df_reg, so the cell can be
# re-run (the original `pop` removed "price" and failed on a second execution)
y = df_reg["price"]
X = df_reg.drop(columns=["price"])
# 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=40
)

## Regression Models

In [None]:
def rmse(y_test, y_pred):
    """
    Root Mean Square Error between the true values and the predictions.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

### Linear Regression (Baseline)

In [None]:
def linear_regression(X_train, y_train, X_test, y_test):
    """
    Fit an ordinary least squares model and score it.

    Returns a dict with the model name, the score on the test and training
    data ("R squared", "R squared training") and the RMSE / MAE of the test
    predictions.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)
    # Metrics in the same key order as the other regression helpers
    result = {
        "name": "Linear Regression (Baseline)",
        "R squared": test_score,
        "R squared training": train_score,
        "RMSE": rmse(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
    }
    return result

In [None]:
# Run the regression
# linear_regression(X_train, y_train, X_test, y_test)

### Lasso Regression

In [None]:
def lasso_regression(X_train, y_train, X_test, y_test, plot):
    """
    Fit a LassoCV model, report its coefficients and score it.

    Prints the alpha chosen by LassoCV, the score on the training data and
    how many coefficients are non-zero / zero. If `plot` is truthy, draws a
    horizontal bar chart of the sorted coefficients (this also sets the
    global matplotlib figure size). Returns a dict with the model name, the
    test and training scores and the RMSE / MAE of the test predictions.
    """
    # Fit the lasso regression
    model = LassoCV()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % model.alpha_)
    print("Best score using built-in LassoCV: %f" % train_score)
    # One coefficient per feature, labelled with the feature name
    coefficients = pd.Series(model.coef_, index=X_train.columns)
    n_kept = sum(coefficients != 0)
    n_eliminated = sum(coefficients == 0)
    print(
        "Lasso picked "
        + str(n_kept)
        + " variables and eliminated the other "
        + str(n_eliminated)
        + " variables"
    )
    # Plot the coefficients, sorted ascending, as feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        coefficients.sort_values().plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
    # Using the test data to calculate the remaining metrics
    predictions = model.predict(X_test)
    result = {
        "name": "Lasso Regression",
        "R squared": model.score(X_test, y_test),
        "RMSE": rmse(y_test, predictions),
        "R squared training": train_score,
        "MAE": mean_absolute_error(y_test, predictions),
    }
    return result

In [None]:
# Run the regression
# lasso_regression(X_train, y_train, X_test, y_test, False)

### Ridge Regression

In [None]:
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Fit a RidgeCV model, report its coefficients and score it.

    Prints the alpha chosen by RidgeCV, the score on the training data and
    how many coefficients are non-zero / exactly zero. If `plot` is truthy,
    draws a horizontal bar chart of the sorted coefficients (this also sets
    the global matplotlib figure size). Returns a dict with the model name,
    the test and training scores and the RMSE / MAE of the test predictions.
    """
    # Fit the ridge regression
    model = RidgeCV()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % model.alpha_)
    print("Best score using built-in RidgeCV: %f" % train_score)
    # One coefficient per feature, labelled with the feature name
    coefficients = pd.Series(model.coef_, index=X_train.columns)
    n_kept = sum(coefficients != 0)
    n_eliminated = sum(coefficients == 0)
    print(
        "Ridge picked "
        + str(n_kept)
        + " variables and eliminated the other "
        + str(n_eliminated)
        + " variables"
    )
    # Plot the coefficients, sorted ascending, as feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        coefficients.sort_values().plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
    # Using the test data to calculate the remaining metrics
    predictions = model.predict(X_test)
    result = {
        "name": "Ridge Regression",
        "R squared": model.score(X_test, y_test),
        "R squared training": train_score,
        "RMSE": rmse(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
    }
    return result

In [None]:
# Run the regression
# ridge_regression(X_train, y_train, X_test, y_test, False)

### Random Forest Regression

In [None]:
def random_forest_regression(X_train, y_train, X_test, y_test, plot):
    """
    Random forest regression tuned with a grid search.

    Runs a 5-fold GridSearchCV over n_estimators, max_features and bootstrap,
    then scores the best estimator. Prints the test score and the test RMSE.
    If `plot` is truthy, draws a horizontal bar chart of the feature
    importances. Returns a dict with the model name, the test and training
    scores and the RMSE / MAE of the test predictions.
    """
    # Random forest regressor with a fixed seed for reproducibility
    rf = RandomForestRegressor(random_state=0)
    # Grid search for parameter tuning.
    # max_features=1.0 (use all features) replaces the former "auto" option,
    # which meant the same for regressors but was deprecated and later
    # removed from scikit-learn.
    params = {
        "n_estimators": [10, 20, 30],
        "max_features": [1.0, "log2", "sqrt"],
        "bootstrap": [True, False],
    }
    reg = GridSearchCV(rf, params, cv=5)
    reg.fit(X_train, y_train)
    estimator = reg.best_estimator_
    # Using the test data to calculate the metrics once and reuse them below
    y_pred = estimator.predict(X_test)
    test_score = estimator.score(X_test, y_test)
    test_rmse = rmse(y_test, y_pred)
    print("Score on test data: ", test_score)
    print("Root Mean Square Error: ", test_rmse)
    # Plotting the feature importances, smallest first
    if plot:
        feat_importances = pd.Series(
            estimator.feature_importances_, index=X_train.columns
        ).sort_values(ascending=True)
        feat_importances.plot(kind="barh")
    return {
        "name": "Random Forest Regression",
        "R squared": test_score,
        "R squared training": estimator.score(X_train, y_train),
        "RMSE": test_rmse,
        "MAE": mean_absolute_error(y_test, y_pred),
    }

In [None]:
# Run the regression
# random_forest_regression(X_train, y_train, X_test, y_test, False)

## Summary of Regressions

In [None]:
# Fit all four models on the same split and collect one row of metrics per model.
# The random forest call runs a grid search, so this cell can take a while.
summary = pd.DataFrame(
    [
        linear_regression(X_train, y_train, X_test, y_test),
        lasso_regression(X_train, y_train, X_test, y_test, False),
        ridge_regression(X_train, y_train, X_test, y_test, False),
        random_forest_regression(X_train, y_train, X_test, y_test, False),
    ]
)
# Display the models ordered by ascending test score
summary.sort_values("R squared")

# Adding Visual Features

## Loading Data

In [None]:
# Load the additional (visual, per the section heading) feature table and
# drop the columns that are not used
df_vis = pd.read_csv("sales_data_2015_DF-inception-conv.csv").drop(
    ["bbl_id", "Unnamed: 0"], axis=1
)

In [None]:
# Join the model-ready sales data with the additional features on Sale_id
# (inner join: only sales present in both tables), then drop the join key
df_vis_reg = pd.merge(
    regression_data_cleaning(df), df_vis, on="Sale_id", how="inner"
).drop("Sale_id", axis=1)

## Rerunning the Regressions

In [None]:
# Select target and features without mutating df_vis_reg, so the cell can be
# re-run (the original `pop` removed "price" and failed on a second execution)
y_vis = df_vis_reg["price"]
X_vis = df_vis_reg.drop(columns=["price"])
# 80/20 split with a fixed seed for reproducibility
X_train_vis, X_test_vis, y_train_vis, y_test_vis = train_test_split(
    X_vis, y_vis, test_size=0.20, random_state=40
)

In [None]:
# Re-fit the same four models on the data that includes the merged features.
# NOTE(review): this reuses the name `summary`, so the earlier summary table
# is overwritten once this cell runs.
summary = pd.DataFrame(
    [
        linear_regression(X_train_vis, y_train_vis, X_test_vis, y_test_vis),
        lasso_regression(X_train_vis, y_train_vis, X_test_vis, y_test_vis, False),
        ridge_regression(X_train_vis, y_train_vis, X_test_vis, y_test_vis, False),
        random_forest_regression(
            X_train_vis, y_train_vis, X_test_vis, y_test_vis, False
        ),
    ]
)
# Display the models ordered by ascending test score
summary.sort_values("R squared")