# Analysis of the New York City Property Prices Dataset <a class="tocSkip">

In [None]:
# Loading useful extensions
%load_ext autoreload
%autoreload
%load_ext nb_black
%matplotlib inline

In [None]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV

# Load data

In [None]:
# Read the raw sales records; the path is relative to the notebook's working directory
df = pd.read_csv("sales_data_2015.csv")

## EDA

The data has 26 dimensions, which are listed below. Some of the attribute names are quite ambiguous, such as the codes for tax and building classes. For more information, have a look at
https://www1.nyc.gov/assets/finance/downloads/pdf/07pdf/glossary_rsf071607.pdf
and
https://www1.nyc.gov/assets/finance/jump/hlpbldgcode.html .

In [None]:
# Count the NaN values in every column, listing the columns with the most missing values first
df.isna().sum().sort_values(ascending=False)

In [None]:
# Dimensions of the raw data as (number of rows, number of columns)
df.shape

## Data Cleaning

In [None]:
def general_data_cleaning(df, min_price=100):
    """
    Clean the sales data by removing placeholder rows, unused columns and incomplete records.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sales data. Must provide the columns ``yr_built``, ``tot_sqft`` and
        ``price`` as well as the dropped columns ``easmnt``, ``apt``,
        ``Unnamed: 0``, ``Sale_id`` and ``usable``.
    min_price : float, optional
        Only rows with a price strictly greater than this value are kept.
        Defaults to 100, the threshold that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a contiguous 0..n-1 index. The input frame
        is not modified.

    Raises
    ------
    KeyError
        If one of the columns to drop is not present in ``df``.
    """
    # Deleting rows that carry 0 in the year built, total area or price column
    df = df[(df.yr_built != 0) & (df.tot_sqft != 0) & (df.price != 0)]
    # Deleting columns that are mostly NaN values and unused columns;
    # drop() already returns a new frame, so no explicit copy is needed
    df = df.drop(columns=["easmnt", "apt", "Unnamed: 0", "Sale_id", "usable"])
    # Drop every row that still contains a NaN value
    df = df.dropna()
    # Filter prices that are not above the minimum plausible price
    df = df[df.price > min_price]
    # Reset the index only after ALL row filters have run; resetting earlier
    # would leave gaps in the index for the rows removed by the price filter
    return df.reset_index(drop=True)

In [None]:
def regression_data_cleaning(df):
    """
    Prepare the sales data for the regression models.

    Runs the general cleaning step and then removes the identifier, location,
    date and class-code columns that are not used as regression features.
    Returns a new frame.
    """
    # Columns that are not used as regression features
    columns_not_modelled = [
        "bbl_id",
        "address",
        "sale_date",
        "long",
        "lat",
        "bldg_ctgy",
        "bldg_cls_p",
        "bldg_cls_s",
        "tax_cls_p",
        "tax_cls_s",
        "tot_unit",
        "year",
    ]
    # The shared cleaning always runs before the regression-specific pruning
    cleaned = general_data_cleaning(df)
    # NOTE: the building class (bldg_cls_p) is currently dropped rather than
    # one-hot encoded; encoding it with pd.get_dummies was tried and disabled.
    return cleaned.drop(columns=columns_not_modelled)

# Regression

In [None]:
# Build the regression data set from the raw frame (general cleaning + column pruning)
df_reg = regression_data_cleaning(df)

## Correlations

In [None]:
# Pairwise correlation matrix of the regression columns (price is still included here)
corr = df_reg.corr()
# axis=None applies the colour gradient across the whole table instead of per row/column
corr.style.background_gradient(cmap="coolwarm", axis=None)

## Split into Training and Testing data

In [None]:
# Select target and features WITHOUT mutating df_reg: pop() removed the column
# in place, so re-running this cell raised a KeyError and earlier cells that
# display df_reg no longer matched its contents.
y = df_reg["price"]
X = df_reg.drop(columns="price")
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=40
)

## Regression Models

### Lasso Regression

In [None]:
def lasso_regression(X_train, y_train, X_test, y_test):
    """
    Perform a lasso regression with built-in CV and plot the feature importance.

    Prints the selected alpha, the score on the training data, the number of
    variables kept and eliminated, and the score on the test data. Draws a
    horizontal bar chart of the sorted coefficients.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices; ``X_train.columns`` is used to label the coefficients.
    y_train, y_test : array-like
        Target values matching the rows of ``X_train`` and ``X_test``.

    Returns
    -------
    None
    """
    # Fit the lasso regression; LassoCV chooses alpha by cross-validation
    reg = LassoCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    print(
        "Lasso picked "
        + str(int((coef != 0).sum()))
        + " variables and eliminated the other "
        + str(int((coef == 0).sum()))
        + " variables"
    )
    # Extract the feature importance (coefficients in ascending order)
    imp_coef = coef.sort_values()
    # Plot on an explicit figure so the size applies to this chart only
    # instead of changing the global rcParams for every later plot
    _, ax = plt.subplots(figsize=(8.0, 10.0))
    imp_coef.plot(kind="barh", ax=ax)
    ax.set_title("Feature importance using Lasso Model")
    # Using the test data to calculate a score
    print("Score on test data: ", reg.score(X_test, y_test))

In [None]:
# Run the lasso regression on the train/test split; scores are printed and the coefficient chart is drawn
lasso_regression(X_train, y_train, X_test, y_test)