# Visual Features <a class="tocSkip">

In [None]:
# Loading useful extensions
# autoreload lets edited modules be re-imported without restarting the kernel.
# NOTE(review): `%autoreload` is called without a mode argument, which requests a
# one-off reload at this point; `%autoreload 2` is the usual setting for reloading
# before every cell execution — confirm which was intended.
%load_ext autoreload
%autoreload
# nb_black is a third-party extension (presumably formats cells with black);
# it must be installed in the kernel environment for this line to succeed.
%load_ext nb_black
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Import packages
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE(review): this silences every warning category for the rest of the session,
# including ones raised by the libraries imported above — consider narrowing it.
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Visual features: load and discard the columns that are not used below
df_vis = pd.read_csv("sales_data_2015_DF-inception-conv.csv").drop(
    columns=["bbl_id", "Unnamed: 0"]
)

# Regular data, without the "Unnamed: 0" column (presumably a saved index)
df_reg = pd.read_csv("df_reg.csv").drop(columns="Unnamed: 0")

# Complete dataset, same treatment
df_complete = pd.read_csv("df_eda.csv").drop(columns="Unnamed: 0")

In [None]:
# Merge the regular and the visual data on the shared sale key (inner join),
# then discard the key so that only model columns remain
df_vis_reg = df_reg.merge(df_vis, how="inner", on="Sale_id").drop(
    columns="Sale_id"
)

In [None]:
# Keep only the sale key and the classification target of the complete data
df_complete = df_complete.loc[:, ["Sale_id", "borough"]]
# Merge with the visual data on the sale key (inner join)
df_vis_classification = df_complete.merge(df_vis, how="inner", on="Sale_id")
# The key is not a feature: drop it
df_vis_classification = df_vis_classification.drop(columns="Sale_id")

# Rerunning the Regression

In [None]:
# Import regression models
# %run executes the script in this notebook's namespace. The summary cell below
# calls linear_regression, lasso_regression, ridge_regression and
# random_forest_regression, which are presumably defined in this script — confirm.
%run regression_models.py

In [None]:
# Separate the regression target from the features WITHOUT mutating df_vis_reg.
# (DataFrame.pop removes "price" in place, so re-running the cell would raise
# KeyError; selecting + dropping keeps the cell idempotent.)
y_vis = df_vis_reg["price"]
X_vis = df_vis_reg.drop(columns="price")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train_vis, X_test_vis, y_train_vis, y_test_vis = train_test_split(
    X_vis, y_vis, test_size=0.20, random_state=43
)

In [None]:
# Run every regression helper on the same train/test split; each helper's
# return value becomes one row of the summary table
summary = pd.DataFrame(
    [
        fit_and_score(X_train_vis, y_train_vis, X_test_vis, y_test_vis, False)
        for fit_and_score in (
            linear_regression,
            lasso_regression,
            ridge_regression,
            random_forest_regression,
        )
    ]
)
# Display the rows ordered by ascending "R squared"
summary.sort_values("R squared")

# Classification Model

## Classification of "borough"

In [None]:
def random_forest(X_train, y_train, X_test, y_test, random_state=None):
    """
    Random Forest for classification

    Runs a 5-fold cross-validated grid search (scored on accuracy) over the
    number of trees, the split criterion and the bootstrap setting, then
    scores the best estimator on the test set.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Class labels. Single-column frames / column vectors are flattened
        to 1-D before use.
    random_state : int or None, default None
        Seed forwarded to RandomForestClassifier. None keeps the previous
        unseeded behaviour.

    Returns
    -------
    float
        Accuracy on the test set (also printed).
    """
    # Use a full grid over all parameters
    parameters = {
        "n_estimators": [10, 20, 30],
        "criterion": ["gini", "entropy"],
        "bootstrap": [True, False],
    }

    # Flatten the targets to 1-D so a one-column DataFrame can be passed
    # without a shape conversion inside the estimator.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    grid_clf = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        parameters,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
    )
    grid_clf.fit(X_train, y_train_flat)

    # GridSearchCV refits the winning configuration on the full training data
    # by default, so best_estimator_ is ready to use. Fitting it again would
    # only repeat the work and, when unseeded, replace it with a different forest.
    clf = grid_clf.best_estimator_
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test_flat, y_pred)
    print(accuracy)
    return accuracy

In [None]:
# Label Encoding
# Work on a derived frame instead of overwriting df_vis_classification: the
# original cell deleted "borough" from its own input, so a second run failed.
df_clas_labelled = df_vis_classification.dropna()
borough_labels = LabelEncoder().fit_transform(df_clas_labelled["borough"])

# Define Target
X_clas = df_clas_labelled.drop(columns="borough")
# 1-D target aligned with the feature rows (estimators expect a 1-D y)
y_clas = pd.Series(borough_labels, index=X_clas.index, name="label")

# Split into training and testing set
# Seeded like the regression split so the reported accuracy is reproducible
X_train_clas, X_test_clas, y_train_clas, y_test_clas = train_test_split(
    X_clas, y_clas, test_size=0.2, random_state=43
)

# Run RF
random_forest(X_train_clas, y_train_clas, X_test_clas, y_test_clas)