# Regression Analysis <a class="tocSkip"></a>

In [None]:
# Loading useful extensions
%load_ext autoreload
%autoreload
%load_ext nb_black
%matplotlib inline

In [None]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_regression
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeRegressor
import warnings

warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Read the regression dataset and discard the "Unnamed: 0" column
# (presumably an index that was written out with the CSV — confirm).
df_reg = pd.read_csv("df_reg.csv").drop(columns=["Unnamed: 0"])

# Correlations

In [None]:
# Drop the Sale_id column before modelling.
# Reassigning with errors="ignore" (instead of `del`) keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
df_reg = df_reg.drop(columns=["Sale_id"], errors="ignore")
# Correlation heatmap (currently disabled) — uncomment to display:
# corr = df_reg.corr()
# corr.style.background_gradient(cmap="coolwarm", axis=None)

# Split into Training and Testing Data

In [None]:
# Separate the target ("price") from the features without mutating df_reg.
# The previous df_reg.pop("price") removed the column in place, so running the
# cell a second time raised a KeyError.
y = df_reg["price"]
X = df_reg.drop(columns=["price"])
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

In [None]:
# Feature subset used by the baseline linear regression.
# "borough_1" used to appear twice in this list, which placed the same column
# in the design matrix twice; every feature is now listed exactly once.
# NOTE(review): the repeated entry may have been meant to be "borough_5" —
# confirm against the columns of df_reg.csv before adding it.
base_features = [
    "res_unit",
    "com_unit",
    "tot_unit",
    "land_sqft",
    "tot_sqft",
    "yr_built",
    "borough_1",
    "borough_2",
    "borough_3",
    "borough_4",
]
X_base = X[base_features]

# Reuse the rows of the main train/test split instead of drawing a second split
# with a different seed (previously random_state=42 here vs. 2 for the full
# feature set). With a shared split the baseline is scored on the same held-out
# rows as the other models, so the metrics in the summary table are comparable.
X_train_base = X_train[base_features]
X_test_base = X_test[base_features]
# Targets are identical for both feature sets, so the same Series are shared.
y_train_base, y_test_base = y_train, y_test

# Regression Models

In [None]:
# Execute regression_models.py; %run makes the names that script defines
# available in this notebook's namespace.
# NOTE(review): the *_regression helpers called in the cells below are not
# defined in this notebook — presumably they come from this script; confirm.
%run regression_models.py

## Linear Regression (Baseline)

In [None]:
# Run the baseline linear regression helper on the reduced-feature split
# (X_*_base / y_*_base).
# NOTE(review): the trailing True is a positional flag whose meaning is defined
# in regression_models.py — presumably it switches on detailed output, since
# the summary cell passes False instead. Confirm.
linear_regression(X_train_base, y_train_base, X_test_base, y_test_base, True)

## Lasso Regression

In [None]:
# Run the lasso regression helper on the full-feature train/test split.
# NOTE(review): the trailing True is a positional flag defined in
# regression_models.py — presumably it switches on detailed output; confirm.
lasso_regression(X_train, y_train, X_test, y_test, True)

## Ridge Regression

In [None]:
# Run the ridge regression helper on the full-feature train/test split.
# NOTE(review): the trailing True is a positional flag defined in
# regression_models.py — presumably it switches on detailed output; confirm.
ridge_regression(X_train, y_train, X_test, y_test, True)

## Random Forest Regression

In [None]:
# Run the random forest regression helper on the full-feature train/test split.
# NOTE(review): the trailing True is a positional flag defined in
# regression_models.py — presumably it switches on detailed output; confirm.
random_forest_regression(X_train, y_train, X_test, y_test, True)

## Summary of Regressions

In [None]:
# Call every model helper once more with the final flag set to False and stack
# the returned results into one table, one row per model. The baseline uses the
# reduced-feature split; the other three use the full-feature split.
base_split = (X_train_base, y_train_base, X_test_base, y_test_base)
full_split = (X_train, y_train, X_test, y_test)
model_runs = [
    (linear_regression, base_split),
    (lasso_regression, full_split),
    (ridge_regression, full_split),
    (random_forest_regression, full_split),
]
summary = pd.DataFrame([fit(*split, False) for fit, split in model_runs])
# Shown sorted by "R squared" (ascending); `summary` itself keeps the model order.
summary.sort_values("R squared")

In [None]:
# Round the summary to three decimals and write it out as a LaTeX table
# to the file "table_one" in the working directory.
summary = summary.round(3)
summary.to_latex(buf="table_one")