## Using Ridge

In [1]:
from utils import rmse
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from statsmodels.stats.outliers_influence import variance_inflation_factor
#from category_encoders import OneHotEncoder
import seaborn as sns

In [2]:
def wrangle_data(path="housing.csv", lower_q=0.1, upper_q=0.9):
    """Load the housing data file and prepare it for modelling.

    Steps performed, in order:

    1. Read the whitespace-delimited, header-less file at ``path`` and label
       its 14 columns with lower-cased names.
    2. Drop ``dis``, ``chas``, ``nox`` and ``zn`` (excluded by the author to
       reduce multicollinearity / because of insignificant correlation).
    3. Multiply ``medv`` by 1000 to express the price in its original amount.
    4. Trim outliers by keeping only rows whose ``medv`` lies between the
       ``lower_q`` and ``upper_q`` quantiles (both bounds inclusive). With
       the defaults this is the 10th to 90th percentile.

    Parameters
    ----------
    path : str or path-like, default "housing.csv"
        Location of the raw data file.
    lower_q, upper_q : float, default 0.1 and 0.9
        Quantile bounds (each in [0, 1], ``lower_q <= upper_q``) used to
        trim the target.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered frame. The original row labels
        are preserved (the index is not reset).

    Raises
    ------
    ValueError
        If the quantile bounds are out of range or out of order.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantile bounds must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q!r}, upper_q={upper_q!r}"
        )

    feature_names = [
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
    ]
    df = pd.read_csv(path, sep=r"\s+", header=None)

    # Attach the column names in lower case.
    df.columns = [name.lower() for name in feature_names]

    # Remove multicollinear / insignificantly correlated features.
    # A new frame is returned instead of mutating in place.
    df = df.drop(columns=["dis", "chas", "nox", "zn"])

    # Convert the price to the original amount.
    df["medv"] = df["medv"] * 1000

    # Trim outliers: keep rows whose target is within the chosen quantiles.
    low, high = df["medv"].quantile([lower_q, upper_q])
    mask = df["medv"].between(low, high)

    # NOTE: a log1p transform of the target (a "log_price" column) is
    # intentionally not applied here.

    # Return an explicit copy so callers can add or modify columns safely.
    return df.loc[mask].copy()

In [3]:
# Build the cleaned dataset once and keep it under the name `housing`.
housing = wrangle_data()
# Work on an independent copy so `housing` itself is left unmodified.
df = housing.copy()
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,crim,indus,rm,age,rad,tax,ptratio,b,lstat,medv
0,0.00632,2.31,6.575,65.2,1,296.0,15.3,396.9,4.98,24000.0
1,0.02731,7.07,6.421,78.9,2,242.0,17.8,396.9,9.14,21600.0
2,0.02729,7.07,7.185,61.1,2,242.0,17.8,392.83,4.03,34700.0
3,0.03237,2.18,6.998,45.8,3,222.0,18.7,394.63,2.94,33400.0
5,0.02985,2.18,6.43,58.7,3,222.0,18.7,394.12,5.21,28700.0


In [4]:
# Name of the column the model should predict.
target = "medv"

# Feature matrix: every column except the target.
X = df.drop(columns=[target])
# Target vector.
y = df[target]

# Preview the features.
X.head()

Unnamed: 0,crim,indus,rm,age,rad,tax,ptratio,b,lstat
0,0.00632,2.31,6.575,65.2,1,296.0,15.3,396.9,4.98
1,0.02731,7.07,6.421,78.9,2,242.0,17.8,396.9,9.14
2,0.02729,7.07,7.185,61.1,2,242.0,17.8,392.83,4.03
3,0.03237,2.18,6.998,45.8,3,222.0,18.7,394.63,2.94
5,0.02985,2.18,6.43,58.7,3,222.0,18.7,394.12,5.21


In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Baseline: always predict the mean training price.
y_mean = y_train.mean()
y_pred_base = [y_mean for _ in range(len(y_train))]

# Mean absolute error of that constant prediction on the training set.
base_err = mean_absolute_error(y_true=y_train, y_pred=y_pred_base)
base_err

np.float64(4042.1052631578946)

In [7]:
# Standardise the features, then fit a Ridge regression with its default settings.
model = make_pipeline(StandardScaler(), Ridge())

# The fit call is the cell's last expression, so its return value is shown as output.
model.fit(X_train, y_train)

In [8]:
# Sanity check: the final pipeline step (the Ridge estimator) must be fitted.
# `steps` holds (name, estimator) pairs, so [-1][1] is the last estimator.
check_is_fitted(model.steps[-1][1])

In [9]:
# In-sample predictions and their mean absolute error.
y_train_pred = model.predict(X_train)
train_err = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
train_err

np.float64(2338.1023235654634)

In [10]:
# Held-out predictions and their mean absolute error.
y_test_pred = model.predict(X_test)
test_err = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
test_err

np.float64(2422.2898724356896)

In [28]:
# Take five test rows by position (rows 15 through 19) for a spot check.
dat = X_test.iloc[15:20]
dat

Unnamed: 0,crim,indus,rm,age,rad,tax,ptratio,b,lstat
302,0.09266,6.09,6.495,18.4,7,329.0,16.1,383.61,8.67
198,0.03768,1.52,7.274,38.3,2,329.0,12.6,392.2,6.62
448,9.32909,18.1,6.185,98.7,24,666.0,20.2,396.9,18.13
10,0.22489,7.87,6.377,94.3,5,311.0,15.2,392.52,20.45
220,0.35809,6.2,6.951,88.5,8,307.0,17.4,391.7,9.71


In [31]:
# Predict prices for the sampled rows and show them rounded to two decimals.
y_pred = model.predict(dat)
y_pred.round(decimals=2)

array([26204.27, 29621.59, 18488.02, 21342.89, 25882.09])

In [32]:
# Print the observed price for each sampled row so it can be compared with
# the model's predictions for the same rows.
for index in dat.index:
    # Look the value up with a single label-based .loc call (row label, column).
    print(f"Index{index}: {housing.loc[index, 'medv']}")

Index302: 26400.0
Index198: 34600.0
Index448: 14100.0
Index10: 15000.0
Index220: 26700.0
