In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV


import plotnine as p9

In [3]:
# Load the raw Ames housing table.
ames_raw = pd.read_csv("AmesHousing.csv")

# Keep only the columns that have fewer than 100 missing values.
na_per_column = ames_raw.isna().sum()
good_cols = na_per_column < 100

# After the column filter, discard every row that still contains a missing value.
ames = ames_raw.loc[:, good_cols].dropna()


In [4]:
# Target: the sale price column.
y = ames["SalePrice"]

# Predictors: everything except the target and the Order / PID columns.
X = ames.drop(columns=["SalePrice", "Order", "PID"])

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Preprocessing: one-hot encode object-typed columns and standardize numeric
# columns; columns matched by neither selector are passed through as-is.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Unpenalized linear regression on top of the preprocessed features.
lr_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

In [7]:
# A separate preprocessor instance for the ridge model: one-hot encode
# object-typed columns, standardize numeric columns, pass the rest through.
ct = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        (
            "standardize",
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ],
    remainder="passthrough",
)

# Ridge regression with penalty strength alpha = 1.
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=1)),
    ]
)

In [8]:
# Fit both pipelines on the training split.
lr_fitted = lr_pipeline.fit(X_train, y_train)
ridge_fitted = ridge_pipeline.fit(X_train, y_train)

# Pull the fitted coefficient vectors out of the final pipeline steps.
lr_coef = lr_fitted['linear_regression'].coef_
ridge_coef = ridge_fitted['ridge_regression'].coef_

# Number the coefficients 1..k using the fitted vector's actual length.
# The number of one-hot columns depends on the categories present in the
# training split, so a hard-coded count (previously 236) breaks the DataFrame
# constructor with "All arrays must be of the same length".
df = pd.DataFrame({
    'coefficient': np.arange(1, len(lr_coef) + 1),
    'LR': lr_coef,
    'RR': ridge_coef,
})

# Standardize each coefficient column so the two models share a common scale
# when plotted.
scaler = StandardScaler()
df[['LR','RR']] = scaler.fit_transform(df[['LR', 'RR']])

# 5-fold cross-validated R^2 for each pipeline.
# NOTE(review): these scores use the full X / y, not just the training split -
# confirm that is intended.
scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
R2_lr = scores.mean()

scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
R2_ridge = scores.mean()

print(R2_lr)
print(R2_ridge)

In [None]:
# One point layer per model, both plotted against the coefficient index:
# linear regression in red, ridge regression in blue.
lr_layer = p9.geom_point(p9.aes(x="coefficient", y="LR"), color="red")
rr_layer = p9.geom_point(p9.aes(x="coefficient", y="RR"), color="blue")

plot = p9.ggplot(df) + lr_layer + rr_layer

In [None]:
# Render the coefficient scatter plot assembled in the preceding cell.
plot.show()

In [None]:
# 5-fold cross-validated R^2 of ridge regression over a grid of penalty
# strengths. One loop replaces five copy-pasted cells that differed only in
# alpha. alpha = 1 is not in this grid; it is scored separately as R2_ridge.
ridge_r2_by_alpha = {}
for alpha in (0.001, 0.01, 0.1, 10, 100):
    # A fresh preprocessor per alpha: one-hot encode object columns,
    # standardize numeric columns, pass anything else through.
    ct = ColumnTransformer(
        [
            ("dummify",
             OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
             make_column_selector(dtype_include=object)),
            ("standardize",
             StandardScaler(),
             make_column_selector(dtype_include=np.number)),
        ],
        remainder="passthrough",
    )

    ridge_pipeline = Pipeline(
        [("preprocessing", ct),
         ("ridge_regression", Ridge(alpha=alpha))]
    )

    scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
    ridge_r2_by_alpha[alpha] = scores.mean()

# Keep the original per-alpha names so later cells continue to work.
# The suffix drops the decimal point: 001 -> 0.001, 01 -> 0.01, 1 -> 0.1.
R2_ridge001 = ridge_r2_by_alpha[0.001]
R2_ridge01 = ridge_r2_by_alpha[0.01]
R2_ridge1 = ridge_r2_by_alpha[0.1]
R2_ridge10 = ridge_r2_by_alpha[10]
R2_ridge100 = ridge_r2_by_alpha[100]

In [None]:
# Ridge cross-validated R^2, one value per line, in order of increasing alpha
# (0.001, 0.01, 0.1, 1, 10, 100).
for ridge_r2 in (R2_ridge001, R2_ridge01, R2_ridge1, R2_ridge, R2_ridge10, R2_ridge100):
    print(ridge_r2)

In [None]:
# 5-fold cross-validated R^2 of lasso regression over a grid of penalty
# strengths. One loop replaces seven copy-pasted cells that differed only in
# alpha.
# NOTE(review): the smallest alphas may emit Lasso convergence warnings at the
# default iteration limit - confirm when running.
lasso_r2_by_alpha = {}
for alpha in (0.001, 0.01, 0.1, 1, 10, 100, 1000):
    # A fresh preprocessor per alpha: one-hot encode object columns,
    # standardize numeric columns, pass anything else through.
    ct = ColumnTransformer(
        [
            ("dummify",
             OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
             make_column_selector(dtype_include=object)),
            ("standardize",
             StandardScaler(),
             make_column_selector(dtype_include=np.number)),
        ],
        remainder="passthrough",
    )

    lasso_pipeline = Pipeline(
        [("preprocessing", ct),
         ("lasso_regression", Lasso(alpha=alpha))]
    )

    scores = cross_val_score(lasso_pipeline, X, y, cv=5, scoring='r2')
    lasso_r2_by_alpha[alpha] = scores.mean()

# Keep the original per-alpha names so later cells continue to work.
# The suffix drops the decimal point: 001 -> 0.001, 01 -> 0.01, 1 -> 0.1;
# the unsuffixed R2_lasso is alpha = 1.
R2_lasso001 = lasso_r2_by_alpha[0.001]
R2_lasso01 = lasso_r2_by_alpha[0.01]
R2_lasso1 = lasso_r2_by_alpha[0.1]
R2_lasso = lasso_r2_by_alpha[1]
R2_lasso10 = lasso_r2_by_alpha[10]
R2_lasso100 = lasso_r2_by_alpha[100]
R2_lasso1000 = lasso_r2_by_alpha[1000]

In [None]:
# Lasso cross-validated R^2, one value per line, in order of increasing alpha
# (0.001, 0.01, 0.1, 1, 10, 100, 1000).
for lasso_r2 in (
    R2_lasso001,
    R2_lasso01,
    R2_lasso1,
    R2_lasso,
    R2_lasso10,
    R2_lasso100,
    R2_lasso1000,
):
    print(lasso_r2)