In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('AmesHousing.csv')

In [3]:
df

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000


###GrLivArea, OverallQual, and TotalBsmtSF

In [4]:
X = df.drop(['SalePrice', 'PID'], axis=1)
y = df['SalePrice']

In [5]:
# 3. Identify Column Types for Processing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [6]:
# 4. Create Preprocessing Steps
# For numbers: Fill missing values with the median, then scale them so big numbers don't dominate
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [7]:
# For categories: Fill missing values with the word 'missing', then One-Hot Encode them
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [8]:
# Bundle both preprocessors together
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [9]:
# 5. Build the Final Pipeline
# We chain the preprocessor with a Ridge Regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=10.0)) # Alpha controls the regularization strength
])

In [10]:
# 6. Train-Test Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 7. Train the Model
print("Training the Ridge Regression model...")
model.fit(X_train, y_train)

Training the Ridge Regression model...


In [12]:
# 8. Predict and Evaluate
y_pred = model.predict(X_test)

In [13]:
# Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [14]:
print("-" * 30)
print(f"Model Performance:")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print(f"R-squared (R^2): {r2:.4f}")
print("-" * 30)

------------------------------
Model Performance:
Root Mean Squared Error (RMSE): $29,001.98
R-squared (R^2): 0.8951
------------------------------


In [15]:
# 8. Predict on BOTH training and testing sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate metrics for Training Set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

# Calculate metrics for Testing Set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)

print("-" * 30)
print("TRAINING Performance (The data the model studied):")
print(f"RMSE: ${rmse_train:,.2f}")
print(f"R-squared: {r2_train:.4f}")
print("-" * 30)
print("TESTING Performance (The data the model has never seen):")
print(f"RMSE: ${rmse_test:,.2f}")
print(f"R-squared: {r2_test:.4f}")
print("-" * 30)

------------------------------
TRAINING Performance (The data the model studied):
RMSE: $22,859.22
R-squared: 0.9121
------------------------------
TESTING Performance (The data the model has never seen):
RMSE: $29,001.98
R-squared: 0.8951
------------------------------


In [16]:
from sklearn.model_selection import GridSearchCV

# 1. Define the grid of parameters to test
# The syntax 'regressor__alpha' is crucial. It tells the pipeline to look inside
# the step named 'regressor' and change its 'alpha' setting.
param_grid = {
    'regressor__alpha': [0.1, 1.0, 10.0, 50.0, 100.0, 200.0, 500.0]
}

# 2. Set up the Grid Search
# cv=5 means 5-fold cross-validation
# n_jobs=-1 tells your computer to use all available CPU cores to run this faster
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

# 3. Run the search (This will train 35 separate models in the background!)
print("Running Grid Search... Please wait.")
grid_search.fit(X_train, y_train)

# 4. Extract the winning model
print(f"WINNING PARAMETER: alpha = {grid_search.best_params_['regressor__alpha']}")
best_model = grid_search.best_estimator_

# 5. Evaluate the newly tuned model on the untouched test set
y_test_pred_tuned = best_model.predict(X_test)
tuned_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_tuned))
tuned_r2 = r2_score(y_test, y_test_pred_tuned)

print("-" * 30)
print("TUNED Model Performance (Test Set):")
print(f"RMSE: ${tuned_rmse:,.2f}")
print(f"R-squared: {tuned_r2:.4f}")
print("-" * 30)

Running Grid Search... Please wait.
WINNING PARAMETER: alpha = 10.0
------------------------------
TUNED Model Performance (Test Set):
RMSE: $29,001.98
R-squared: 0.8951
------------------------------


In [17]:
import joblib

# We are using 'best_model' which was extracted from your Grid Search earlier
# best_model = grid_search.best_estimator_

# Choose a file name for your saved pipeline
model_filename = "ames_ridge_model.pkl"

# Export the model to your hard drive
joblib.dump(best_model, model_filename)

print(f"Success! Model securely saved as {model_filename}")

Success! Model securely saved as ames_ridge_model.pkl
