In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [3]:
# Load MNIST dataset
# Fetch version 1 of 'mnist_784' from OpenML (as_frame=False asks for non-DataFrame output)
# and print the description text that ships with the dataset.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    cache=True,
    as_frame=False,
    parser='auto',
)
print(mnist.DESCR)

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [4]:
# Check if Data is Loaded Correctly
# Pull the feature matrix and the labels out of the fetched dataset; the labels are cast to
# unsigned 8-bit integers. Printing both shapes confirms the sample counts line up.
X = mnist.data
y = mnist.target.astype(np.uint8)
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (70000, 784)
Shape of y: (70000,)


In [5]:
# Split dataset into training and testing sets
# 20% of the samples are held out for the final evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Linear Regression
# Baseline model, evaluated with 3-fold cross-validation on the training split. The scorer
# returns negated MSE, so the sign is flipped before taking the square root.
lin_reg = LinearRegression()
scores_lr = cross_validate(
    lin_reg,
    X_train,
    y_train,
    cv=3,
    scoring='neg_mean_squared_error',
)["test_score"]
print('Linear Regression RMSE for each iteration:', np.sqrt(-scores_lr))
# Overall figure: square root of the mean per-fold MSE.
print('Linear Regression RMSE:', np.sqrt(-np.mean(scores_lr)))
# NOTE(review): the recorded run reports fold RMSEs between ~2e6 and ~3e10, far outside the
# range of the labels — presumably numerical instability of the unregularized fit on the raw
# features; confirm before relying on this baseline.

Linear Regression RMSE for each iteration: [1.97636959e+06 1.66995996e+08 2.93905758e+10]
Linear Regression RMSE: 16968930778.638704


In [7]:
# Lasso Regression
# Tune the regularization strength `alpha` with a 5-fold grid search on the training split.
lasso = Lasso()
hyper_param_lasso = {'alpha': [0.01, 1, 2, 5, 10]}
# Select alpha with the same metric that is reported below and used to compare models
# (negated MSE). Without an explicit `scoring`, GridSearchCV uses the estimator's own
# `score` method, which is R^2 for regressors.
lasso_reg = GridSearchCV(lasso, hyper_param_lasso, cv=5, scoring='neg_mean_squared_error')
lasso_reg.fit(X_train, y_train)
print('Best Lasso parameters:', lasso_reg.best_params_)
# The GridSearchCV object itself is cross-validated, so the full grid search is repeated
# inside each of the 3 outer folds: the scores describe the tuning procedure as a whole
# rather than one fixed alpha.
scores_lasso = cross_validate(lasso_reg, X_train, y_train, cv=3, scoring='neg_mean_squared_error')["test_score"]
# The scorer returns negated MSE; flip the sign before taking the square root.
print('Lasso Regression RMSE for each iteration:', np.sqrt(-scores_lasso))
print('Lasso Regression RMSE:', np.sqrt(np.mean(-scores_lasso)))
# NOTE(review): the recorded run selected alpha=0.01, the smallest value in the grid —
# consider probing smaller values as well.

Best Lasso parameters: {'alpha': 0.01}
Lasso Regression RMSE for each iteration: [1.81673881 1.81069318 1.80944323]
Lasso Regression RMSE: 1.8122945403900494


In [None]:
# Random Forest Regression
# Seed the forest (same seed as the train/test split) so the grid search and the reported
# scores are repeatable from run to run, and let each forest build its trees on all
# available cores; the seed fixes the result regardless of the number of workers.
forest = RandomForestRegressor(random_state=42, n_jobs=-1)
param_grid = {'n_estimators': [10, 100, 120], 'max_depth': [None, 10]}
# Select hyper-parameters with the same metric that is reported below and used to compare
# models (negated MSE). Without an explicit `scoring`, GridSearchCV uses the estimator's
# own `score` method, which is R^2 for regressors.
forest_reg = GridSearchCV(forest, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
forest_reg.fit(X_train, y_train)
print('Best Random Forest parameters:', forest_reg.best_params_)
# The GridSearchCV object itself is cross-validated, so the full grid search is repeated
# inside each of the 3 outer folds. This is by far the most expensive step of the notebook.
scores_forest = cross_validate(forest_reg, X_train, y_train, cv=3, scoring='neg_mean_squared_error')["test_score"]
# The scorer returns negated MSE; flip the sign before taking the square root.
print('Random Forest Regression RMSE for each iteration:', np.sqrt(-scores_forest))
print('Random Forest Regression RMSE:', np.sqrt(np.mean(-scores_forest)))

In [None]:
# Choosing the best model based on RMSE
# Compare all three candidates at once. A chained conditional that first tests linear vs.
# Lasso would return the linear model whenever it beats Lasso, without ever checking it
# against the forest. Ranking by mean cross-validated MSE gives the same order as ranking
# by its square root (RMSE).
candidates = [
    (np.mean(-scores_lr), lin_reg),
    (np.mean(-scores_lasso), lasso_reg.best_estimator_),
    (np.mean(-scores_forest), forest_reg.best_estimator_),
]
# Keep the estimator with the lowest mean MSE; on an exact tie the earlier entry wins.
best_model = min(candidates, key=lambda candidate: candidate[0])[1]

In [None]:
# Training the best model
# Fit the selected estimator on the full training split before scoring it on the held-out
# test set. As the last expression of the cell, the value returned by `fit` is displayed.
# NOTE(review): if the selected model is a GridSearchCV `best_estimator_`, it was presumably
# already refit on X_train by the search (refit defaults to True) — confirm whether this
# extra fit is needed in that case.
best_model.fit(X_train, y_train)

In [None]:
# Evaluating the best model on the test set
# Predict on the held-out samples and report the root mean squared error.
y_pred = best_model.predict(X_test)
# Take the square root explicitly instead of passing `squared=False`: that keyword is
# deprecated in scikit-learn 1.4 and removed in later releases, while this form works on
# every version. For the single-output target used here both give the same value.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Best Model RMSE on test set:', test_rmse)