In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# For reproducibility: seed NumPy's legacy global RNG used by the np.random.* calls below
np.random.seed(42)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
# Prefer the new name, fall back to the legacy one on older installs, and
# use the default style if neither is available so the notebook still runs.
_WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(
    next((name for name in _WHITEGRID_STYLES if name in plt.style.available), 'default')
)


In [None]:
# Build a small synthetic regression problem: y = 4 + 3x + Gaussian noise
np.random.seed(42)  # re-seed so this cell yields the same data even when re-run on its own
X = 2 * np.random.rand(100, 1)      # 100 samples, 1 feature, uniform on [0, 2)
noise = np.random.randn(100, 1)     # standard-normal noise, one value per sample
y = 4 + 3 * X + noise               # true relationship: intercept 4, slope 3

# Scatter plot of the raw data
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.7)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Synthetic Data')
plt.show()

# Implement linear regression from scratch using the normal equation
class LinearRegressionFromScratch:
    """Ordinary least-squares linear regression solved in closed form.

    The fitted parameters are stored in ``self.theta``; row 0 is the
    intercept and the remaining rows are the feature coefficients.
    ``self.theta`` is ``None`` until :meth:`fit` has been called.
    """

    def __init__(self):
        self.theta = None  # Model parameters (intercept first); None until fitted

    @staticmethod
    def _add_bias(X):
        """Return ``X`` with a leading column of ones (the intercept term)."""
        X = np.asarray(X)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate the parameters from training data.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like of targets, one row per sample

        Returns
        -------
        self, so calls can be chained.
        """
        X_b = self._add_bias(X)

        # Normal equation: theta = (X^T X)^(-1) X^T y.
        # (X^T X)^(-1) X^T is the pseudo-inverse of X_b whenever X^T X is
        # invertible, so pinv gives the same answer in that case. Unlike an
        # explicit inverse it also copes with collinear features, where
        # X^T X is singular and np.linalg.inv would raise or return garbage.
        self.theta = np.linalg.pinv(X_b).dot(y)

        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted parameters.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.theta is None:
            raise RuntimeError("Model is not fitted yet; call fit() before predict().")

        return self._add_bias(X).dot(self.theta)

    def get_params(self):
        """Return the intercept and the first feature's coefficient.

        Returns
        -------
        dict with keys ``"intercept"`` and ``"coefficient"``, or ``None`` if
        the model has not been fitted.
        """
        if self.theta is None:
            return None
        # View theta as 2-D so the lookup works whether y was passed to fit()
        # as a column vector (theta is (k, 1)) or as a 1-D array (theta is (k,)).
        theta = np.asarray(self.theta)
        theta = theta.reshape(theta.shape[0], -1)
        return {"intercept": theta[0][0], "coefficient": theta[1][0]}

# Fit the from-scratch model on the synthetic data
model = LinearRegressionFromScratch()
model.fit(X, y)

# Report the learned parameters next to the ones used to generate the data
params = model.get_params()
intercept_hat = params['intercept']
slope_hat = params['coefficient']
print(f"Intercept (β₀): {intercept_hat:.4f}")
print(f"Coefficient (β₁): {slope_hat:.4f}")
print("True parameters: Intercept = 4, Coefficient = 3")

# Two x-values are enough to draw the fitted straight line
X_new = np.array([[0], [2]])
y_pred = model.predict(X_new)

# Overlay the fitted line on the data
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.7, label='Data points')
plt.plot(X_new, y_pred, 'r-', linewidth=2, label='Linear regression model')
plt.title('Linear Regression from Scratch')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# In-sample error metrics, computed by hand from the residuals
y_pred_all = model.predict(X)
squared_errors = (y - y_pred_all) ** 2
total_sum_of_squares = np.sum((y - np.mean(y)) ** 2)
mse = np.mean(squared_errors)
rmse = np.sqrt(mse)
r2 = 1 - (np.sum(squared_errors) / total_sum_of_squares)

for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
# Fit scikit-learn's LinearRegression on the same synthetic data
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Pull out the scalar parameters once; the [0] / [0][0] indexing matches the
# array-valued intercept_ and coef_ produced for the 2-D target used here
sk_intercept = lin_reg.intercept_[0]
sk_coefficient = lin_reg.coef_[0][0]
print(f"Intercept (β₀): {sk_intercept:.4f}")
print(f"Coefficient (β₁): {sk_coefficient:.4f}")
print("True parameters: Intercept = 4, Coefficient = 3")

# In-sample predictions and error metrics via sklearn.metrics
y_pred_sk = lin_reg.predict(X)
mse_sk = mean_squared_error(y, y_pred_sk)
rmse_sk = np.sqrt(mse_sk)
r2_sk = r2_score(y, y_pred_sk)

for metric_label, metric_value in (
    ("Mean Squared Error (MSE)", mse_sk),
    ("Root Mean Squared Error (RMSE)", rmse_sk),
    ("R-squared (R²)", r2_sk),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Absolute differences against the from-scratch implementation
intercept_gap = abs(params['intercept'] - sk_intercept)
coefficient_gap = abs(params['coefficient'] - sk_coefficient)
mse_gap = abs(mse - mse_sk)
print("\nComparison with our implementation:")
print(f"Intercept difference: {intercept_gap:.10f}")
print(f"Coefficient difference: {coefficient_gap:.10f}")
print(f"MSE difference: {mse_gap:.10f}")

# Plot the scikit-learn fit over the data
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.7, label='Data points')
plt.plot(X_new, lin_reg.predict(X_new), 'r-', linewidth=2, label='scikit-learn Linear Regression')
plt.title('Linear Regression with scikit-learn')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()


In [None]:
# Fetch the California Housing dataset
housing = fetch_california_housing()

# Wrap features and target in pandas objects for easier inspection
X_housing = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y_housing = pd.Series(housing.target)

# Quick overview of what was loaded
dataset_shape = X_housing.shape
feature_list = list(X_housing.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"Features: {feature_list}")
print("Target: Median house value (in $100,000)")

# Per-feature summary statistics (kept as the last expression so the cell displays it)
X_housing.describe().round(2)


In [None]:
# Explore the data
plt.figure(figsize=(15, 10))

# Visualize feature distributions: one histogram + KDE per feature on a 3x3 grid
for i, feature in enumerate(X_housing.columns):
    plt.subplot(3, 3, i+1)
    sns.histplot(X_housing[feature], kde=True)
    plt.title(f'Distribution of {feature}')
# Lay the grid out once, after every panel exists, rather than on each iteration
plt.tight_layout()
plt.show()

# Explore target distribution
plt.figure(figsize=(10, 6))
sns.histplot(y_housing, kde=True)
plt.title('Distribution of Median House Value')
plt.xlabel('Median House Value ($100,000)')
plt.show()

# Check correlations between every feature and the target.
# The target gets an explicit column name here so it is labelled in the
# heatmap; a Series without a name would appear as a column called "0".
correlation_matrix = pd.concat(
    [X_housing, y_housing.rename('MedHouseVal')], axis=1
).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_housing, y_housing, test_size=0.2, random_state=42
)

for split_label, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} set shape: {split_features.shape}")

# Standardize the features: statistics are learned from the training rows only
# and then re-used unchanged on the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a multiple linear regression on the standardized features
lin_reg_multi = LinearRegression()
lin_reg_multi.fit(X_train_scaled, y_train)

# Tabulate the coefficients, largest first
coefficients = pd.DataFrame(
    {'Feature': X_housing.columns, 'Coefficient': lin_reg_multi.coef_}
).sort_values('Coefficient', ascending=False)

print(f"Intercept: {lin_reg_multi.intercept_:.4f}")
print("\nFeature Coefficients:")
print(coefficients)

# Horizontal bar chart of the coefficients
plt.figure(figsize=(12, 6))
plt.barh(coefficients['Feature'], coefficients['Coefficient'])
plt.title('Feature Importance (Coefficient Magnitude)')
plt.xlabel('Coefficient Value')
plt.grid(axis='x')
plt.tight_layout()
plt.show()

# Predictions on both splits
y_train_pred = lin_reg_multi.predict(X_train_scaled)
y_test_pred = lin_reg_multi.predict(X_test_scaled)

# Error metrics for the training split
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Error metrics for the test split
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

metric_report = (
    ("Training Set", (train_mse, train_rmse, train_mae, train_r2)),
    ("Test Set", (test_mse, test_rmse, test_mae, test_r2)),
)
for report_heading, metric_values in metric_report:
    print(f"\n{report_heading} Metrics:")
    for metric_label, metric_value in zip(("MSE", "RMSE", "MAE", "R²"), metric_values):
        print(f"{metric_label}: {metric_value:.4f}")

# Actual vs predicted, one panel per split; the dashed red line marks a perfect prediction
actual_vs_predicted_panels = (
    ('Training Set: Actual vs Predicted', y_train, y_train_pred),
    ('Test Set: Actual vs Predicted', y_test, y_test_pred),
)
plt.figure(figsize=(12, 5))
for panel_position, (panel_title, actual, predicted) in enumerate(actual_vs_predicted_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.scatter(actual, predicted, alpha=0.5)
    diagonal = [actual.min(), actual.max()]
    plt.plot(diagonal, diagonal, 'r--')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Residuals (actual minus predicted) for each split
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# Residuals against predictions, one panel per split; the dashed red line marks zero error
residual_panels = (
    ('Training Set: Residual Plot', y_train_pred, train_residuals),
    ('Test Set: Residual Plot', y_test_pred, test_residuals),
)
plt.figure(figsize=(12, 5))
for panel_position, (panel_title, predicted, residuals) in enumerate(residual_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.scatter(predicted, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Train Ridge and Lasso models with different regularization strengths
alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Store results: one entry per fitted model, kept as parallel lists
results = {'alpha': [], 'model_type': [], 'train_r2': [], 'test_r2': []}

# Baseline Linear Regression (for comparison); alpha=0 stands for "no regularization"
results['alpha'].append(0)
results['model_type'].append('Linear')
results['train_r2'].append(train_r2)
results['test_r2'].append(test_r2)

# Ridge Regression (L2 penalty)
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)

    # Evaluate on training and test sets (score() returns R²)
    train_score = ridge.score(X_train_scaled, y_train)
    test_score = ridge.score(X_test_scaled, y_test)

    # Store results
    results['alpha'].append(alpha)
    results['model_type'].append('Ridge')
    results['train_r2'].append(train_score)
    results['test_r2'].append(test_score)

# Lasso Regression (L1 penalty); max_iter is raised above the default
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train_scaled, y_train)

    # Evaluate on training and test sets (score() returns R²)
    train_score = lasso.score(X_train_scaled, y_train)
    test_score = lasso.score(X_test_scaled, y_test)

    # Store results
    results['alpha'].append(alpha)
    results['model_type'].append('Lasso')
    results['train_r2'].append(train_score)
    results['test_r2'].append(test_score)

# Convert to DataFrame for easier plotting
results_df = pd.DataFrame(results)

# Plot R² against alpha, one panel per split
plt.figure(figsize=(14, 6))
score_panels = (
    ('train_r2', 'Training Set R² vs Alpha'),
    ('test_r2', 'Test Set R² vs Alpha'),
)
for panel_position, (score_column, panel_title) in enumerate(score_panels, start=1):
    plt.subplot(1, 2, panel_position)
    # The loop variable is named model_name (not model) so it does not
    # overwrite the fitted `model` object created earlier in the notebook.
    for model_name in ['Linear', 'Ridge', 'Lasso']:
        model_data = results_df[results_df['model_type'] == model_name]
        if model_name == 'Linear':
            # The baseline has no alpha, so draw it as a horizontal reference line
            plt.axhline(y=model_data[score_column].values[0], color='blue', linestyle='-', label='Linear')
        else:
            plt.plot(model_data['alpha'], model_data[score_column], marker='o', label=model_name)
    plt.xscale('log')
    plt.xlabel('Alpha (regularization strength)')
    plt.ylabel('R² Score')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

# Find best models.
# idxmax() returns an index *label* of results_df, so the row must be looked
# up with .loc. Positional .iloc on the filtered frame would select the wrong
# row for Ridge and go out of bounds for Lasso.
# NOTE: picking alpha by test-set R² uses the test data for model selection;
# for a real evaluation choose alpha on a validation split or via cross-validation.
ridge_results = results_df[results_df['model_type'] == 'Ridge']
lasso_results = results_df[results_df['model_type'] == 'Lasso']
best_ridge = ridge_results.loc[ridge_results['test_r2'].idxmax()]
best_lasso = lasso_results.loc[lasso_results['test_r2'].idxmax()]

print(f"Best Ridge model: alpha = {best_ridge['alpha']}, Test R² = {best_ridge['test_r2']:.4f}")
print(f"Best Lasso model: alpha = {best_lasso['alpha']}, Test R² = {best_lasso['test_r2']:.4f}")
print(f"Linear Regression: Test R² = {test_r2:.4f}")

# Train the best Ridge model to examine coefficients
best_ridge_model = Ridge(alpha=best_ridge['alpha'])
best_ridge_model.fit(X_train_scaled, y_train)

# Train the best Lasso model to examine coefficients
best_lasso_model = Lasso(alpha=best_lasso['alpha'], max_iter=10000)
best_lasso_model.fit(X_train_scaled, y_train)

# Compare coefficients side by side, one row per feature
coef_comparison = pd.DataFrame({
    'Feature': X_housing.columns,
    'Linear': lin_reg_multi.coef_,
    'Ridge': best_ridge_model.coef_,
    'Lasso': best_lasso_model.coef_
})

print("\nCoefficient Comparison:")
print(coef_comparison)


In [None]:
# Grouped bar chart of the coefficients each model assigns to every feature
plt.figure(figsize=(14, 8))

# Long format: one row per (Feature, Model) pair, as seaborn's hue grouping expects
coef_plot_data = coef_comparison.melt(
    id_vars=['Feature'], var_name='Model', value_name='Coefficient'
)

sns.barplot(data=coef_plot_data, x='Feature', y='Coefficient', hue='Model')
plt.xticks(rotation=45, ha='right')
plt.title('Coefficient Comparison Across Models')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Test-set predictions from the baseline and the two selected regularized models
linear_preds = lin_reg_multi.predict(X_test_scaled)
ridge_preds = best_ridge_model.predict(X_test_scaled)
lasso_preds = best_lasso_model.predict(X_test_scaled)

# Actual vs predicted, one panel per model; the dashed red line marks a perfect prediction
prediction_panels = (
    ('Linear Regression', linear_preds),
    (f'Ridge Regression (α={best_ridge["alpha"]})', ridge_preds),
    (f'Lasso Regression (α={best_lasso["alpha"]})', lasso_preds),
)
plt.figure(figsize=(15, 5))
for panel_position, (panel_title, predictions) in enumerate(prediction_panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.scatter(y_test, predictions, alpha=0.5)
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal, 'r--')
    plt.title(panel_title)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')

plt.tight_layout()
plt.show()
