## Step 1:Loading and Preprocessing

In [6]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the dataset
california_housing = fetch_california_housing()
data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
data['target'] = california_housing.target

print(data.head())

# Check for missing values
print(data.isnull().sum())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64


In [5]:
# Feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop('target', axis=1))
scaled_data = pd.DataFrame(scaled_features, columns=data.columns[:-1])
scaled_data['target'] = data['target']

# Display the first few rows of the scaled data
print(scaled_data.head())

     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  target  
0  -1.327835   4.526  
1  -1.322844   3.585  
2  -1.332827   3.521  
3  -1.337818   3.413  
4  -1.337818   3.422  


## Step 2: Regression Algorithm Implementation

In [10]:
# Split the data
X = scaled_data.drop('target', axis=1)
y = scaled_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree Regressor
tree_model=DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regressor
forest_model=RandomForestRegressor(random_state=42)
forest_model.fit(X_train, y_train)

In [14]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor
gb_model=GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

In [15]:
from sklearn.svm import SVR

# Support Vector Regressor
svr_model=GradientBoostingRegressor(random_state=42)
svr_model.fit(X_train, y_train)

## Step 3: Model Evaluation and Comparison

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create a dictionary to store the results
results = {}

# Evaluate Linear Regression
y_pred_linear = linear_model.predict(X_test)
results['Linear Regression'] = {
    'MSE': mean_squared_error(y_test, y_pred_linear),
    'MAE': mean_absolute_error(y_test, y_pred_linear),
    'R²': r2_score(y_test, y_pred_linear)
}

# Evaluate Decision Tree Regressor
y_pred_tree = tree_model.predict(X_test)
results['Decision Tree'] = {
    'MSE': mean_squared_error(y_test, y_pred_tree),
    'MAE': mean_absolute_error(y_test, y_pred_tree),
    'R²': r2_score(y_test, y_pred_tree)
}

# Evaluate Random Forest Regressor
y_pred_forest = forest_model.predict(X_test)
results['Random Forest'] = {
    'MSE': mean_squared_error(y_test, y_pred_forest),
    'MAE': mean_absolute_error(y_test, y_pred_forest),
    'R²': r2_score(y_test, y_pred_forest)
}

# Evaluate Gradient Boosting Regressor
y_pred_gb = gb_model.predict(X_test)
results['Gradient Boosting'] = {
    'MSE': mean_squared_error(y_test, y_pred_gb),
    'MAE': mean_absolute_error(y_test, y_pred_gb),
    'R²': r2_score(y_test, y_pred_gb)
}

# Evaluate Support Vector Regressor
y_pred_svr = svr_model.predict(X_test)
results['Support Vector Regressor'] = {
    'MSE': mean_squared_error(y_test, y_pred_svr),
    'MAE': mean_absolute_error(y_test, y_pred_svr),
    'R²': r2_score(y_test, y_pred_svr)
}

# Display the results
results_df = pd.DataFrame(results).T
print(results_df)

                               MSE       MAE        R²
Linear Regression         0.555892  0.533200  0.575788
Decision Tree             0.494272  0.453784  0.622811
Random Forest             0.255498  0.327613  0.805024
Gradient Boosting         0.293999  0.371650  0.775643
Support Vector Regressor  0.293999  0.371650  0.775643


## Step 4: Compare the Results

In [17]:
# Identify the best and worst performing algorithms
best_model = results_df['R²'].idxmax()
worst_model = results_df['R²'].idxmin()

print(f"Best Performing Model: {best_model} with R²: {results_df.loc[best_model, 'R²']}")
print(f"Worst Performing Model: {worst_model} with R²: {results_df.loc[worst_model, 'R²']}")

Best Performing Model: Random Forest with R²: 0.805024407701793
Worst Performing Model: Linear Regression with R²: 0.575787706032451
