In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
print('Import Complete')

Import Complete


### Load DataSet

In [6]:
# Load the dataset
file_path = '/Users/ilaakshmishra/Documents/Machine_Learning/Assignment4/housing.csv'
housing_data = pd.read_csv(file_path)

### Preprocessing

In [7]:
# Handle Missing Values
imputer = SimpleImputer(strategy="median")
housing_num = housing_data.drop("ocean_proximity", axis=1)
imputed_data = imputer.fit_transform(housing_num)

In [8]:
# Encode Categorical Variables
housing_cat = housing_data[["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_encoded = cat_encoder.fit_transform(housing_cat)

### Feature Scaling

In [11]:
scaler = StandardScaler()
housing_scaled = scaler.fit_transform(imputed_data)

### Reconstruct Database

In [13]:
housing_preprocessed = pd.DataFrame(
    housing_scaled, columns=housing_num.columns, index=housing_data.index)
housing_cat_df = pd.DataFrame(
    housing_cat_encoded.toarray(), columns=cat_encoder.get_feature_names_out(), index=housing_data.index)
housing_preprocessed = pd.concat([housing_preprocessed, housing_cat_df], axis=1)

### Splitting Database

In [14]:
X = housing_preprocessed
y = housing_data["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### KNN

In [16]:
# K-Nearest Neighbors
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)


In [17]:
print("KNN Mean Squared Error:", mse_knn)
print("KNN R² Score:", r2_knn)

KNN Mean Squared Error: 318688517.9240795
KNN R² Score: 0.9756802247835719


### Random Forest

In [18]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [19]:
print("Random Forest Mean Squared Error:", mse_rf)
print("Random Forest R² Score:", r2_rf)

Random Forest Mean Squared Error: 5487.71285130814
Random Forest R² Score: 0.9999995812213636


### Model Evaluation and Interpretation

In [20]:
print("Model Evaluation and Interpretation:")

# K-Nearest Neighbors (KNN) Evaluation
print("\nK-Nearest Neighbors (KNN):")
print(f"Mean Squared Error (MSE): {mse_knn:.2f}")
print(f"R² Score: {r2_knn:.4f}")
print("Interpretation: A higher MSE suggests the model may not be very accurate in predictions.")
print("A high R² score indicates the model explains a significant portion of the variance in the target variable.")

# Random Forest Evaluation
print("\nRandom Forest:")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"R² Score: {r2_rf:.4f}")
print("Interpretation: A lower MSE indicates a better fit to the data.")
print("A near-perfect R² score might suggest overfitting, where the model performs exceptionally well on training data but may not generalize as well.")

# Overall Comparison
print("\nOverall Comparison:")
if mse_knn < mse_rf:
    print("KNN has a lower MSE, indicating better accuracy in this case.")
else:
    print("Random Forest has a lower MSE, indicating better accuracy in this case.")

if r2_knn < r2_rf:
    print("Random Forest has a higher R² score, indicating it explains more variance in the target variable.")
else:
    print("KNN has a higher R² score, indicating it explains more variance in the target variable.")

print("\nFurther Steps:")
print("Consider cross-validation for a more robust evaluation.")
print("Hyperparameter tuning may improve model performance.")
print("Investigate feature importance, especially for Random Forest.")
print("Compare with other regression algorithms.")
print("Conduct error analysis to understand where and why the models make errors.")


Model Evaluation and Interpretation:

K-Nearest Neighbors (KNN):
Mean Squared Error (MSE): 318688517.92
R² Score: 0.9757
Interpretation: A higher MSE suggests the model may not be very accurate in predictions.
A high R² score indicates the model explains a significant portion of the variance in the target variable.

Random Forest:
Mean Squared Error (MSE): 5487.71
R² Score: 1.0000
Interpretation: A lower MSE indicates a better fit to the data.
A near-perfect R² score might suggest overfitting, where the model performs exceptionally well on training data but may not generalize as well.

Overall Comparison:
Random Forest has a lower MSE, indicating better accuracy in this case.
Random Forest has a higher R² score, indicating it explains more variance in the target variable.

Further Steps:
Consider cross-validation for a more robust evaluation.
Hyperparameter tuning may improve model performance.
Investigate feature importance, especially for Random Forest.
Compare with other regression 