In [1]:
import pandas as pd

# Try loading with different delimiters
try:
    # Attempt loading with a comma delimiter
    data = pd.read_csv("housing.csv", delimiter=',')
    if data.shape[1] == 1:
        # If still one column, try with a different delimiter
        data = pd.read_csv("housing.csv", delimiter='\t')
    if data.shape[1] == 1:
        # If still one column, try with a semicolon
        data = pd.read_csv("housing.csv", delimiter=';')

    # If it's still one column, raise an error
    if data.shape[1] == 1:
        raise ValueError("The dataset appears to have only one column after trying common delimiters. Please check the file.")

except FileNotFoundError:
    print("The file 'housing.csv' was not found. Please check the file path.")
    exit()
except ValueError as e:
    print(str(e))
    exit()

# Display the corrected dataset structure
print("Dataset Info:")
print(data.info())
print("\nFirst Few Rows of the Dataset:")
print(data.head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   0.00632  505 non-null    float64
 1   18       505 non-null    float64
 2   2.31     505 non-null    float64
 3   0        505 non-null    int64  
 4   0.538    505 non-null    float64
 5   6.575    505 non-null    float64
 6   65.2     505 non-null    float64
 7   4.09     505 non-null    float64
 8   1        505 non-null    int64  
 9   296      505 non-null    int64  
 10  15.3     505 non-null    float64
 11  396.9    505 non-null    float64
 12  4.98     505 non-null    float64
 13  24       505 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.4 KB
None

First Few Rows of the Dataset:
   0.00632   18  2.31  0  0.538  6.575  65.2    4.09  1  296  15.3   396.9  \
0  0.02731  0.0  7.07  0  0.469  6.421  78.9  4.9671  2  242  17.8  396.90   
1  0.02729  0.0  7.07  0  0.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
data = pd.read_csv("housing.csv")

# Select features and target
X = data.iloc[:, :-1]  # Features: All columns except the last
y = data.iloc[:, -1]   # Target: Last column

# Convert target to numeric and handle NaN values
y = pd.to_numeric(y, errors="coerce")
if y.isnull().sum() > 0:
    print(f"Target variable contains {y.isnull().sum()} NaN values. Filling with mean.")
y.fillna(y.mean(), inplace=True)  # Replace NaN in the target with the mean value

# Ensure all feature values are numeric and handle NaN values
X_clean = X.apply(pd.to_numeric, errors="coerce")  # Convert features to numeric
if X_clean.isnull().sum().sum() > 0:
    print(f"Feature matrix contains {X_clean.isnull().sum().sum()} NaN values. Filling with 0.")
X_clean.fillna(0, inplace=True)  # Replace NaN in features with 0

# Align features and target to ensure the same indices
X_clean, y_clean = X_clean.align(y, join="inner", axis=0)

# Final check for NaN values
if X_clean.isnull().sum().sum() > 0 or y_clean.isnull().sum() > 0:
    print("NaN values remain after cleaning. Please check preprocessing steps.")
    exit()

# Normalize features (optional)
X_normalized = (X_clean - X_clean.mean()) / X_clean.std()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Ensure no NaN values in the training or test sets
if X_train.isnull().sum().sum() > 0 or y_train.isnull().sum() > 0:
    print("Training set contains NaN values after splitting.")
    exit()
if X_test.isnull().sum().sum() > 0 or y_test.isnull().sum() > 0:
    print("Test set contains NaN values after splitting.")
    exit()

# Check the split sizes
print(f"Training set - X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Test set - X_test: {X_test.shape}, y_test: {y_test.shape}")

### Linear Regression Model (from scratch)
class LinearRegression:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.epochs):
            y_pred = np.dot(X, self.weights) + self.bias
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

# Train and test Linear Regression
lr_model = LinearRegression(learning_rate=0.01, epochs=1000)
lr_model.fit(X_train.values, y_train.values)
y_pred_lr = lr_model.predict(X_test.values)

### Performance Comparison
def evaluate_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return rmse, r2

# Evaluate Linear Regression
metrics_lr = evaluate_model(y_test, y_pred_lr)
print(f"Linear Regression - RMSE: {metrics_lr[0]:.4f}, R²: {metrics_lr[1]:.4f}")

### Visualization (Optional)
# Example: Bar chart for feature importance (if implemented for Random Forest)
importances = ...
plt.bar(range(len(importances)), importances)
plt.xlabel("Features")
plt.ylabel("Importance")
plt.title("Feature Importance")
plt.show()


Training set - X_train: (404, 13), y_train: (404,)
Test set - X_test: (101, 13), y_test: (101,)


ValueError: Input contains NaN.