In [None]:
import pandas as pd

# Location of the raw housing CSV (served from raw.githubusercontent.com)
url = 'https://raw.githubusercontent.com/emadrigals104/PLFPython/main/Datasets/housing.csv'

# Fetch the file and parse it straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows
df.head(n=5)


In [None]:
# Summary statistics for the loaded frame; left as the cell's last expression
# so the notebook renders the result as the cell output.
df.describe()


In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Fill the gaps in `total_bedrooms` with the column median.
# NOTE(review): the median is computed over every row of `df`, i.e. before the
# train/test split performed later in the notebook — confirm this is intended,
# since test rows then influence the fill value.
imputer = SimpleImputer(strategy='median')

# The imputer works on 2-D input, so take the column as an (n_rows, 1) array
total_bedrooms = df[['total_bedrooms']].values

# Learn the median and write the filled values back into the frame
df['total_bedrooms'] = imputer.fit_transform(total_bedrooms)

# Per-column count of remaining missing values (all should now be zero)
missing_per_column = df.isnull().sum()
print(missing_per_column)


In [None]:
# Convert the categorical `ocean_proximity` column into dummy/indicator columns.
# The result is assigned back to `df`, which drops the source column, so the
# membership check keeps this cell safe to re-run: without it, a second
# execution raises KeyError because `ocean_proximity` no longer exists.
if 'ocean_proximity' in df.columns:
    df = pd.get_dummies(df, columns=['ocean_proximity'])

# Display the first 5 rows of the encoded dataframe
print(df.head())

In [None]:
# Split the frame into the prediction target (y) and the model inputs (X)
target_col = 'median_house_value'
y = df[target_col]
X = df.drop(columns=target_col)

# Report the dimensions of both pieces
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Preview the first 5 rows of the features, then of the target
print('Features(X):')
print(X.head())
print('Target (Y):')
print(y.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each of the four resulting pieces
shape_report = {
    "Shape of X_train:": X_train,
    "Shape of X_test:": X_test,
    "Shape of y_train:": y_train,
    "Shape of y_test:": y_test,
}
for label, part in shape_report.items():
    print(label, part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Standardize the features: the scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Candidate regressors, keyed by the label used when printing results.
# Every estimator constructed with a seed uses random_state=42.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
])


In [None]:
# Fit each candidate on the scaled training data and report its in-sample error
for name, model in models.items():
    # scikit-learn's fit() returns the fitted estimator, so predict() chains onto it
    y_pred_train = model.fit(X_train_scaled, y_train).predict(X_train_scaled)

    # RMSE is the square root of the mean squared error
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)

    print(f"Model: {name}")
    print(f"Training RMSE: {rmse_train:.2f}")
    print("-" * 30)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Score every model in `models` on the test split and plot its predictions
# against the actual target values.
for name, model in models.items():
    y_pred_test = model.predict(X_test_scaled)

    # RMSE is the square root of the mean squared error on the test data
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)

    print(f"Model: {name}")
    print(f"Test RMSE: {rmse_test:.2f}")

    # End points of the red dashed y = x reference line (a perfect prediction)
    lo, hi = y_test.min(), y_test.max()

    # One figure per model, built through the explicit Axes interface
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred_test, alpha=0.5)
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax.set_xlabel("Actual Values")
    ax.set_ylabel("Predicted Values")
    ax.set_title(f"{name} - Actual vs. Predicted Values (Test Set)")
    ax.grid(True)
    plt.show()

    print("-" * 30)