In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from joblib import dump, load


In [None]:
# Colab-only step: attach Google Drive to this runtime's filesystem so files
# stored there can be read through ordinary paths.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Load the dataset
data = pd.read_csv("AmesHousing.csv")

# Handle missing values: forward-fill, then back-fill so that gaps in the
# leading rows (which have no previous value to copy) are covered as well.
# `fillna(method="ffill")` is deprecated in recent pandas; `ffill()` is the
# supported spelling of the same operation.
# NOTE(review): copying a neighbouring row's value is a placeholder strategy
# for unordered tabular data - confirm it is acceptable for this dataset.
data = data.ffill().bfill()

# Separate features and target variable
features = data.drop(columns="SalePrice")
target = data["SalePrice"]

# Split the columns by dtype so each transformer only sees the kind of data
# it is meant for. Previously every column (numeric ones included) was
# one-hot encoded, turning each distinct number into its own dummy column.
categorical_cols = features.select_dtypes(exclude="number").columns
numeric_cols = features.select_dtypes(include="number").columns

# Encode categorical features
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_features = encoder.fit_transform(features[categorical_cols]).toarray()

# Standardize numerical features
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(features[numeric_cols])

# Final design matrix consumed by the split cell: scaled numeric columns
# followed by the 0/1 dummy columns.
# NOTE(review): encoder and scaler are fitted on the full dataset before the
# train/validation/test split, so held-out rows influence the fitted
# statistics - consider fitting on the training portion only.
scaled_features = np.hstack([scaled_numeric, encoded_features])


FileNotFoundError: [Errno 2] No such file or directory: 'AmesHousing.csv'

In [None]:
# Two-stage split with a fixed seed.
# Stage 1: hold out 20% of all rows as the validation set.
X_rest, X_val, y_rest, y_val = train_test_split(
    scaled_features,
    target,
    test_size=0.2,
    random_state=42,
)

# Stage 2: carve the test set out of the remaining 80%.
# 0.25 of that remainder is 20% of the full data, giving a 60/20/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X_rest,
    y_rest,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Seed TensorFlow so the network's weight initialisation and batch shuffling
# are repeatable, matching the fixed random_state used by the other steps.
tf.random.set_seed(42)

# Keras' fit() trains for a single epoch unless told otherwise, which leaves
# the network essentially untrained when it is compared with the other models.
NN_EPOCHS = 50

# Candidate regressors, each paired with the label used when reporting.
models = [
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    (
        "TensorFlow Neural Network",
        tf.keras.models.Sequential(
            [
                tf.keras.layers.Dense(128, activation="relu"),
                tf.keras.layers.Dense(64, activation="relu"),
                tf.keras.layers.Dense(1),
            ]
        ),
    ),
]

# Fit every candidate on the training split and keep the one with the lowest
# mean squared error on the validation split.
best_model = None
best_mse = np.inf
for name, model in models:
    if isinstance(model, tf.keras.Model):
        # Keras models must be compiled before fitting and take their
        # training length explicitly.
        model.compile(optimizer="adam", loss="mse")
        model.fit(X_train, y_train, epochs=NN_EPOCHS, verbose=0)
    else:
        model.fit(X_train, y_train)

    # The Keras model returns predictions shaped (n, 1); flatten so every
    # candidate is scored on a 1-D vector aligned with y_val.
    y_pred = np.asarray(model.predict(X_val)).ravel()
    mse = mean_squared_error(y_val, y_pred)
    print(f"{name} MSE: {mse}")

    if mse < best_mse:
        best_model = model
        best_mse = mse


In [None]:
import pandas as pd

# Wrap the train/test splits in DataFrames so they can be written with to_csv.
X_train_df, X_test_df = pd.DataFrame(X_train), pd.DataFrame(X_test)
y_train_df, y_test_df = pd.DataFrame(y_train), pd.DataFrame(y_test)

# Write each frame to its own CSV in the working directory, without the index.
for frame, csv_name in (
    (X_train_df, "X_train.csv"),
    (X_test_df, "X_test.csv"),
    (y_train_df, "y_train.csv"),
    (y_test_df, "y_test.csv"),
):
    frame.to_csv(csv_name, index=False)


In [None]:
# Persist the selected model. Anything that is not one of the two
# scikit-learn estimators is saved through its own save() method to HDF5;
# the scikit-learn estimators are serialised with joblib.
if not isinstance(best_model, (LinearRegression, RandomForestRegressor)):
    best_model.save("best_model.h5")
else:
    dump(best_model, "best_model.joblib")


NameError: name 'best_model' is not defined

In [None]:
import os

# Choose the file name and the matching serialiser in one place:
# joblib for the scikit-learn estimators, the model's own save() otherwise.
if isinstance(best_model, (LinearRegression, RandomForestRegressor)):
    file_name = "best_model.joblib"
    dump(best_model, file_name)
else:
    file_name = "best_model.h5"
    best_model.save(file_name)

# Resolve the saved file against the current working directory and report it.
file_path = os.path.abspath(file_name)
print(f"The model is saved at: {file_path}")


NameError: name 'best_model' is not defined

In [None]:
# Reload the persisted model using the same rule the saving cells apply:
# scikit-learn estimators were written with joblib to "best_model.joblib",
# any other model (the Keras network) was saved to "best_model.h5".
# Loading the joblib file unconditionally fails, or silently picks up a stale
# file, whenever the Keras model was the one selected.
if isinstance(best_model, (LinearRegression, RandomForestRegressor)):
    loaded_model = load("best_model.joblib")
else:
    # compile=False: only predict() is needed here, so the training
    # configuration does not have to be restored.
    loaded_model = tf.keras.models.load_model("best_model.h5", compile=False)

# Flatten so Keras' (n, 1) predictions and scikit-learn's 1-D predictions are
# scored the same way against y_test.
y_pred_test = np.asarray(loaded_model.predict(X_test)).ravel()
mse_test = mean_squared_error(y_test, y_pred_test)
print("Test Set MSE:", mse_test)


In [None]:
import gzip

# Write gzip-compressed CSV copies of the feature frames (text mode, no index).
for frame, gz_path in (
    (X_train_df, "X_train_compressed.csv.gz"),
    (X_test_df, "X_test_compressed.csv.gz"),
):
    with gzip.open(gz_path, "wt") as gz_out:
        frame.to_csv(gz_out, index=False)
