In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import joblib

# Load the workbook and clean it in a single chained expression:
#   1. read the Excel sheet (make sure the path matches where your copy lives),
#   2. discard rows that have no SalePrice value, since they cannot be used as
#      training targets,
#   3. remove the "Id" column when the sheet has one; errors="ignore" turns
#      the drop into a no-op if the column is absent.
df = (
    pd.read_excel("/content/drive/MyDrive/HousePricePrediction.xlsx")
    .dropna(subset=["SalePrice"])
    .drop(columns=["Id"], errors="ignore")
)




# Separate features and target variable
X = df.drop(columns="SalePrice")
y = df["SalePrice"]

# Route every feature column to exactly one preprocessing branch by dtype.
# Numeric columns are selected positively with "number"; everything else
# (object/string, category, bool, datetime, ...) is treated as categorical.
# Selecting numerics as "anything that is not object" would send such
# non-numeric columns to the mean-imputer/scaler branch, which cannot
# process them.
numerical_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()



# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps prediction from failing
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# End-to-end estimator: preprocessing followed by a random forest regressor.
# random_state is fixed so repeated runs build the same forest.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42)),
])



# Hold out 20% of the rows so the fitted model can be checked on data it has
# not seen; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing and the regressor together on the training rows only, so
# the imputation/scaling statistics are not learned from the held-out rows.
model_pipeline.fit(X_train, y_train)

# Actually use the held-out split: report the pipeline's score on it before
# persisting anything (for a regressor as the final step this is R^2).
test_r2 = model_pipeline.score(X_test, y_test)
print(f"Held-out R^2: {test_r2:.4f}")

# Save the trained pipeline (model + preprocessing steps)
joblib.dump(model_pipeline, "house_price_model.pkl")

# Also save the feature column names the pipeline was built from, so callers
# can assemble inference inputs with the same columns later.
joblib.dump(X.columns.tolist(), "trained_columns.pkl")

print("✅ Model and preprocessing pipeline saved successfully.")