
# Predicting House Prices with MLflow and DVC

This notebook demonstrates how to:
1. Use the `train.csv` dataset to train a predictive model for house prices.
2. Track experiments with MLflow (integrated with DagsHub).
3. Manage data versions with DVC.
4. Predict house prices for the `test.csv` dataset.
5. Save the results in the format of `sample_submission.csv`.


In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
import os

# DVC-specific imports
from dvc.api import open as dvc_open


In [None]:

# Load the DVC-tracked train/test CSV files into DataFrames.
def _read_dvc_csv(path):
    """Open a DVC-tracked file in text mode and parse it as CSV.

    Parameters
    ----------
    path : str
        Location of the tracked file, relative to the DVC project root.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # `mode` must be passed by keyword: the second positional parameter of
    # dvc.api.open is `repo`, so dvc_open(path, "r") would point DVC at a
    # repository called "r" instead of selecting read mode.
    with dvc_open(path, mode="r") as handle:
        return pd.read_csv(handle)


# Load train.csv using DVC
train_path = "data/train.csv"
train_data = _read_dvc_csv(train_path)

# Load test.csv using DVC
test_path = "data/test.csv"
test_data = _read_dvc_csv(test_path)

# Display the first few rows of the train dataset
train_data.head()


In [None]:

# Data preprocessing: impute missing numeric values, pick the model inputs,
# and hold out a validation split.

# Handling missing values (basic strategy): fill each numeric column with its
# own median. `numeric_only=True` is required because pandas >= 2.0 raises a
# TypeError when median() meets non-numeric columns; non-numeric columns are
# left untouched. Rebinding the names (rather than inplace=True) yields the
# same filled frames for the cells below.
# NOTE(review): each dataset is filled with its own medians, as in the original;
# consider reusing the training medians for the test set.
train_data = train_data.fillna(train_data.median(numeric_only=True))
test_data = test_data.fillna(test_data.median(numeric_only=True))

# Select features and target
features = [
    "MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond",
    "YearBuilt", "YearRemodAdd", "GrLivArea", "FullBath", "HalfBath", "BedroomAbvGr"
]
X = train_data[features]
y = train_data["SalePrice"]

# Train-test split: 20% of the rows are held out for validation; the fixed
# random_state makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Set up MLflow tracking URI (DagsHub integration).
# The <your-dagshub-username>/<your-repo-name> placeholders must be replaced
# with real values before this cell is run.
mlflow.set_tracking_uri("https://dagshub.com/<your-dagshub-username>/<your-repo-name>.mlflow")
mlflow.set_experiment("House Prices Prediction")

# Single source of truth for the hyperparameter, so the value logged to MLflow
# cannot drift from the value the model was actually built with.
N_ESTIMATORS = 100

# Start MLflow run
with mlflow.start_run():
    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model on validation data.
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6; this form works on
    # every version.
    y_pred = model.predict(X_val)
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

    # Log parameters, metrics, and model to MLflow
    mlflow.log_param("n_estimators", N_ESTIMATORS)
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, "random_forest_model")

    print(f"Validation RMSE: {rmse}")


In [None]:

# Score the test set using the same feature columns the model was trained on.
test_features = test_data[features]
test_predictions = model.predict(test_features)

# Assemble the submission table: the Id column from the test set, followed by
# the predicted SalePrice for each row.
submission = test_data[["Id"]].assign(SalePrice=test_predictions)

# Write the submission to disk without the DataFrame index.
# NOTE(review): this writes to "sample_submission.csv" in the working
# directory and will overwrite any existing file of that name.
submission_path = "sample_submission.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission file saved: {submission_path}")
