In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the competition files, using the 'Id' column as the row index
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Discard rows that have no target, then split the target off from the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Only non-object (numerical) predictors are kept, to keep things simple
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the training rows as a validation set (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [65]:
import mlflow
import os

# Service-account key file for Google Cloud access. setdefault() keeps a
# credentials path that is already exported in the environment instead of
# silently overwriting it with the local file.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    './silicon-data-423218-q0-179a2dbeb763.json',
)

# Tracking server address: use MLFLOW_TRACKING_URI when it is set, otherwise
# fall back to the local server.
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', 'http://localhost:5000'))

# Select the experiment all runs below are recorded under; kept as the last
# expression so the cell displays the returned Experiment.
mlflow.set_experiment("housing-prices-experiment")

2024/07/11 13:31:33 INFO mlflow.tracking.fluent: Experiment with name 'housing-prices-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='gs://mlflow-models-marko/1', creation_time=1720697494168, experiment_id='1', last_update_time=1720697494168, lifecycle_stage='active', name='housing-prices-experiment', tags={}>

In [66]:
# Dimensions of the training features as (num_rows, num_columns)
print(X_train.shape)

# Count missing entries per column and show only the columns that have any
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [67]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest inside an MLflow run and return its validation MAE.

    The run is tagged with the developer name and records the shapes of both
    feature sets, the forest hyper-parameters and the resulting ``mae`` metric.

    Args:
        X_train: Training features (must expose ``.shape``).
        X_valid: Validation features (must expose ``.shape``).
        y_train: Training target values.
        y_valid: Validation target values.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed handed to the forest. Defaults to 0.

    Returns:
        The mean absolute error of the forest's predictions on ``X_valid``.
    """
    with mlflow.start_run():
        mlflow.set_tag("developer", "marko")
        mlflow.log_param("train-input-shape", X_train.shape)
        mlflow.log_param("valid-input-shape", X_valid.shape)
        mlflow.log_param("n_estimators", n_estimators)
        # Record the seed as well so a run can be reproduced from its parameters.
        mlflow.log_param("random_state", random_state)

        model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)

        mae = mean_absolute_error(y_valid, preds)
        mlflow.log_metric("mae", mae)
    return mae

In [68]:
# JSON string for the first row of the training features
json_X = X_train.iloc[:1].to_json()
json_X

'{"MSSubClass":{"619":20},"LotFrontage":{"619":90.0},"LotArea":{"619":11694},"OverallQual":{"619":9},"OverallCond":{"619":5},"YearBuilt":{"619":2007},"YearRemodAdd":{"619":2007},"MasVnrArea":{"619":452.0},"BsmtFinSF1":{"619":48},"BsmtFinSF2":{"619":0},"BsmtUnfSF":{"619":1774},"TotalBsmtSF":{"619":1822},"1stFlrSF":{"619":1828},"2ndFlrSF":{"619":0},"LowQualFinSF":{"619":0},"GrLivArea":{"619":1828},"BsmtFullBath":{"619":0},"BsmtHalfBath":{"619":0},"FullBath":{"619":2},"HalfBath":{"619":0},"BedroomAbvGr":{"619":3},"KitchenAbvGr":{"619":1},"TotRmsAbvGrd":{"619":9},"Fireplaces":{"619":1},"GarageYrBlt":{"619":2007.0},"GarageCars":{"619":3},"GarageArea":{"619":774},"WoodDeckSF":{"619":0},"OpenPorchSF":{"619":108},"EnclosedPorch":{"619":0},"3SsnPorch":{"619":0},"ScreenPorch":{"619":260},"PoolArea":{"619":0},"MiscVal":{"619":0},"MoSold":{"619":7},"YrSold":{"619":2007}}'

In [69]:
# Names of the training columns that contain at least one missing value
cols_with_missing_values = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both the training and the validation features
reduced_X_train = X_train.drop(columns=cols_with_missing_values)
reduced_X_valid = X_valid.drop(columns=cols_with_missing_values)

In [70]:
# Score the "drop columns" approach; the header is printed before the run starts
print("MAE (Drop columns with missing values):")
mae_drop_columns = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(mae_drop_columns)

MAE (Drop columns with missing values):
17837.82570776256


In [71]:
from sklearn.impute import SimpleImputer

# Impute with SimpleImputer's default settings: fill values are learned on the
# training split only and then reused for the validation split.
imputer = SimpleImputer()

# The imputer returns plain arrays, so the column labels are restored here
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)


In [72]:
# Score the imputation approach; the header is printed before the run starts
print("MAE (Imputation):")
mae_imputation = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputation)

MAE (Imputation):
18062.894611872147


In [73]:
# Median-imputed training and validation features for the final model.
# The imputer is fitted on the training split and only applied to validation.
imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
final_X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)

In [74]:
# Define and fit model
params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**params)
model.fit(final_X_train, y_train)

# Get validation predictions and MAE
preds_valid = model.predict(final_X_valid)
mae = mean_absolute_error(y_valid, preds_valid)
print("MAE:")
print(mae)

# Record hyper-parameters and metric, then log and register the fitted model
with mlflow.start_run() as run:
    mlflow.log_params(params)
    mlflow.log_metrics({"mae": mae})
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        # An input example only needs a few representative rows. Passing the
        # whole training frame would store every training row with the model.
        input_example=final_X_train.head(5),
        registered_model_name="sk-learn-random-forest-reg-model",
    )


MAE:
17849.725786040442


Successfully registered model 'sk-learn-random-forest-reg-model'.
2024/07/11 13:31:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-random-forest-reg-model, version 1
Created version '1' of model 'sk-learn-random-forest-reg-model'.


In [75]:
# Apply the already-fitted imputer to the test features and restore column labels
final_X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Predict sale prices for the test rows
preds_test = model.predict(final_X_test)

In [76]:
# Pair each test row's Id with its predicted price and write the result to CSV
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [77]:
from sklearn.pipeline import Pipeline

# Model with imputer in pipeline
imputer = SimpleImputer(strategy='median')
model = RandomForestRegressor(**params)

pipeline = Pipeline([
    ('imputer', imputer),
    ('model', model)
])

# Fit on the raw training features (missing values included) so the pipeline's
# own imputer learns the fill values. Fitting on the pre-imputed frame would
# leave that step with nothing to learn from, even though the pipeline is then
# asked to handle raw input such as X_valid below.
pipeline.fit(X_train, y_train)

# Validation predictions and MAE; the pipeline imputes X_valid itself
preds_valid = pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds_valid)
print("MAE:")
print(mae)


MAE:
17849.725786040442


In [78]:
# Record the pipeline's validation MAE, then log and register the fitted pipeline
with mlflow.start_run():
    mlflow.set_tag("developer", "marko")
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="pipeline-model",
        # An input example only needs a few representative rows. Passing the
        # whole training frame would store every training row with the model.
        input_example=final_X_train.head(5),
        registered_model_name="pipeline-random-forest-reg-model"
    )

Successfully registered model 'pipeline-random-forest-reg-model'.
2024/07/11 13:31:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: pipeline-random-forest-reg-model, version 1
Created version '1' of model 'pipeline-random-forest-reg-model'.
