In [40]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import chi2, VarianceThreshold
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from sklearn.metrics import (r2_score, mean_absolute_error, 
                             mean_squared_error)

In [44]:
train_data_path = "../data//house-prices-advanced-regression-techniques/train.csv"
test_data_path = "../data/house-prices-advanced-regression-techniques/test.csv"

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

train_data.shape, test_data.shape

((1460, 81), (1459, 80))

In [52]:
for column in train_data.columns:
    percentage = train_data[column].isna().sum() * 100 / len(train_data)

    if (percentage <= 3 and percentage > 0):  # with mean
        print(train_data[column].mode()[0])

None
0.0
TA
TA
No
Unf
Unf
SBrkr


In [54]:
train_data['MasVnrType'].value_counts()

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

# Remove redundant features

In [5]:
uneeded_columns = []

for column in train_data.columns:
    percentage = train_data[column].isna().sum() * 100 / len(train_data)
    if percentage > 30:
        uneeded_columns.append([column, percentage])

print(uneeded_columns)

[['Alley', 93.76712328767124], ['FireplaceQu', 47.26027397260274], ['PoolQC', 99.52054794520548], ['Fence', 80.75342465753425], ['MiscFeature', 96.3013698630137]]


In [6]:
# Removing uneeded columns from test and train data

drop = []
for i, j in uneeded_columns:
    drop.append(i)

combin = [train_data, test_data]
for dataset in combin:
    dataset.drop(columns=drop, axis=1, inplace=True)

train_data.shape, test_data.shape

((1460, 76), (1459, 75))

# Handling missing values

In [7]:
null_num_columns = []
null_cat_columns = []

for column in train_data.columns:
    percentage = train_data[column].isna().sum() * 100 / len(train_data)
    if percentage > 0:
        if train_data[column].dtype != "O":
            null_num_columns.append([column, percentage])
        else:
            null_cat_columns.append([column, percentage])

print("Numerical columns with missing values")
print(null_num_columns, "\n")

print("Categorical columns with missing values")
print(null_cat_columns)

Numerical columns with missing values
[['LotFrontage', 17.73972602739726], ['MasVnrArea', 0.547945205479452], ['GarageYrBlt', 5.5479452054794525]] 

Categorical columns with missing values
[['MasVnrType', 0.547945205479452], ['BsmtQual', 2.5342465753424657], ['BsmtCond', 2.5342465753424657], ['BsmtExposure', 2.6027397260273974], ['BsmtFinType1', 2.5342465753424657], ['BsmtFinType2', 2.6027397260273974], ['Electrical', 0.0684931506849315], ['GarageType', 5.5479452054794525], ['GarageFinish', 5.5479452054794525], ['GarageQual', 5.5479452054794525], ['GarageCond', 5.5479452054794525]]


In [8]:
num_columns = []
cat_columns = []
for i, j in null_num_columns:
    num_columns.append(i)

for i, j in null_cat_columns:
    cat_columns.append(i)


print(num_columns)
print()
print(cat_columns)

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']


In [9]:
"""Filling columns with less than 3% missing values with mean(for numerical) and 
    mode(for categorical) and filling the other columns with linear models or decision 
    tree"""

def fill_numerical_values_with_linear_model(dataset, column):

    data = dataset[[column, "SalePrice"]].copy()
    data[column] = data[column].fillna(-1)
    train = data[data[column] != -1]
    missied_data = pd.DataFrame(data[data[column] == -1]["SalePrice"])

    x_train, x_test, y_train, y_test = train_test_split(
        train.drop(columns=column, axis=1),
        train[column],
        train_size=0.01,
        random_state=42,
    )

    lin_reg = LinearRegression()
    lin_reg.fit(x_train, y_train)
    predction = list(lin_reg.predict(missied_data))

    def update(value):
        if value == -1:
            ret = int(predction[0])
            predction.pop(0)
            return ret
        return value

    dataset[column] = dataset[column].fillna(-1)
    dataset[column] = dataset[column].apply(update)

    return dataset[column]


def fill_numerical_values_with_mean(dataset=pd.DataFrame(), column=str):

    mean = dataset[column].mean()
    dataset[column] = dataset[column].fillna(mean)
    return dataset[column]


for column in num_columns:
    percentage = train_data[column].isna().sum() * 100 / len(train_data)

    if percentage <= 3:  # with mean
        train_data[column] = fill_numerical_values_with_mean(train_data, column)

    else:  # with model
        train_data[column] = fill_numerical_values_with_linear_model(train_data, column)

In [10]:
def fill_numerical_values_with_RF_model(dataset, column):

    data = dataset[[column, "SalePrice"]].copy()
    data[column] = data[column].fillna("missied_data")
    train = data[data[column] != "missied_data"]
    missied_data = pd.DataFrame(data[data[column] == "missied_data"]["SalePrice"])

    x_train, x_test, y_train, y_test = train_test_split(
        train.drop(columns=column, axis=1),
        train[column],
        train_size=0.01,
        random_state=42,
    )

    RF = RandomForestClassifier(ccp_alpha=0.015)
    RF.fit(x_train, y_train)
    predction = list(RF.predict(missied_data))

    def update(value):
        if value == "missied_data":
            ret = predction[0]
            predction.pop(0)
            return ret
        return value

    dataset[column] = dataset[column].fillna("missied_data")
    dataset[column] = dataset[column].apply(update)

    return dataset[column]


def fill_numerical_values_with_mode(dataset, column):
    mode = dataset[column].mode()[0]
    dataset[column] = dataset[column].fillna(mode)
    return dataset[column]


for column in cat_columns:
    percentage = train_data[column].isna().sum() * 100 / len(train_data)

    if percentage <= 3:  # with mean
        train_data[column] = fill_numerical_values_with_mode(train_data, column)

    else:  # with model
        train_data[column] = fill_numerical_values_with_RF_model(train_data, column)

In [11]:
for i in train_data.columns:
    if train_data[i].isna().sum() > 0:
        print(i)

In [12]:
null_num_columns = []
null_cat_columns = []

for column in test_data.columns:
    percentage = test_data[column].isna().sum() * 100 / len(test_data)
    if percentage > 0:
        if test_data[column].dtype != "O":
            null_num_columns.append([column, percentage])
        else:
            null_cat_columns.append([column, percentage])

print(null_num_columns, "\n")
print(null_cat_columns)

[['LotFrontage', 15.558601782042494], ['MasVnrArea', 1.0281014393420151], ['BsmtFinSF1', 0.06854009595613433], ['BsmtFinSF2', 0.06854009595613433], ['BsmtUnfSF', 0.06854009595613433], ['TotalBsmtSF', 0.06854009595613433], ['BsmtFullBath', 0.13708019191226867], ['BsmtHalfBath', 0.13708019191226867], ['GarageYrBlt', 5.346127484578479], ['GarageCars', 0.06854009595613433], ['GarageArea', 0.06854009595613433]] 

[['MSZoning', 0.27416038382453733], ['Utilities', 0.13708019191226867], ['Exterior1st', 0.06854009595613433], ['Exterior2nd', 0.06854009595613433], ['MasVnrType', 1.0966415352981493], ['BsmtQual', 3.015764222069911], ['BsmtCond', 3.0843043180260454], ['BsmtExposure', 3.015764222069911], ['BsmtFinType1', 2.8786840301576424], ['BsmtFinType2', 2.8786840301576424], ['KitchenQual', 0.06854009595613433], ['Functional', 0.13708019191226867], ['GarageType', 5.2090472926662095], ['GarageFinish', 5.346127484578479], ['GarageQual', 5.346127484578479], ['GarageCond', 5.346127484578479], ['Sale

In [13]:
num_columns = []
cat_columns = []
for i, j in null_num_columns:
    num_columns.append(i)

for i, j in null_cat_columns:
    cat_columns.append(i)


print(num_columns)
print()
print(cat_columns)

['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']

['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType']


In [14]:
for column in cat_columns:
    test_data[column] = fill_numerical_values_with_mode(test_data, column)

for column in num_columns:
    test_data[column] = fill_numerical_values_with_mean(test_data, column)

# Removing outlier

In [18]:
utliers_col = ["LotFrontage", "LotArea", "BsmtFinSF1", "TotalBsmtSF", "GrLivArea"]

train_data = train_data.drop(train_data[train_data["LotFrontage"] > 185].index)
train_data = train_data.drop(train_data[train_data["LotArea"] > 100000].index)
train_data = train_data.drop(train_data[train_data["BsmtFinSF1"] > 4000].index)
train_data = train_data.drop(train_data[train_data["TotalBsmtSF"] > 5000].index)
train_data = train_data.drop(train_data[train_data["GrLivArea"] > 4000].index)

In [19]:
train_data.shape

(1451, 76)

# Feature Construction

In [20]:
combin = [train_data, test_data]

for dataset in combin:
    dataset["Totalarea"] = dataset["LotArea"] + dataset["LotFrontage"]
    dataset["TotalBsmtFin"] = dataset["BsmtFinSF1"] + dataset["BsmtFinSF2"]
    dataset["TotalSF"] = dataset["TotalBsmtSF"] + dataset["2ndFlrSF"]
    dataset["TotalBath"] = dataset["FullBath"] + dataset["HalfBath"]
    dataset["TotalPorch"] = (
        dataset["ScreenPorch"] + dataset["EnclosedPorch"] + dataset["OpenPorchSF"]
    )

In [21]:
def update(val):
    if val > 0:
        return 1
    return 0


for dataset in combin:
    dataset["Totalarea"] = dataset["Totalarea"].apply(update)
    dataset["TotalBsmtFin"] = dataset["TotalBsmtFin"].apply(update)
    dataset["TotalSF"] = dataset["TotalSF"].apply(update)
    dataset["TotalBath"] = dataset["TotalBath"].apply(update)
    dataset["TotalPorch"] = dataset["Totalarea"].apply(update)

## Feature Selection

In [22]:
numerical_cols =  train_data.dtypes[train_data.dtypes != 'object'].index.tolist()
categorical_cols =  train_data.dtypes[train_data.dtypes == 'object'].index.tolist()

# Variance Threshold For Numerical Columns

In [23]:
presntage = 0.8 * (1 - 0.8)
X = train_data[numerical_cols]
var = VarianceThreshold(threshold=presntage)
var.fit(X)

In [24]:
boolean_selection = var.get_support()
columns_names = var.feature_names_in_

removed_columns = set()
for i in range(len(boolean_selection)):
    if boolean_selection[i] == False:
        removed_columns.add(columns_names[i])

removed_columns

{'BsmtHalfBath',
 'KitchenAbvGr',
 'TotalBath',
 'TotalPorch',
 'TotalSF',
 'Totalarea'}

# Correlation

In [25]:
def remove_strong_corr(data_set, neg_corr, pos_corr):
    global removed_columns
    correlation = data_set.corr()

    for i in range(len(correlation.columns)):

        for j in range(i):
            corr_value = correlation.iloc[i, j]

            if corr_value < 0 and corr_value < neg_corr:
                print(correlation.columns[j], correlation.columns[i])
                removed_columns.add(correlation.columns[j])
            elif corr_value > 0 and corr_value > pos_corr:
                print(correlation.columns[j], correlation.columns[i])
                removed_columns.add(correlation.columns[j])

    return

df = train_data[numerical_cols].drop(columns=["Id"], axis=1)
remove_strong_corr(df, neg_corr=-0.4, pos_corr=0.8)
print(len(removed_columns))
print(removed_columns)

BsmtFinSF1 BsmtUnfSF
TotalBsmtSF 1stFlrSF
BsmtUnfSF BsmtFullBath
GrLivArea TotRmsAbvGrd
YearBuilt GarageYrBlt
GarageCars GarageArea
OverallQual SalePrice
BsmtUnfSF TotalBsmtFin
13
{'TotalBath', 'YearBuilt', 'TotalSF', 'BsmtHalfBath', 'KitchenAbvGr', 'TotalPorch', 'GarageCars', 'OverallQual', 'Totalarea', 'TotalBsmtSF', 'GrLivArea', 'BsmtUnfSF', 'BsmtFinSF1'}


In [26]:
removed_columns.remove("GrLivArea")
removed_columns

{'BsmtFinSF1',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'GarageCars',
 'KitchenAbvGr',
 'OverallQual',
 'TotalBath',
 'TotalBsmtSF',
 'TotalPorch',
 'TotalSF',
 'Totalarea',
 'YearBuilt'}

# Chi2 For Categorical Columns

In [27]:
df = train_data[categorical_cols]

ord = OrdinalEncoder()
ord.fit(df)
df[df.columns] = ord.transform(df)


X = df
Y = train_data["SalePrice"]

chi_state, p_value = chi2(X, Y)

needed_column = 0
column_names = []
columns = list(X.columns)


for i in range(len(p_value)):
    if p_value[i] <= 0.05:
        needed_column += 1
        column_names.append(columns[i])
        removed_columns.add(columns[i])

print("all columns = ", len(p_value))
print("needed column = ", needed_column)
print("the columns i will take : ", column_names)
print("number of removed columns :", len(removed_columns))
print("removed columns are :", removed_columns)

all columns =  38
needed column =  10
the columns i will take :  ['LotShape', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofMatl', 'Exterior2nd', 'BsmtFinType1', 'HeatingQC', 'GarageType']
number of removed columns : 22
removed columns are : {'TotalSF', 'Exterior2nd', 'BsmtHalfBath', 'LandSlope', 'TotalPorch', 'Totalarea', 'HouseStyle', 'BldgType', 'TotalBath', 'YearBuilt', 'GarageType', 'KitchenAbvGr', 'RoofMatl', 'OverallQual', 'BsmtUnfSF', 'BsmtFinSF1', 'HeatingQC', 'LotShape', 'Neighborhood', 'BsmtFinType1', 'GarageCars', 'TotalBsmtSF'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.columns] = ord.transform(df)


# Variance Threshold For Categorical Columns

In [28]:
presntage = 0.8 * (1 - 0.8)
X = df
var = VarianceThreshold(threshold=presntage)
var.fit(X)

In [29]:
boolean_selection = var.get_support()
columns_names = var.feature_names_in_
print(boolean_selection)
print()
print(columns_names)

[ True False  True  True False  True False  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True False  True  True  True  True  True  True  True  True
  True  True]

['MSZoning' 'Street' 'LotShape' 'LandContour' 'Utilities' 'LotConfig'
 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2' 'BldgType'
 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st' 'Exterior2nd'
 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond'
 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'Heating' 'HeatingQC'
 'CentralAir' 'Electrical' 'KitchenQual' 'Functional' 'GarageType'
 'GarageFinish' 'GarageQual' 'GarageCond' 'PavedDrive' 'SaleType'
 'SaleCondition']


In [30]:
for i in range(len(boolean_selection)):
    if boolean_selection[i] == False:
        removed_columns.add(columns_names[i])

print(len(removed_columns))

27


# Remove columns that have the same value more than 80%

In [31]:
for col in train_data.columns:
    count = train_data[col].value_counts().sort_values(ascending=False)
    top_value_count = count.iloc[0]
    if top_value_count * 100 / len(train_data) > 80:
        removed_columns.add(col)

print(len(removed_columns))
print()
print((removed_columns))

46

{'TotalSF', 'Utilities', 'GarageQual', 'MiscVal', 'Condition1', 'Exterior2nd', 'Condition2', 'SaleCondition', 'BsmtHalfBath', 'LandSlope', 'TotalPorch', 'Totalarea', 'CentralAir', 'PoolArea', 'HouseStyle', 'LowQualFinSF', 'GarageCond', 'EnclosedPorch', 'BldgType', '3SsnPorch', 'BsmtFinType2', 'Functional', 'TotalBath', 'YearBuilt', 'GarageType', 'KitchenAbvGr', 'RoofMatl', 'BsmtCond', 'ScreenPorch', 'OverallQual', 'Street', 'BsmtUnfSF', 'BsmtFinSF1', 'Electrical', 'HeatingQC', 'Heating', 'LotShape', 'Neighborhood', 'BsmtFinType1', 'GarageCars', 'LandContour', 'BsmtFinSF2', 'SaleType', 'PavedDrive', 'TotalBsmtSF', 'ExterCond'}


In [32]:
removed_columns.remove("TotalBath")
removed_columns.remove("TotalSF")
removed_columns.remove("TotalBsmtSF")
removed_columns.remove("TotalPorch")
removed_columns.remove("Totalarea")


combin = [train_data, test_data]


for dataset in combin:
    dataset.drop(columns=list(removed_columns), axis=1, inplace=True)
train_data.shape, test_data.shape

((1451, 40), (1459, 39))

In [34]:
train_data["SalePrice"] = np.log10(train_data["SalePrice"])

In [72]:
for col in train_data.columns:
    if train_data[col].dtype == "bool":
        train_data[col] = train_data[col].astype("int32")

for col in test_data.columns:
    if test_data[col].dtype == "bool":
        test_data[col] = test_data[col].astype("int32")

## Models

In [73]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("House SalePrice Prediction")

<Experiment: artifact_location='/home/godwin/Documents/Workflow/PRODIGY_ML_01/mlruns/1', creation_time=1708731534546, experiment_id='1', last_update_time=1708731534546, lifecycle_stage='active', name='House SalePrice Prediction', tags={}>

In [74]:
ids = test_data["Id"]
data = {
    "model": [],
    "MAE": [],
    "MSE": [],
    "RMSE": [],
    "train_score": [],
    "test_score": [],
}
models = pd.DataFrame(columns=data)

train_data = train_data.drop(columns=["Id"], axis=1)
test_data = test_data.drop(columns=["Id"], axis=1)

In [75]:
x_train, x_test, y_train, y_test = train_test_split(train_data.drop(columns=["SalePrice"]),
                                                    train_data["SalePrice"],
                                                    test_size=0.3,
                                                    random_state=42,)


vectorizer = DictVectorizer(sparse=False)
x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))
x_test = vectorizer.transform(x_test.to_dict(orient="records"))

In [76]:
def evaluate_model(y_train, y_test, train_prediction, test_prediction):

    train_prediction = np.exp(train_prediction)
    test_prediction = np.exp(test_prediction)

    score_train = r2_score(y_true=y_train, y_pred=train_prediction)
    score_test = r2_score(y_true=y_test, y_pred=test_prediction)

    MAE = mean_absolute_error(y_true=y_test, y_pred=test_prediction)
    MSE = mean_squared_error(y_true=y_test, y_pred=test_prediction)
    RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_prediction))
            
    metrics = {"MAE": MAE, "MSE": MSE,
              "RMSE": RMSE, "train_score": score_train,
              "test_score": score_test,}
    return metrics

In [77]:
with mlflow.start_run():

    model = LinearRegression()
    model.fit(x_train, y_train)

    mlflow.set_tag("Model", "LinearRegression")
    mlflow.set_tag("Scaler", None)

    train_prediction = model.predict(x_train)
    test_prediction = model.predict(x_test)

    metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
    mlflow.log_metrics(metrics)
    print(metrics)

{'MAE': 182.36148088462963, 'MSE': 34286.08107319772, 'RMSE': 185.1650103912662, 'train_score': -1171694.8699612531, 'test_score': -1155776.306955997}


In [78]:
from hyperopt.pyll import scope
from sklearn.pipeline import make_pipeline
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe

In [79]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Ridge')
        mlflow.set_tag("Scaler", None)
        mlflow.log_params(params)

        model = Ridge(**params)
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)

        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}

space = {"alpha":  scope.float(hp.quniform('alpha', 0.2, 20, 0.4))}
best_result = fmin(fn= objective, space=space,
                   algo=tpe.suggest, max_evals=50, trials=Trials())

100%|██████████| 50/50 [00:07<00:00,  6.96trial/s, best loss: 34250.63830557558] 


In [80]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Ridge')
        mlflow.set_tag("Scaler", "StandardScaler")
        mlflow.log_params(params)

        model = make_pipeline(StandardScaler(),
                              Ridge(**params))
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)
        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}

space = {"alpha":  scope.float(hp.quniform('alpha', 0.2, 20, 0.4))}

best_result = fmin(fn= objective, space=space,
                   algo=tpe.suggest, max_evals=50, trials=Trials())

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:06<00:00,  7.81trial/s, best loss: 34269.43290553566]


In [89]:
space = {"alpha":  scope.float(hp.quniform('alpha', 0.2, 20, 0.4))}

In [91]:
for i in space["alpha"]

<hyperopt.pyll.base.Apply at 0x7f020fb48ad0>

In [81]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Lasso')
        mlflow.set_tag("Scaler", None)
        mlflow.log_params(params)

        model = Lasso(**params)
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)

        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}

space = {"alpha":  scope.float(hp.quniform('alpha', 0.2, 20, 0.4))}

best_result = fmin(fn= objective, space=space,
                   algo=tpe.suggest, max_evals=50, trials=Trials())


def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Lasso')
        mlflow.set_tag("Scaler", "StandardScaler")
        mlflow.log_params(params)

        model = make_pipeline(StandardScaler(),
                              Lasso(**params))
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)

        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}

space = {"alpha":  scope.float(hp.quniform('alpha', 0.2, 20, 0.4))}

best_result = fmin(fn= objective, space=space,
                   algo=tpe.suggest, max_evals=50, trials=Trials())

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:06<00:00,  8.08trial/s, best loss: 33081.825532005336]
100%|██████████| 50/50 [00:06<00:00,  8.20trial/s, best loss: 32490.96657674933]


Linear Scaled Model

In [82]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'DecisionTree')
        mlflow.log_params(params)

        model = DecisionTreeRegressor(**params)
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)

        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}

space = {"max_depth": hp.randint("max_depth", 1, 15),
        'min_samples_split': hp.randint("min_samples_split", 2, 15),
        'min_samples_leaf': hp.randint("min_samples_leaf", 1, 15),
        }

best_result = fmin(fn= objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=50,
                    trials=Trials()
                    )

100%|██████████| 50/50 [00:05<00:00,  8.82trial/s, best loss: 33358.44807133812] 


In [83]:
def random_forest_objective(params):

    with mlflow.start_run():
        mlflow.set_tag('Model', 'RandomForest')
        mlflow.log_params(params)

        model = RandomForestRegressor(**params)
        model.fit(x_train, y_train)

        train_prediction = model.predict(x_train)
        test_prediction = model.predict(x_test)

        metrics = evaluate_model(y_train, y_test, train_prediction, test_prediction)
        mlflow.log_metrics(metrics)
        
    return {"loss": metrics['MSE'], 'status': STATUS_OK}


space = {"n_estimators": hp.choice("n_estimators", [2,5,10, 20, 30, 50, 100,]),
            'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 5)),
            }

best_result = fmin(fn=random_forest_objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=50,
                    trials=Trials())

100%|██████████| 50/50 [00:20<00:00,  2.41trial/s, best loss: 33892.07966261768] 
