# 1) Initial training from CSV
---

In [None]:
import pandas as pd
import time
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import  StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dotenv import load_dotenv
import os

# Read variables from a .env file (if one is found) into the process environment
load_dotenv()

# Set tracking URI to your Heroku application
# (MLFLOW_TRACKING_URI must be defined, otherwise os.environ raises KeyError)
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])

if __name__ == "__main__":

    ### MLFLOW Experiment setup
    # Make the experiment the active one, then fetch it to obtain its id
    experiment_name="Housing_prices_estimator"
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Create the run through the client; it is resumed by id when the model is logged
    client = mlflow.tracking.MlflowClient()
    run = client.create_run(experiment.experiment_id)

    print("training model...")
    
    # Time execution
    # (the timer starts before the data is loaded, so the reported duration includes loading)
    start_time = time.time()

    # Call mlflow autolog
    # log_models=False: the fitted model is logged explicitly further down
    mlflow.sklearn.autolog(log_models=False)

    # Import dataset
    # Relative path: the CSV is looked up in the current working directory
    data = pd.read_csv('dataset_from_kaggle.csv')


    # Drop rows containing outliers (values more than 3 standard deviations from the column mean):
    def drop_outlier(dataset, col, n_std=3):
        """Return ``dataset`` without the rows whose ``col`` value is an outlier.

        A value is treated as an outlier when it lies strictly more than
        ``n_std`` standard deviations away from the mean of the column.
        Rows for which that comparison is not true (missing values included)
        are kept, and the input DataFrame itself is left untouched.

        Args:
            dataset: DataFrame to filter.
            col: Name of the numeric column used to detect outliers.
            n_std: Half-width of the accepted band, expressed in standard
                deviations. Defaults to 3, the value that used to be
                hard-coded.

        Returns:
            A new DataFrame holding only the rows that are not outliers.
        """
        values = dataset[col]
        # Compute the statistics once instead of once per bound.
        center = values.mean()
        spread = n_std * values.std()
        outlier_condition = (values > center + spread) | (values < center - spread)

        return dataset[~outlier_condition]

    # Remove outliers column by column. Each pass filters the result of the
    # previous pass, so rows rejected for any listed column stay rejected.
    # (Restarting every pass from `data` would keep only the last column's
    # filter and leave `df` undefined when the list is empty.)
    Columns_to_clean = ['Price']
    df = data
    for col in Columns_to_clean:
        df = drop_outlier(df, col)

    # Lower-case the column names so the rest of the script can use "price".
    # rename() returns a new frame, so `data` is never modified, even when
    # no column was cleaned and `df` is still the same object as `data`.
    df = df.rename(columns=str.lower)


    # Split the frame into the target column and the explanatory variables.
    target_variable = "price"
    y = df[target_variable]
    X = df.drop(columns=[target_variable])

    # Keep the feature names: they drive the column selection below.
    my_features_list = list(X.columns)

    # Hold out 20% of the rows; the fixed seed makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Cast the "int" columns of the training features to float64.
    # NOTE(review): presumably so the inferred MLflow signature uses doubles - confirm.
    X_train = X_train.astype(
        dict.fromkeys(X_train.select_dtypes("int").columns, "float64")
    )

    # Nothing is declared categorical, so every feature is handled as numeric.
    categorical_features = []
    numeric_features = [
        name for name in my_features_list if name not in categorical_features
    ]

    # Standardise the numeric columns.
    numerical_transformer = Pipeline([('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numeric_features)],
    )

    # Complete estimator: preprocessing followed by a linear regression.
    model = Pipeline(
        steps=[
            ("Preprocessing", preprocessor),
            ("Regressor", LinearRegression()),
        ],
        verbose=True,
    )


    # Resume the run created earlier so that training and the model artifact
    # are recorded under the same run id.
    with mlflow.start_run(run_id=run.info.run_id) as run:
        model.fit(X_train, y_train)

        # The signature is inferred from the training inputs and the
        # predictions the fitted model makes on those same inputs.
        predictions = model.predict(X_train)
        model_signature = infer_signature(X_train, predictions)

        # Log the model explicitly to keep control over how it is stored.
        # NOTE: registered_model_name="Housing_prices_estimator_LR" is left
        # out on purpose: reported as not working with mlflow 3.11.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="Housing_prices_estimator",
            signature=model_signature,
        )

    print("...Done!")
    print(f"---Total training time: {time.time()-start_time}")


# 2) Training from the database
---

In [None]:
import pandas as pd
import time
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import  StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os

# Read variables from a .env file (if one is found) into the process environment
load_dotenv()
# Set tracking URI to your Heroku application
# (MLFLOW_TRACKING_URI must be defined, otherwise os.environ raises KeyError)
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])

if __name__ == "__main__":

    ### MLFLOW Experiment setup
    # Make the experiment the active one, then fetch it to obtain its id
    experiment_name="Housing_prices_estimator_from_DB"
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Create the run through the client; it is resumed by id when the model is logged
    client = mlflow.tracking.MlflowClient()
    run = client.create_run(experiment.experiment_id)

    print("training model...")
    
    # Time execution
    # (the timer starts before the query runs, so the reported duration includes loading)
    start_time = time.time()

    # Call mlflow autolog
    # log_models=False: the fitted model is logged explicitly further down
    mlflow.sklearn.autolog(log_models=False)

    # Import dataset
    # POSTGRES_DATABASE is the connection URL handed to create_engine
    # (os.environ raises KeyError when it is not defined)
    postgres_database = os.environ["POSTGRES_DATABASE"]
    # echo=True makes SQLAlchemy log the statements it executes
    engine = create_engine(postgres_database, echo=True)
    table_name = 'housing_prices'

    #  search for data to train
    # Only rows that already have a price are fetched
    with engine.connect() as conn:
        # table_name is the constant defined just above, not external input
        stmt = text(f"""
            SELECT * FROM {table_name} 
            WHERE price IS NOT NULL""")
        result = conn.execute(stmt)
        # Build the DataFrame while the connection is still open
        data = pd.DataFrame(result.fetchall(), columns=result.keys())


    # Drop rows containing outliers (values more than 3 standard deviations from the column mean):
    def drop_outlier(dataset, col, n_std=3):
        """Return ``dataset`` without the rows whose ``col`` value is an outlier.

        A value is treated as an outlier when it lies strictly more than
        ``n_std`` standard deviations away from the mean of the column.
        Rows for which that comparison is not true (missing values included)
        are kept, and the input DataFrame itself is left untouched.

        Args:
            dataset: DataFrame to filter.
            col: Name of the numeric column used to detect outliers.
            n_std: Half-width of the accepted band, expressed in standard
                deviations. Defaults to 3, the value that used to be
                hard-coded.

        Returns:
            A new DataFrame holding only the rows that are not outliers.
        """
        values = dataset[col]
        # Compute the statistics once instead of once per bound.
        center = values.mean()
        spread = n_std * values.std()
        outlier_condition = (values > center + spread) | (values < center - spread)

        return dataset[~outlier_condition]

    # Remove outliers column by column. Each pass filters the result of the
    # previous pass, so rows rejected for any listed column stay rejected.
    # (Restarting every pass from `data` would keep only the last column's
    # filter and leave `df` undefined when the list is empty.)
    Columns_to_clean = ['price']
    df = data
    for col in Columns_to_clean:
        df = drop_outlier(df, col)


    # Pick the target column and the explicit list of explanatory variables.
    target_variable = "price"
    my_features_list = [
        'square_feet',
        'num_bedrooms',
        'num_bathrooms',
        'num_floors',
        'year_built',
        'has_garden',
        'has_pool',
        'garage_size',
        'location_score',
        'distance_to_center',
    ]

    y = df[target_variable]
    X = df[my_features_list]

    # Hold out 20% of the rows; the fixed seed makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    # Cast the "int" columns of the training features to float64.
    # NOTE(review): presumably so the inferred MLflow signature uses doubles - confirm.
    X_train = X_train.astype(
        dict.fromkeys(X_train.select_dtypes("int").columns, "float64")
    )

    # Nothing is declared categorical, so every feature is handled as numeric.
    categorical_features = []
    numeric_features = [
        name for name in my_features_list if name not in categorical_features
    ]

    # Standardise the numeric columns.
    numerical_transformer = Pipeline([('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numeric_features)],
    )

    # Complete estimator: preprocessing followed by a linear regression.
    model = Pipeline(
        steps=[
            ("Preprocessing", preprocessor),
            ("Regressor", LinearRegression()),
        ],
        verbose=True,
    )


    # Resume the run created earlier so that training and the model artifact
    # are recorded under the same run id.
    with mlflow.start_run(run_id=run.info.run_id) as run:
        model.fit(X_train, y_train)

        # The signature is inferred from the training inputs and the
        # predictions the fitted model makes on those same inputs.
        predictions = model.predict(X_train)
        model_signature = infer_signature(X_train, predictions)

        # Log the model explicitly to keep control over how it is stored.
        # NOTE: registered_model_name="Housing_prices_estimator_LR" is left
        # out on purpose: reported as not working with mlflow 3.11.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="Housing_prices_estimator",
            signature=model_signature,
        )

    print("...Done!")
    print(f"---Total training time: {time.time()-start_time}")