# **Import MLflow libraries**

In [1]:
import mlflow
from mlflow.tracking import MlflowClient

# **Set the tracking server URI**

In [2]:
# Address of the MLflow tracking server that every later cell talks to
TRACKING_URI = 'http://localhost:5000'
mlflow.set_tracking_uri(TRACKING_URI)

# **Create new experiment**

In [3]:
# Select the experiment that all runs below are logged to.
# `mlflow.set_experiment` creates the experiment when it does not exist yet,
# so (unlike a bare `mlflow.create_experiment`, which raises if the name is
# already taken) this cell can be re-run after a kernel restart.
mlflow.set_experiment("Used Car Price Prediction")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='Used Car Price Prediction', tags={}>

# **Load the dataset and divide it into train and test sets**

In [4]:
# Read the cleaned dataset into a pandas DataFrame and encode its categorical
# columns: one-hot columns for name / fuel / transmission and an integer code
# for owner. The CSV is the output of the preprocessing steps described in
# preprocessing.py.
import pandas as pd

data = pd.read_csv('car_price.csv')

# One indicator column per distinct value of each categorical column
name, fuel, transmission = (
    pd.get_dummies(data[column]) for column in ('name', 'fuel', 'transmission')
)

# Ordinal code for the owner column; any label not listed here maps to 3.
# NOTE(review): the keys contain two spaces between the words - confirm this
# matches the spelling used in car_price.csv.
owner_codes = {'First  Owner': 1, 'Second  Owner': 2}
owner = data['owner'].map(lambda label: owner_codes.get(label, 3))

In [5]:
# Build the feature matrix: the encoded categorical columns first, followed by
# every remaining column of `data` except the raw categoricals and the target
raw_columns = ['name', 'transmission', 'fuel', 'owner', 'selling_price']
remaining = data.drop(raw_columns, axis=1)
features = pd.concat([name, fuel, transmission, owner, remaining], axis=1)

# The column the models learn to predict
target = data['selling_price']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between executions
from sklearn.model_selection import train_test_split

split = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# **MLflow Tracking**

### **Linear Regression**

In [7]:
# Metrics used to evaluate each model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

# Model parameter tracked for this run.
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell requires an older scikit-learn release.
normalize = True

# Everything inside the `with` block is recorded under one MLflow run
with mlflow.start_run(run_name="Linear Regression run_1"):
    # Instantiate the model with the tracked parameter
    model = LinearRegression(normalize=normalize)
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict on the held-out split
    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). The order matters for R2,
    # which is not symmetric in its arguments.
    # mean_squared_error returns the MSE, so take the square root to obtain
    # the RMSE that the metric name promises.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    abs_error = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    # Log the parameter and the metrics to MLflow
    mlflow.log_param('normalize', normalize)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('abs_error', abs_error)
    mlflow.log_metric('r2', r2)
    # Log the fitted model as a run artifact under the path 'model'
    mlflow.sklearn.log_model(model, 'model')

In [8]:
# Metrics used to evaluate each model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

# Model parameter tracked for this run.
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell requires an older scikit-learn release.
normalize = False

# Everything inside the `with` block is recorded under one MLflow run
with mlflow.start_run(run_name="Linear Regression run_2"):
    # Instantiate the model with the tracked parameter
    model = LinearRegression(normalize=normalize)
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict on the held-out split
    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). The order matters for R2,
    # which is not symmetric in its arguments.
    # mean_squared_error returns the MSE, so take the square root to obtain
    # the RMSE that the metric name promises.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    abs_error = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    # Log the parameter and the metrics to MLflow
    mlflow.log_param('normalize', normalize)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('abs_error', abs_error)
    mlflow.log_metric('r2', r2)
    # Log the fitted model as a run artifact under the path 'model'
    mlflow.sklearn.log_model(model, 'model')

### **Random Forest**

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Model parameters tracked for this run.
# NOTE: criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and
# the old spelling was removed in 1.2, so this cell requires an older release.
criterion = 'mse'
n_estimators = 30
# Fixed seed so the forest (and therefore the logged metrics) is reproducible
random_state = 42

# Everything inside the `with` block is recorded under one MLflow run
with mlflow.start_run():
    # Instantiate the model with the tracked parameters
    model = RandomForestRegressor(criterion=criterion,
                                  n_estimators=n_estimators,
                                  random_state=random_state)
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict on the held-out split
    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). The order matters for R2,
    # which is not symmetric in its arguments.
    # mean_squared_error returns the MSE, so take the square root to obtain
    # the RMSE that the metric name promises.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    abs_error = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    # Log the parameters and the metrics to MLflow
    mlflow.log_param('criterion', criterion)
    mlflow.log_param('n_estimators', n_estimators)
    mlflow.log_param('random_state', random_state)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('abs_error', abs_error)
    mlflow.log_metric('r2', r2)
    # Log the fitted model as a run artifact under the path 'model'
    mlflow.sklearn.log_model(model, 'model')

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Model parameters tracked for this run.
# NOTE: criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and
# the old spelling was removed in 1.2, so this cell requires an older release.
criterion = 'mse'
n_estimators = 20
# Fixed seed so the forest (and therefore the logged metrics) is reproducible
random_state = 42

# Everything inside the `with` block is recorded under one MLflow run
with mlflow.start_run():
    # Instantiate the model with the tracked parameters
    model = RandomForestRegressor(criterion=criterion,
                                  n_estimators=n_estimators,
                                  random_state=random_state)
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict on the held-out split
    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). The order matters for R2,
    # which is not symmetric in its arguments.
    # mean_squared_error returns the MSE, so take the square root to obtain
    # the RMSE that the metric name promises.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    abs_error = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    # Log the parameters and the metrics to MLflow
    mlflow.log_param('criterion', criterion)
    mlflow.log_param('n_estimators', n_estimators)
    mlflow.log_param('random_state', random_state)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('abs_error', abs_error)
    mlflow.log_metric('r2', r2)
    # Log the fitted model as a run artifact under the path 'model'
    mlflow.sklearn.log_model(model, 'model')

# **MLflow Model Registry**

### **Get the run_id for the highest score run (Based on R2 metric)**

In [11]:
# Load every run of the "Used Car Price Prediction" experiment.
# The experiment id is looked up by name instead of being hard-coded, and it
# is passed as a list because `experiment_ids` is documented as a list of ids.
experiment = mlflow.get_experiment_by_name("Used Car Price Prediction")
run_id = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
# Order the runs by their R2 metric, best first. A new frame is assigned
# rather than sorting in place so re-running the cell is side-effect free.
run_id = run_id.sort_values(['metrics.r2'], ascending=False)
# The first row now belongs to the run with the highest R2
run_id_best = run_id.iloc[0]["run_id"]
print(f"the run_id that have the highest score is : {run_id_best}")

the run_id that have the highest score is : 212d1bcfb3b843aa80c7bce94c892555


### **Register the model**

In [12]:
# Register a new model entry called "Used Car Price Prediction".
# Registered model names must be unique within the registry.
client = MlflowClient()
registered_model = client.create_registered_model("Used Car Price Prediction")
registered_model


<RegisteredModel: creation_timestamp=1642954689098, description='', last_updated_timestamp=1642954689098, latest_versions=[], name='Used Car Price Prediction', tags={}>

In [13]:
# Create a version of the registered model from the best run's logged model.
client = MlflowClient()
# Ask the tracking server where the run stores its artifacts instead of
# hard-coding "mlruns/1/...", which only holds while the experiment id is 1
# and the artifact root is the default local directory.
best_run_artifact_uri = client.get_run(run_id_best).info.artifact_uri
result = client.create_model_version(
    name="Used Car Price Prediction",
    # 'model' is the artifact path used by mlflow.sklearn.log_model above
    source=f"{best_run_artifact_uri}/model",
    run_id=run_id_best
)

2022/01/23 17:18:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Used Car Price Prediction, version 1


### **Transition model stage from None to Staging**

In [14]:
# Move version 1 of the registered model into the "Staging" stage
client = MlflowClient()
staging_request = dict(
    name="Used Car Price Prediction",
    version=1,
    stage="Staging",
)
client.transition_model_version_stage(**staging_request)

<ModelVersion: creation_timestamp=1642954717096, current_stage='Staging', description='', last_updated_timestamp=1642954734840, name='Used Car Price Prediction', run_id='212d1bcfb3b843aa80c7bce94c892555', run_link='', source='mlruns/1/212d1bcfb3b843aa80c7bce94c892555/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

### **Transition model stage from Staging to Production**

In [15]:
# Promote version 1 of the registered model to the "Production" stage
client = MlflowClient()
production_request = dict(
    name="Used Car Price Prediction",
    version=1,
    stage="Production",
)
client.transition_model_version_stage(**production_request)

<ModelVersion: creation_timestamp=1642954717096, current_stage='Production', description='', last_updated_timestamp=1642954742163, name='Used Car Price Prediction', run_id='212d1bcfb3b843aa80c7bce94c892555', run_link='', source='mlruns/1/212d1bcfb3b843aa80c7bce94c892555/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

# **Make Predictions**

In [16]:
import numpy as np
def predict_price(name,transmission,fuel,owner,year,km_driven,engine,max_power):
    """Build a one-row feature frame for a single car.

    The row has one entry per column of the module-level `features` frame:
    the one-hot columns matching `name`, `transmission` and `fuel` are set
    to 1, every other one-hot column stays 0, and the numeric values fill
    the trailing columns.

    Parameters
    ----------
    name, transmission, fuel
        Category labels; each must be the label of a column in `features`.
    owner
        Integer owner code (1, 2 or 3, as produced when `features` was built).
    year, km_driven, engine, max_power
        Numeric attributes of the car.

    Returns
    -------
    pandas.DataFrame
        One row, with the same column labels and order as `features`.

    Raises
    ------
    ValueError
        If one of the category labels is not a column of `features`.
    """
    columns = features.columns
    row = [0] * len(columns)

    # `features` is concatenated as [one-hot columns..., owner, remaining
    # numeric columns], so the numeric inputs occupy the last five slots.
    # NOTE(review): assumes the remaining columns are exactly year, km_driven,
    # engine, max_power in that order - confirm against car_price.csv.
    row[-5:] = [owner, year, km_driven, engine, max_power]

    # Switch on the indicator column of each categorical value. An unknown
    # label is reported explicitly instead of surfacing as a bare IndexError.
    for label in (name, transmission, fuel):
        matches = np.where(columns == label)[0]
        if matches.size == 0:
            raise ValueError(f"unknown category {label!r}: no matching column in features")
        row[matches[0]] = 1

    # Keep the training column labels so the row lines up with the frame the
    # models were fitted on.
    return pd.DataFrame([row], columns=columns)

test = predict_price('Audi','Manual','Petrol',1,2011,15000,2050,150)

In [17]:
import mlflow.pyfunc

# Feature row for the car to be priced
prediction1 = predict_price('Audi','Manual','Petrol',1,2011,15000,2050,150)

# Load whichever model version is currently in the Production stage
model_name = "Used Car Price Prediction"
stage = 'Production'
production_uri = f"models:/{model_name}/{stage}"
model = mlflow.pyfunc.load_model(model_uri=production_uri)

# The model returns one prediction per input row; there is only one row here
predicted_price = model.predict(prediction1)[0]
print(f'the price predicted is : {predicted_price}')

the price predicted is : 1786433.3333333333


In [18]:
import mlflow.pyfunc

# Feature row for a second car: automatic transmission, owner code 2
prediction2 = predict_price('Audi','Automatic','Petrol',2,2011,15000,2050,150)

# Load whichever model version is currently in the Production stage
model_name = "Used Car Price Prediction"
stage = 'Production'
production_uri = f"models:/{model_name}/{stage}"
model = mlflow.pyfunc.load_model(model_uri=production_uri)

# Display the single predicted price as the cell output
predicted_prices = model.predict(prediction2)
predicted_prices[0]

1849766.6666666667