## MLflow Notebook for Experiment Tracking

In [29]:
# importing all the libraries
import numpy as np
import pandas as pd

import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [31]:
# Read the flights data from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="flights.csv")

In [33]:
# Dataset first look: preview five randomly chosen rows.
# The fixed random_state makes the same five rows appear on every re-run,
# so the displayed preview stays reproducible.
df.sample(5, random_state=42)

Unnamed: 0,travelCode,userCode,from,to,flightType,price,time,distance,agency,date
219380,109690,1086,Florianopolis (SC),Sao Paulo (SP),firstClass,693.51,1.46,562.14,CloudFy,12/29/2022
57578,28789,274,Brasilia (DF),Aracaju (SE),premium,987.07,1.11,425.98,CloudFy,12/15/2022
162428,81214,790,Natal (RN),Rio de Janeiro (RJ),firstClass,1314.05,1.55,595.03,Rainbow,11/14/2019
98764,49382,483,Campo Grande (MS),Brasilia (DF),premium,691.99,0.72,277.7,Rainbow,09/29/2022
229624,114812,1141,Florianopolis (SC),Salvador (BH),firstClass,1692.64,2.44,937.77,FlyingDrops,04/08/2021


In [35]:
# Report the dataset dimensions (row count first, then column count)
rows, columns = df.shape
print("Rows:", rows)
print("Columns:", columns)

Rows: 271888
Columns: 10


In [37]:
# Count fully duplicated rows and report a message that matches the actual count
# (the conclusion was previously printed unconditionally, whatever the count was).
duplicate_count = int(df.duplicated().sum())
print("Duplicates:", duplicate_count)
if duplicate_count == 0:
    print("There are no duplicated values")
else:
    print(f"There are {duplicate_count} duplicated rows")

Duplicates: 0
There are no duplicated values


In [39]:
# Number of missing (null) entries in each column
df.isna().sum()

travelCode    0
userCode      0
from          0
to            0
flightType    0
price         0
time          0
distance      0
agency        0
date          0
dtype: int64

In [41]:
# Remove the columns that are not used as inputs for the machine learning models.
# The result is reassigned rather than mutated with inplace=True, so the frame is
# never modified behind the back of anything else holding a reference to it.
df = df.drop(columns=['travelCode', 'userCode', 'date', 'time', 'distance'])

In [43]:
# One-hot encode the categorical columns; drop_first=True leaves out the first
# level of each column so the remaining dummies are not redundant.
df = pd.get_dummies(
    data=df,
    columns=['from', 'to', 'flightType', 'agency'],
    drop_first=True,
)

In [45]:
# Preview the first five rows of the encoded frame
df.head(n=5)

Unnamed: 0,price,from_Brasilia (DF),from_Campo Grande (MS),from_Florianopolis (SC),from_Natal (RN),from_Recife (PE),from_Rio de Janeiro (RJ),from_Salvador (BH),from_Sao Paulo (SP),to_Brasilia (DF),...,to_Florianopolis (SC),to_Natal (RN),to_Recife (PE),to_Rio de Janeiro (RJ),to_Salvador (BH),to_Sao Paulo (SP),flightType_firstClass,flightType_premium,agency_FlyingDrops,agency_Rainbow
0,1434.38,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,True,False,True,False
1,1292.29,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,True,False
2,1487.52,True,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
3,1127.36,False,False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,1684.05,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False


In [47]:
# Separate the target (price) from the feature matrix (every other column)
y = df['price']
X = df.drop(columns=['price'])

In [49]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Using Linear Regression for Prediction

In [14]:
# Fit a linear regression baseline on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [15]:
# Score the linear model on the held-out test split
y_lr_predict = lr.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_lr_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_lr_predict)
r2 = r2_score(y_true=y_test, y_pred=y_lr_predict)

# Bundle the scores under the names they are reported with
lr_metrics = dict(Mean_Absolute_Error=mae, Mean_Squared_Error=mse, R2_Score=r2)
lr_metrics

{'Mean_Absolute_Error': 165.35144693376648,
 'Mean_Squared_Error': 47021.417673198244,
 'R2_Score': 0.6431532077752646}

In [17]:
# Hyperparameters of the linear model to record with its run
lr_hyperparameters = dict(intercept=lr.fit_intercept)
lr_hyperparameters

{'intercept': True}

#### Logging Linear Regression data to ML flow

In [31]:
# Log the linear regression run (parameters, metrics and the fitted model)
# under the "Travel Price Prediction" experiment.
mlflow.set_experiment("Travel Price Prediction")

with mlflow.start_run(run_name="Linear Regression Model") as run:
    mlflow.log_params(lr_hyperparameters)
    mlflow.log_metrics(lr_metrics)
    # The first training row is passed as the model's input example
    mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="Linear Regression Model",
        input_example=X_train.head(1),
    )
    print("Completed")

2025/01/04 02:26:34 INFO mlflow.tracking.fluent: Experiment with name 'Travel Price Prediction' does not exist. Creating a new experiment.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Completed


# Using Decision Tree Regressor for Prediction

In [34]:
# Fit a decision tree with default hyperparameters.
# random_state is pinned (matching the seed used for the train/test split and the
# XGBoost model) so that re-running the notebook builds the same tree.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

In [36]:
# Score the decision tree on the held-out test split
y_dt_predict = dt.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_dt_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_dt_predict)
r2 = r2_score(y_true=y_test, y_pred=y_dt_predict)

# NOTE(review): the recorded output of this cell shows R2 = 1.0 on the test split.
# The duplicate check earlier in the notebook ran before the identifier columns
# were dropped, so test rows may exactly repeat training rows -- worth confirming
# before relying on this score.
dt_metrics = dict(Mean_Absolute_Error=mae, Mean_Squared_Error=mse, R2_Score=r2)
dt_metrics

{'Mean_Absolute_Error': 7.143983624129047e-12,
 'Mean_Squared_Error': 8.736174426338578e-23,
 'R2_Score': 1.0}

In [38]:
# Read the tree's hyperparameters off the estimator so they can be recorded with its run
dt_hyperparameters = {
    name: getattr(dt, name)
    for name in ("criterion", "max_depth", "max_features", "min_samples_split", "min_samples_leaf")
}
dt_hyperparameters

{'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'min_samples_split': 2,
 'min_samples_leaf': 1}

#### Logging Decision Tree Regression data to ML flow

In [41]:
# Log the decision tree run (parameters, metrics and the fitted model)
# under the "Travel Price Prediction" experiment.
mlflow.set_experiment("Travel Price Prediction")

with mlflow.start_run(run_name="Decision Tree Regression Model") as run:
    mlflow.log_params(dt_hyperparameters)
    mlflow.log_metrics(dt_metrics)
    # The first training row is passed as the model's input example
    mlflow.sklearn.log_model(
        sk_model=dt,
        artifact_path="Decision Tree Model",
        input_example=X_train.head(1),
    )
    print("Completed")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Completed


# Using XGBoost Regressor for Prediction

In [44]:
# Fit an XGBoost regressor with explicitly chosen settings and a fixed seed
xgb = XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
xgb.fit(X=X_train, y=y_train)


In [45]:
# Score the XGBoost model on the held-out test split
y_xgb_predict = xgb.predict(X_test)

mae = mean_absolute_error(y_true=y_test, y_pred=y_xgb_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_xgb_predict)
r2 = r2_score(y_true=y_test, y_pred=y_xgb_predict)

# Bundle the scores under the names they are reported with
xgb_metrics = dict(Mean_Absolute_Error=mae, Mean_Squared_Error=mse, R2_Score=r2)
print(xgb_metrics)

{'Mean_Absolute_Error': 40.14342006622317, 'Mean_Squared_Error': 3297.6795283503416, 'R2_Score': 0.9749738221493959}


In [48]:
# Read the booster settings off the estimator so they can be recorded with its run
xgb_hyperparameters = {
    name: getattr(xgb, name)
    for name in ("n_estimators", "max_depth", "learning_rate")
}
print(xgb_hyperparameters)

{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}


#### Logging XGBoost Regression data to ML flow

In [51]:
# Log the XGBoost run (parameters, metrics and the fitted model)
# under the "Travel Price Prediction" experiment.
mlflow.set_experiment("Travel Price Prediction")

with mlflow.start_run(run_name="XGBoost Regression Model") as run:
    mlflow.log_params(xgb_hyperparameters)
    mlflow.log_metrics(xgb_metrics)
    # The first training row is passed as the model's input example
    mlflow.sklearn.log_model(
        sk_model=xgb,
        artifact_path="XGBoost Model",
        input_example=X_train.head(1),
    )

    print("Completed")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Completed


# Using Random Forest Regressor for Prediction

In [53]:
# Fit a random forest with default hyperparameters.
# random_state is pinned (matching the seed used for the train/test split and the
# XGBoost model) so that re-running the notebook trains the same forest and the
# logged metrics stay comparable between runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [54]:
# Score the random forest on the held-out test split
y_rf_predict = rf.predict(X_test)

mae = mean_absolute_error(y_true=y_test, y_pred=y_rf_predict)
mse = mean_squared_error(y_true=y_test, y_pred=y_rf_predict)
r2 = r2_score(y_true=y_test, y_pred=y_rf_predict)

# Bundle the scores under the names they are reported with
rf_metrics = dict(Mean_Absolute_Error=mae, Mean_Squared_Error=mse, R2_Score=r2)
print(rf_metrics)

{'Mean_Absolute_Error': 3.616483510794777e-12, 'Mean_Squared_Error': 2.4058155091511482e-23, 'R2_Score': 1.0}


In [55]:
# Read the forest's hyperparameters off the estimator so they can be recorded with its run
rf_hyperparameters = {
    name: getattr(rf, name)
    for name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
}
print(rf_hyperparameters)

{'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1}


In [56]:
# Log the random forest run (parameters, metrics and the fitted model)
# under the "Travel Price Prediction" experiment.
mlflow.set_experiment("Travel Price Prediction")

with mlflow.start_run(run_name="Random Forest Regression Model") as run:
    mlflow.log_params(rf_hyperparameters)
    mlflow.log_metrics(rf_metrics)
    # The first training row is passed as the model's input example
    mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="Random Forest Model",
        input_example=X_train.head(1),
    )

# Printed once the run context has been closed
print("Completed")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Completed


# Registering the best-performing model in the Model Registry

In [17]:
# Point MLflow at the tracking server running on 127.0.0.1:5000
# NOTE(review): assumes the runs logged above are visible to this server -- confirm
# which tracking URI was in effect when they were logged.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Register the model
# MlflowClient is not needed here: mlflow.register_model performs the registration directly.
# The run ID is read from `run`, which holds the Random Forest run when the notebook is
# executed top to bottom. A pasted run ID goes stale as soon as the training cells are
# re-run, because every execution creates a new run.
rf_run_id = run.info.run_id
model_uri = f"runs:/{rf_run_id}/Random Forest Model"
model_name = "Travel Price Prediction"
model_version = mlflow.register_model(model_uri, model_name)

print(f"Model registered as '{model_name}', version: {model_version.version}")


Successfully registered model 'Travel Price Prediction'.
2025/01/04 14:31:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Travel Price Prediction, version 1


Model registered as 'Travel Price Prediction', version: 1


Created version '1' of model 'Travel Price Prediction'.


# Loading the registered model and checking it before deployment

In [52]:
# Imports for this cell (mlflow.pyfunc, pandas) live in the import cell at the top
# of the notebook instead of being repeated here.

# Define the model name and alias
model_name = "Travel Price Prediction"
model_alias = "champion"
# NOTE(review): nothing in this notebook assigns the "champion" alias -- presumably it was
# set on the registered version outside the notebook (e.g. in the MLflow UI); confirm.

model_uri = f"models:/{model_name}@{model_alias}"

# Load the model version behind the alias from the Model Registry
loaded_model = mlflow.pyfunc.load_model(model_uri)
y_pred = loaded_model.predict(X_test)

# Display the predictions
print("Predictions:")
print(y_pred)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Predictions:
[ 481.42 1124.11 1174.97 ...  961.99  674.16 1616.44]


In [54]:
# Print the true test-set prices so they can be compared by eye with the predictions printed above
print(y_test)

226840     481.42
31439     1124.11
260515    1174.97
265574     898.67
14311      959.91
           ...   
149290     446.73
140960    1714.75
257659     961.99
125996     674.16
226173    1616.44
Name: price, Length: 54378, dtype: float64


##### We have checked the loaded model: its predictions match the actual test prices shown above, which means the model is ready for deployment to cloud services.