<h1>MLOps Project - Wine Quality</h1>

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from typing import List
from scipy.sparse import csr_matrix
import statsmodels.formula.api as smf

file_path = "/Users/lauralumbreras/Documents/ESILV/Web Scraping/WS - Env/esilv-mlops/MLOpsProject/winequalityN.csv"

## 1 - Load data

In [2]:
def load_data(path: str):
    return pd.read_csv(file_path)

train_df = load_data(file_path)
train_df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## 2 - Prepare the data

In [3]:
# Rename Columns 

def prepare_data(df: pd.DataFrame):
    df = df.rename(columns = {'fixed acidity' :'fixed_acidity', 'volatile acidity':'volatile_acidity', 'citric acid':'citric_acid', 'residual sugar':'residual_sugar', 'free sulfur dioxide':'free_sulfur_dioxide', 'total sulfur dioxide' : 'total_sulfur_dioxide'}, inplace = False)
    df = df.dropna(axis=0)
    return df

train_df = prepare_data(train_df)
train_df.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
train_df.columns

Index(['type', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
       'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [6]:
# Encode Wine Type (White = 1 , Red = 0)
from sklearn import preprocessing

CATEGORICAL_COLS = ['type', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
       'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']


def encode_categorical_cols(wine_df: pd.DataFrame, categorical_cols: List[str] = None) -> pd.DataFrame:
    if categorical_cols is None:
        categorical_cols = ['type', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
                             'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
                             'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']

    le = preprocessing.LabelEncoder()
    wine_df['type'] = le.fit_transform(wine_df['type'])

    # Ensure that the numeric columns are present before attempting to fill missing values
    numeric_cols = [col for col in categorical_cols if col in wine_df.columns]
    
    # Fill missing values for numeric columns
    wine_df[numeric_cols] = wine_df[numeric_cols].fillna(0).astype("float")

    return wine_df


train_df = encode_categorical_cols(train_df)
train_df.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,1.0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,1.0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,1.0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,1.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,1.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [8]:
def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> dict:

    if categorical_cols is None:
        categorical_cols = ['type', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
                             'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
                             'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']

    dicts = df[categorical_cols].to_dict(orient="records")

    y = None
    if with_target:
        if dv is None:
            dv = DictVectorizer()
            dv.fit(dicts)
        y = df["quality"].values

    x = dv.transform(dicts)
    return x, y, dv


X_train, y_train, dv = extract_x_y(train_df)



In [13]:
from sklearn.feature_extraction import DictVectorizer
from app_config import PATH_TO_PREPROCESSOR
import pickle

def save_pickle(path:str, dv: DictVectorizer):
    with open(path, "wb") as f:
        pickle.dump(dv,f)

save_pickle(PATH_TO_PREPROCESSOR, dv)

In [106]:
wine_df

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,1.0,7.0,0.0,0.0,20.0,0.0,45.0,170.0,1.0,3.0,0.0,8.0,6
1,1.0,6.0,0.0,0.0,1.0,0.0,14.0,132.0,0.0,3.0,0.0,9.0,6
2,1.0,8.0,0.0,0.0,6.0,0.0,30.0,97.0,0.0,3.0,0.0,10.0,6
3,1.0,7.0,0.0,0.0,8.0,0.0,47.0,186.0,0.0,3.0,0.0,9.0,6
4,1.0,7.0,0.0,0.0,8.0,0.0,47.0,186.0,0.0,3.0,0.0,9.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0.0,6.0,0.0,0.0,2.0,0.0,32.0,44.0,0.0,3.0,0.0,10.0,5
6493,0.0,5.0,0.0,0.0,2.0,0.0,39.0,51.0,0.0,3.0,0.0,11.0,6
6494,0.0,6.0,0.0,0.0,2.0,0.0,29.0,40.0,0.0,3.0,0.0,11.0,6
6495,0.0,5.0,0.0,0.0,2.0,0.0,32.0,44.0,0.0,3.0,0.0,10.0,5


## 3 - Train model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def train_model(x_train: csr_matrix, y_train: np.ndarray):
    x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.2,random_state=42)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    return lr

model = train_model(X_train, y_train)

## 4 - Evaluate model

In [15]:
def predict_quality(input_data: csr_matrix, model: LinearRegression):
    return model.predict(input_data)


def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    return mean_squared_error(y_true, y_pred, squared=False)


prediction = predict_quality(X_train, model)
train_me = evaluate_model(y_train, prediction)
train_me

0.7349701364153743

## 5 - Log Model Parameters to MlFlow

In [16]:
import mlflow

print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'file:///Users/lauralumbreras/Documents/ESILV/Web%20Scraping/WS%20-%20Env/esilv-mlops/MLOpsProject/Model/mlruns'


In [17]:
# Set the experiment name
mlflow_experiment_path = f"/mlflow/LR_Predict_WineQuality"
mlflow.set_experiment(mlflow_experiment_path)
name="LR_Predict_WineQuality"

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("01-model-experiment", "Predict Wine Quality")

    # Load data
    train_df = load_data(file_path)

    # Compute target
    train_df = prepare_data(train_df)

    # Encode categorical columns
    train_df = encode_categorical_cols(train_df)

    # Extract X and y
    X_train, y_train, dv = extract_x_y(train_df)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model
    prediction = predict_quality(X_train, model)
    train_me = evaluate_model(y_train, prediction)

    # Log your model
    mlflow.sklearn.log_model(model, "models")

    # Register your model in mlfow model registry
    mlflow.register_model("runs:/{}/models".format(run_id), "LR_Predict_WineQuality")
  

Registered model 'LR_Predict_WineQuality' already exists. Creating a new version of this model...
2024/01/27 15:39:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LR_Predict_WineQuality, version 2
Created version '2' of model 'LR_Predict_WineQuality'.


In [18]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

model_version = client.get_latest_versions("LR_Predict_WineQuality", stages=["None"])[0].version
name= "LR_Predict_WineQuality"
desc = "Logistic Regression Model for Predict Wine Quality"

client.transition_model_version_stage(name, version=model_version, stage="production")



<ModelVersion: aliases=[], creation_timestamp=1706366373006, current_stage='Production', description=None, last_updated_timestamp=1706366377222, name='LR_Predict_WineQuality', run_id='f42438cf2aaf42ccabaef545789a872a', run_link=None, source='file:///Users/lauralumbreras/Documents/ESILV/Web%20Scraping/WS%20-%20Env/esilv-mlops/MLOpsProject/Model/mlruns/306097817479606221/f42438cf2aaf42ccabaef545789a872a/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=2>

## 6 - Predict

In [19]:
# Load prediction data
predict_df = load_data(file_path)
print(file_path)
print(mlflow_experiment_path)

# Apply feature engineering
predict_df = prepare_data(predict_df)
predict_df = encode_categorical_cols(predict_df)
X_pred, _, _ = extract_x_y(predict_df, dv=dv, with_target=False)

#mlflow.set_experiment(mlflow_experiment_path)
#name="LR_Predict_Uber"

# Load production model
model_uri = f"models:/LR_Predict_WineQuality/production"
print("2")

model = mlflow.sklearn.load_model(model_uri)

# Make predictions
y_pred = predict_quality(X_pred, model)
y_pred



/Users/lauralumbreras/Documents/ESILV/Web Scraping/WS - Env/esilv-mlops/MLOpsProject/winequalityN.csv
/mlflow/LR_Predict_WineQuality
2


array([5.62653159, 5.30002673, 5.74109894, ..., 6.03143989, 5.5465745 ,
       6.23348634])