### MLFlow Tutorial
+ ML Model Experiment Tracking & Monitoring
+ Track Metrics, Params, Models and Artifacts

In [1]:
# Load EDA Pkgs
import pandas as pd
import numpy as np
# Load Data vis pkgs
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.dummy import DummyClassifier

In [3]:
# Load Data
df = pd.read_csv("data/hate_speech_labeled_data.csv")
class_labels = {"hate speech":0,
"offensive language":1,
 "neither":2,}
df["class_labels"] = df["class"].map({v:k for k,v in class_labels.items()})

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,class_labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,neither
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,offensive language
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,offensive language
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,offensive language
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,offensive language


In [6]:
import neattext.functions as nfx

In [8]:
df['tweet_clean'] = df['tweet'].apply(lambda x: nfx.remove_stopwords(str(x).lower()))
df['tweet_clean'] = df['tweet_clean'].apply(lambda x: nfx.remove_special_characters(str(x).lower()))
df['tweet_clean'] = df['tweet_clean'].apply(lambda x: nfx.remove_hashtags(str(x).lower()))
df['tweet_clean'] = df['tweet_clean'].apply(lambda x: nfx.remove_punctuations(str(x).lower()))


In [9]:
# Features & Labels
Xfeatures = df['tweet_clean']
ylabels = df['class_labels']
ylabels_int = df['class']

In [10]:
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(Xfeatures, ylabels_int, test_size=0.3, random_state=42)


In [11]:
pipe_base = Pipeline(steps=[('cv', CountVectorizer()), ('dummy', DummyClassifier())])
pipe_nb = Pipeline(steps=[('cv', CountVectorizer()), ('nb', MultinomialNB())])
pipe_lr = Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])
pipe_lr_tfidf = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])


In [12]:
# Build Base Model
pipe_base.fit(x_train,y_train)

In [13]:
pipe_base.score(x_test,y_test)

0.7729657027572293

In [14]:
pipe_nb.fit(x_train,y_train)
pipe_nb.score(x_test,y_test)

0.853530598520511

In [16]:
# Model Evaluation
y_pred = pipe_nb.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.39      0.02      0.03       427
           1       0.85      0.99      0.91      5747
           2       0.89      0.52      0.66      1261

    accuracy                           0.85      7435
   macro avg       0.71      0.51      0.53      7435
weighted avg       0.83      0.85      0.82      7435



In [17]:
eval_report = classification_report(y_test,y_pred,output_dict=True)
eval_report

{'0': {'precision': 0.3888888888888889,
  'recall': 0.01639344262295082,
  'f1-score': 0.03146067415730337,
  'support': 427.0},
 '1': {'precision': 0.8512136649685346,
  'recall': 0.9885157473464417,
  'f1-score': 0.9147411641574753,
  'support': 5747.0},
 '2': {'precision': 0.8855989232839838,
  'recall': 0.5218080888183981,
  'f1-score': 0.656686626746507,
  'support': 1261.0},
 'accuracy': 0.853530598520511,
 'macro avg': {'precision': 0.7085671590471359,
  'recall': 0.5089057595959302,
  'f1-score': 0.5342961550204285,
  'support': 7435.0},
 'weighted avg': {'precision': 0.8304937095347448,
  'recall': 0.853530598520511,
  'f1-score': 0.8202465386154035,
  'support': 7435.0}}

In [18]:
### ML Tracking
import mlflow

In [19]:
mlflow.set_experiment("Hate Speech Detection Experiment")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

2024/09/23 00:30:56 INFO mlflow.tracking.fluent: Experiment with name 'Hate Speech Detection Experiment' does not exist. Creating a new experiment.


In [20]:
with mlflow.start_run():
    # Log Params
    # log Metrics
    mlflow.log_metrics({"accuracy":eval_report['accuracy']})
    # log Models
    mlflow.sklearn.log_model(pipe_nb,"Pipe Naive Bayes")

2024/09/23 00:34:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run intrigued-wolf-982 at: http://127.0.0.1:5000/#/experiments/328272098298209501/runs/cd126452b3ee4d63b7ad7e01d79a5a6c.
2024/09/23 00:34:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/328272098298209501.


In [21]:
# Function to train and log model
def train_and_log_model(model,model_name):
    with mlflow.start_run():
        model.fit(x_train,y_train)
        y_pred = model.predict(x_test)
        eval_report = classification_report(y_test,y_pred,output_dict=True)
        # MLflow to log
        mlflow.log_param("model",model_name)
        mlflow.log_metric("accuracy", model.score(x_test, y_test))
        mlflow.log_metric("precision", eval_report['weighted avg']['precision'])
        mlflow.log_metric("recall", eval_report['weighted avg']['recall'])
        mlflow.log_metric("f1-score", eval_report['weighted avg']['f1-score'])
        
        # Log the 
        mlflow.sklearn.log_model(model,model_name)

In [22]:
# Train and Track Model
train_and_log_model(pipe_base,"Dummy Classifier")
train_and_log_model(pipe_nb,"Naive Bayes")
train_and_log_model(pipe_lr, "Logistic Regression with CountVectorizer")
train_and_log_model(pipe_lr_tfidf, "Logistic Regression with TfidfVectorizer")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2024/09/23 00:41:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-smelt-264 at: http://127.0.0.1:5000/#/experiments/328272098298209501/runs/a097432caa624eb4a7fc1f1e2966a707.
2024/09/23 00:41:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/328272098298209501.
2024/09/23 00:41:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-penguin-338 at: http://127.0.0.1:5000/#/experiments/328272098298209501/runs/c215cb9ab75142578b1a9fae9b93ba90.
2024/09/23 00:41:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/328272098298209501.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

In [23]:
### Thanks for your attention
### Jesus Saves @JCharisTech
### Jesse E.Agbe (JCharis) Sept 2024