## Importing Libraries

In [53]:
%matplotlib inline
import pandas as pd
import numpy as np
import mlflow
import logging
from urllib.parse import urlparse
from markupsafe import escape
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_recall_curve,auc
from sklearn.ensemble import RandomForestClassifier
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

## Loading The Train, Validation and Test Data

In [54]:
train = pd.read_csv("Data/train.csv")
val = pd.read_csv("Data/validation.csv")
test = pd.read_csv("Data/test.csv")

In [55]:
train.head()

Unnamed: 0,Label,Message
0,0,Ok... But bag again..
1,0,Err... Cud do. I'm going to at 8pm. I haven't...
2,0,Well done and ! luv ya all
3,0,Had the money issue weigh me down but thanks t...
4,0,I am going to film 2day da. At 6pm. Sorry da.


In [56]:
val.head()

Unnamed: 0,Label,Message
0,0,Pls send me a comprehensive mail about who i'm...
1,0,U in town alone?
2,0,How are you doing? Hope you've settled in for ...
3,0,Lara said she can loan me &lt;#&gt; .
4,0,"Sorry, I'll call later"


In [57]:
test.head()

Unnamed: 0,Label,Message
0,0,"Yo, any way we could pick something up tonight?"
1,0,Yes:)sura in sun tv.:)lol.
2,0,Welp apparently he retired
3,0,I've told him that i've returned it. That shou...
4,0,Having lunch:)you are not in online?why?


In [58]:
y_train, X_train = train["Label"], train["Message"]
y_val, X_val = val["Label"], val["Message"]
y_test, X_test = test["Label"], test["Message"]

## Changing String To Vectors For Inputting To Model

In [59]:
# Replacing NAN entries with an empty string
X_train = X_train.replace(np.nan, '', regex=True)
X_val = X_val.replace(np.nan, '', regex=True)
X_test = X_test.replace(np.nan, '', regex=True)

In [60]:
count = CountVectorizer().fit(X_train)
X_train = count.transform(X_train)
X_val = count.transform(X_val)
X_test = count.transform(X_test)

In [61]:
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(X_train)
tfidf_val = tfidf_transformer.fit_transform(X_val)
tfidf_test = tfidf_transformer.fit_transform(X_test)

In [62]:
tfidf_train.shape, tfidf_val.shape, tfidf_test.shape

((4026, 7273), (711, 7273), (837, 7273))

## Training Models

In [63]:
def eval_metrics(actual, pred):
    precision, recall, thresholds = precision_recall_curve(actual, pred)
    auc_precision_recall = auc(recall, precision)
    return (auc_precision_recall)

### Model 1: Random Forest Classifier

In [64]:
mlflow.sklearn.autolog()

n_estimators = 200
max_depth = 5
clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth,random_state=101)
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_1=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name=f"n_estimators : {n_estimators}, max_depth : {max_depth}"):
  
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.log_dict(np.array(conf_1).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(clf, "model")

    print("\nRandom Classifier Model (no_of_estimator={:f}, max_depth={:f}):".format(n_estimators, max_depth))
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr} ")
    print(f"Confusion Matrix: {conf_1} \n \n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="random-forest-classification-model"
    )
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(clf, "model", registered_model_name="Randomclassifier")
    else:
        mlflow.sklearn.log_model(clf, "model")

2023/02/25 19:32:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '67fd179cdb6a46dea154b990265b462c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Random Classifier Model (no_of_estimator=200.000000, max_depth=5.000000):
Accuracy: 0.8578255675029869
AUCPR: 0.5710872162485066 
Confusion Matrix: [[718   0]
 [119   0]] 
 



Registered model 'random-forest-classification-model' already exists. Creating a new version of this model...
2023/02/25 19:32:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-classification-model, version 1
Created version '1' of model 'random-forest-classification-model'.


In [65]:
mlflow.tracking.MlflowClient().get_model_version("random-forest-classification-model","1")

<ModelVersion: creation_timestamp=1677333735497, current_stage='None', description=None, last_updated_timestamp=1677333735497, name='random-forest-classification-model', run_id='4e078d03d11e48158cb379e4c190b2fd', run_link=None, source='file:///C:/Jupyter%20Lab/CMI/Applied%20Machine%20Learning/Assignment%202/mlruns/0/4e078d03d11e48158cb379e4c190b2fd/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

### Model 2: Multinomial Naive Bayes

In [66]:
mlflow.sklearn.autolog()

clf = MultinomialNB()
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_2=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Multinomial Naive Bayes"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    mlflow.log_dict(np.array(conf_2).tolist(), "confusion_matrix.json")

    print("\nMultinomial Naive Bayes")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix: {conf_2} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multinomial-nb-model"
    )
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(clf, "model", registered_model_name="multinomial-nb-model")
    else:
        mlflow.sklearn.log_model(clf, "model")

2023/02/25 19:32:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'dac6745e2c3744dea81453693447ad37', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multinomial Naive Bayes
Accuracy: 0.9414575866188769
AUCPR: 0.8233888537493851
Confusion Matrix: [[718   0]
 [ 49  70]] 




Registered model 'multinomial-nb-model' already exists. Creating a new version of this model...
2023/02/25 19:32:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multinomial-nb-model, version 1
Created version '1' of model 'multinomial-nb-model'.


In [67]:
print(mlflow.tracking.MlflowClient().get_model_version("multinomial-nb-model", '1'))

<ModelVersion: creation_timestamp=1677333746391, current_stage='None', description=None, last_updated_timestamp=1677333746391, name='multinomial-nb-model', run_id='be01ed93545942189af1d20e9e3a3c83', run_link=None, source='file:///C:/Jupyter%20Lab/CMI/Applied%20Machine%20Learning/Assignment%202/mlruns/0/be01ed93545942189af1d20e9e3a3c83/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


### Model 3: MLP Classifier

In [68]:
mlflow.sklearn.autolog()

clf = MLPClassifier(random_state=101,learning_rate='adaptive')
clf.fit(tfidf_train, y_train)

y_pred = clf.predict(tfidf_test)
aucpr = eval_metrics(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
conf_3=confusion_matrix(y_test, y_pred)

with mlflow.start_run(run_name="Multilayer Perceptron"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    
    print("\nMultilayer Perceptron")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix {conf_3} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multilayer-perceptron-model"
    )
    if tracking_url_type_store != "file":
        mlflow.sklearn.log_model(clf, "model", registered_model_name="multilayer-perceptron-model")
    else:
        mlflow.sklearn.log_model(clf, "model")

2023/02/25 19:32:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1aaa95b52acb4ddd916172d022b1a410', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multilayer Perceptron
Accuracy: 0.982078853046595
AUCPR: 0.9448226913677488
Confusion Matrix [[717   1]
 [ 14 105]] 




Registered model 'multilayer-perceptron-model' already exists. Creating a new version of this model...
2023/02/25 19:33:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multilayer-perceptron-model, version 1
Created version '1' of model 'multilayer-perceptron-model'.


In [69]:
print(mlflow.tracking.MlflowClient().get_model_version("multilayer-perceptron-model", '1'))

<ModelVersion: creation_timestamp=1677333788700, current_stage='None', description=None, last_updated_timestamp=1677333788700, name='multilayer-perceptron-model', run_id='d31c73378b9a4ab3a7b34b1fdb925b24', run_link=None, source='file:///C:/Jupyter%20Lab/CMI/Applied%20Machine%20Learning/Assignment%202/mlruns/0/d31c73378b9a4ab3a7b34b1fdb925b24/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


MLP Classifier has the best performance