#### Nevetha N G
#### MDS202128

In [1]:
import nltk
import mlflow
from textblob import TextBlob
import sklearn
import numpy as np
import pandas as pd
import sklearn.metrics as met
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, average_precision_score, precision_recall_curve

In [2]:
#flie paths
train_file_path = "C:/Users/Nevetha/AML/Assignment_02/data/training_data.csv"
valid_file_path = "C:/Users/Nevetha/AML/Assignment_02/data/valid_data.csv"
test_file_path = "C:/Users/Nevetha/AML/Assignment_02/data/test_data.csv"
data_train = pd.read_csv(train_file_path)
data_valid = pd.read_csv(valid_file_path)
data_test = pd.read_csv(test_file_path)

In [3]:
data_train.head()

Unnamed: 0,X_train,y_train,length
0,K.k:)apo k.good movie.,0,22
1,Hey ! Don't forget ... You are MINE ... For ME...,0,113
2,Haha... Hope ü can hear the receipt sound... G...,0,53
3,Great. Never been better. Each day gives even ...,0,71
4,Dear Subscriber ur draw 4 £100 gift voucher wi...,1,141


In [4]:
X_train = data_train["X_train"]
y_train = data_train["y_train"]

In [5]:
X_test = data_test["X_test"]
y_test = data_test["y_test"]

In [6]:
X_valid = data_valid["X_valid"]
y_valid = data_valid["y_valid"]

In [7]:
def split_msgs(text):
    text = str(text).lower()
    words = TextBlob(text).words
    return [word.lemma for word in words]

In [8]:
#fitting and transforming X_train using a Count Vectorizer with default parameters
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

### 1) Multinomial Naive Bayes Classifier Model

In [9]:
Exp_name = "mlflow-bayes1"
Exp_Id = mlflow.create_experiment(Exp_name)

In [10]:
for idx, depth in enumerate([0.01,0.05,0.1,0.5,1]):

    model1 = sklearn.naive_bayes.MultinomialNB(alpha=depth)
    model_fit_NB = model1.fit(X_train_vectorized, y_train)
    predictions_NB = model1.predict(vect.transform(X_test))

    report_NB = classification_report(y_valid, predictions_NB)
    accuracy1 = accuracy_score(y_valid, predictions_NB)
    precision1, recall1, thresholds1 = precision_recall_curve(y_valid, predictions_NB)
    auc_precision_recall1 = auc(recall1, precision1)

    # Start MLflow
    RUN_NAME_1 = f"run_{idx}"
    with mlflow.start_run(experiment_id=Exp_Id, run_name=RUN_NAME_1) as run:
        # Retrieve run id
        RUN_ID = run.info.run_id

        # Track parameters
        mlflow.log_param("depth", depth)

        # Track metrics
        mlflow.log_metric("accuracy", accuracy1)
        mlflow.log_metric("AUCPR", auc_precision_recall1)

        # Track model
        mlflow.sklearn.log_model(model_fit_NB, "classifier")



In [11]:
from mlflow.tracking import MlflowClient

In [12]:
def print_run_info(runs):
    for run in runs:
        print("run_id: {}".format(run.info.run_id))
        print("lifecycle_stage: {}".format(run.data.params))
        print("metrics: {}".format(run.data.metrics))

        # Exclude mlflow system tags
        tags = {k: v for k, v in run.data.tags.items() if not k.startswith("mlflow.")}
        print("tags: {}".format(tags))

In [13]:
client_NB = MlflowClient()
runs_NB = client_NB.search_runs(Exp_Id, order_by=["metrics.m DESC"])
print_run_info(runs_NB)
print("_end_")

run_id: d512fe3128084cd79595bd36cebec078
lifecycle_stage: {'depth': '1'}
metrics: {'accuracy': 0.7954545454545454, 'AUCPR': 0.22365571959380645}
tags: {}
run_id: cb2fda5eca9741088bc58027a58598ec
lifecycle_stage: {'depth': '0.5'}
metrics: {'accuracy': 0.7918660287081339, 'AUCPR': 0.2286880537271366}
tags: {}
run_id: c3d28d8937f84eb7b9cd3be6e8ce866e
lifecycle_stage: {'depth': '0.1'}
metrics: {'accuracy': 0.7906698564593302, 'AUCPR': 0.22792389671297958}
tags: {}
run_id: fd498eca3c044cc28fa1a72d559e1c60
lifecycle_stage: {'depth': '0.05'}
metrics: {'accuracy': 0.7894736842105263, 'AUCPR': 0.21983114029752904}
tags: {}
run_id: ff661f5671ca4835913a105c07c02bf4
lifecycle_stage: {'depth': '0.01'}
metrics: {'accuracy': 0.7882775119617225, 'AUCPR': 0.21910720207359086}
tags: {}
_end_


### 2) Random Forest Classifier Model

In [14]:
Exp_name2 = "mlflow-RF1"
Exp_Id2 = mlflow.create_experiment(Exp_name2)

In [15]:
for idx, depth in enumerate([1,3,7,10,20]):
    
    model2 = RandomForestClassifier(random_state = 1,  n_estimators = 200,
                                    max_depth = depth, criterion = 'gini')
    model_fit_RF = model2.fit(X_train_vectorized, y_train)
    predictions_RF = model2.predict(vect.transform(X_test))


    report_RF = classification_report(y_valid, predictions_RF)
    accuracy2 = accuracy_score(y_valid, predictions_RF)
    precision2, recall2, thresholds2 = precision_recall_curve(y_valid, predictions_RF)
    auc_precision_recall2 = auc(recall2, precision2)

    # Start MLflow
    RUN_NAME_1 = f"run_{idx}"
    with mlflow.start_run(experiment_id=Exp_Id2, run_name=RUN_NAME_1) as run:
        # Retrieve run id
        RUN_ID = run.info.run_id

        # Track parameters
        mlflow.log_param("depth", depth)

        # Track metrics
        mlflow.log_metric("accuracy", accuracy2)
        mlflow.log_metric("AUCPR", auc_precision_recall2)

        # Track model
        mlflow.sklearn.log_model(model_fit_RF, "classifier")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
client_RF = MlflowClient()
runs_RF = client_RF.search_runs(Exp_Id2, order_by=["metrics.m DESC"])
print_run_info(runs_RF)
print("_end_")

run_id: cff46547feea496994253d091fe2765a
lifecycle_stage: {'depth': '20'}
metrics: {'accuracy': 0.8217703349282297, 'AUCPR': 0.17746019958032955}
tags: {}
run_id: e9e319ab114541bfb7cc4332bdd0a853
lifecycle_stage: {'depth': '10'}
metrics: {'accuracy': 0.8588516746411483, 'AUCPR': 0.17476133578270106}
tags: {}
run_id: 78ea40c4654d4e839d1f0f48f15b9888
lifecycle_stage: {'depth': '7'}
metrics: {'accuracy': 0.8660287081339713, 'AUCPR': 0.060406698564593304}
tags: {}
run_id: 32c9f087fd8446668d15d8aa9fd7ac94
lifecycle_stage: {'depth': '3'}
metrics: {'accuracy': 0.8791866028708134, 'AUCPR': 0.5604066985645934}
tags: {}
run_id: 6fb10062809545c7849e05a0630f5395
lifecycle_stage: {'depth': '1'}
metrics: {'accuracy': 0.8791866028708134, 'AUCPR': 0.5604066985645934}
tags: {}
_end_


### 3) Support Vector Classifier Model

In [17]:
Exp_name3 = "mlflow-SVM1"
Exp_Id3 = mlflow.create_experiment(Exp_name3)

In [18]:
for idx, depth in enumerate([0.001,0.01,0.1,1,10]):

    clf = SVC(C=depth)
    model_fit_SVM = clf.fit(X_train_vectorized, y_train)
    predictions_SVM = clf.predict(vect.transform(X_test))

    report_SVM = classification_report(y_valid, predictions_SVM)
    accuracy3 = accuracy_score(y_valid, predictions_SVM)
    precision3, recall3, thresholds3 = precision_recall_curve(y_valid, predictions_SVM)
    auc_precision_recall3 = auc(recall3, precision3)

    # Start MLflow
    RUN_NAME_1 = f"run_{idx}"
    with mlflow.start_run(experiment_id=Exp_Id3, run_name=RUN_NAME_1) as run:
        # Retrieve run id
        RUN_ID = run.info.run_id

        # Track parameters
        mlflow.log_param("depth", depth)

        # Track metrics
        mlflow.log_metric("accuracy", accuracy3)
        mlflow.log_metric("AUCPR", auc_precision_recall3)

        # Track model
        mlflow.sklearn.log_model(model_fit_SVM, "classifier")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
client_SVM = MlflowClient()
runs_SVM = client_SVM.search_runs(Exp_Id3, order_by=["metrics.m DESC"])
print_run_info(runs_SVM)
print("_end_")

run_id: be81cfaf6be749eda2c9ecbfeb2480c5
lifecycle_stage: {'depth': '10'}
metrics: {'accuracy': 0.8050239234449761, 'AUCPR': 0.238102468141551}
tags: {}
run_id: c977309c4cd3490480aee5cf1c26cb77
lifecycle_stage: {'depth': '1'}
metrics: {'accuracy': 0.8014354066985646, 'AUCPR': 0.22785897010753706}
tags: {}
run_id: 7179321bb7b040ceb8d2621d07f849be
lifecycle_stage: {'depth': '0.1'}
metrics: {'accuracy': 0.8755980861244019, 'AUCPR': 0.060406698564593304}
tags: {}
run_id: 23984cfe1c7a4aa1a46343340ee53fdc
lifecycle_stage: {'depth': '0.01'}
metrics: {'accuracy': 0.8791866028708134, 'AUCPR': 0.5604066985645934}
tags: {}
run_id: 974d984f730d4f69a6d4f59f53fd5767
lifecycle_stage: {'depth': '0.001'}
metrics: {'accuracy': 0.8791866028708134, 'AUCPR': 0.5604066985645934}
tags: {}
_end_
