In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training CSV into `df`; the bare `df` on the last line makes the
# notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="../Data/train_df.csv")
df

Unnamed: 0,target,id,date,flag,user,text,words
0,1,1760360644,Sun May 10 21:19:37 PDT 2009,NO_QUERY,daveloire,never bothered with that book ill stick to my ...,never bother book ill stick murder mystery nov...
1,0,2203539152,Tue Jun 16 23:58:07 PDT 2009,NO_QUERY,QueenDD09,depression is so hardi just sat her and ate fu...,depression hardi sat ate full plate mac cheese...
2,0,2288068433,Mon Jun 22 18:40:37 PDT 2009,NO_QUERY,katiecantdance,as amatter of fact they both got stolen,amatter fact get steal
3,0,2247250610,Fri Jun 19 18:48:59 PDT 2009,NO_QUERY,paco1472,loltake it easy homes la migra be on twiiter t...,loltake easy home la migra twiiter tooand im s...
4,0,2016771244,Wed Jun 03 07:20:04 PDT 2009,NO_QUERY,Angela_Webber_,yes pouts ive been resting so much im bored gi...,yes pout ive rest much im bore giggle softly s...
...,...,...,...,...,...,...,...
799995,1,2051215933,Fri Jun 05 21:09:51 PDT 2009,NO_QUERY,WhittEds,bed timeee,bed timeee
799996,0,1993553086,Mon Jun 01 10:33:40 PDT 2009,NO_QUERY,Chi0ma4,wish was in cali again,wish cali
799997,0,2061204888,Sat Jun 06 20:23:27 PDT 2009,NO_QUERY,margauxantonio,today might not be so great,today might great
799998,1,2176139262,Mon Jun 15 03:01:37 PDT 2009,NO_QUERY,meganx__x,song of the moment pink bad influence,song moment pink bad influence


In [4]:
import os

from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Bag-of-words features: fit a CountVectorizer on the `words` column and
# persist the fitted vectorizer to disk.
vectorizer = CountVectorizer()
# Drop rows whose `words` value is missing before fitting. `df` is rebound
# here, so later cells that read `df` see the filtered frame.
df = df.dropna(subset=['words'])

# Feature matrix and labels consumed by the train/test split cell.
X_CountVecorizer = vectorizer.fit_transform(df['words'])
y_CountVecorizer = df['target']

# open(..., 'wb') does not create parent directories, so make sure the
# output folder exists before writing the pickle.
os.makedirs("embedding", exist_ok=True)
with open("embedding/CountVectorizer.pkl", 'wb') as vec_file:
    joblib.dump(vectorizer, vec_file)

In [5]:
import os

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# TF-IDF features: fit a TfidfVectorizer on the `words` column and persist
# the fitted vectorizer to disk. Note that `vectorizer` is rebound here.
vectorizer = TfidfVectorizer()

# Feature matrix and labels consumed by the train/test split cell.
X_TfidfVectorizer = vectorizer.fit_transform(df['words'])
y_TfidfVectorizer = df['target']

# open(..., 'wb') does not create parent directories, so make sure the
# output folder exists before writing the pickle.
os.makedirs("embedding", exist_ok=True)
with open("embedding/TfidfVectorizer.pkl", 'wb') as vec_file:
    joblib.dump(vectorizer, vec_file)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Both feature representations are
# split with the same test fraction and the same random seed.

# CountVectorizer features
(X_train_Cv, X_test_Cv,
 y_train_Cv, y_test_Cv) = train_test_split(
    X_CountVecorizer, y_CountVecorizer, test_size=0.2, random_state=42
)

# TfidfVectorizer features
(X_train_Tv, X_test_Tv,
 y_train_Tv, y_test_Tv) = train_test_split(
    X_TfidfVectorizer, y_TfidfVectorizer, test_size=0.2, random_state=42
)

In [7]:
from sklearn.linear_model import LogisticRegression
import mlflow

# LogisticRegression keyword arguments to try, one hyperparameter per run.
# Each dict holds exactly one key/value pair; the training loops expand it
# with ** and use that pair to label the run.
param_sets = (
    [{'solver': solver_name} for solver_name in ('liblinear', 'saga', 'lbfgs')]
    + [{'max_iter': iteration_cap} for iteration_cap in (100, 200, 300)]
    + [{'C': c_value} for c_value in (0.0001, 0.001, 0.01, 0.1, 1, 10, 100)]
)

In [49]:
import os

from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
import joblib

# Train one LogisticRegression per entry of `param_sets` on the
# CountVectorizer features, logging each run to MLflow. Whenever a run beats
# the best F1 seen so far in this cell, the model is registered and its
# confusion matrix and ROC plot are logged as artifacts.
artifact_path = './artifacts/'
# to_csv() and savefig() do not create missing directories.
os.makedirs(artifact_path, exist_ok=True)

best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("Analysis_sentiment")

for params in param_sets:
    # Every entry of param_sets holds a single key/value pair; use it as the run label.
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'
    with mlflow.start_run(run_name=f"reg_logistic_Cv_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Cv, y_train_Cv)

        y_pred = clf.predict(X_test_Cv)
        # Computed once: logged below and used for best-run selection.
        f1 = f1_score(y_test_Cv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Cv, y_test_Cv))
        mlflow.log_metric("Precision", precision_score(y_test_Cv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Cv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC are based on the predicted probability of the second class
        # (column index 1 of predict_proba).
        fpr, tpr, thresholds = roc_curve(y_test_Cv, clf.predict_proba(X_test_Cv)[:, 1])
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Only runs that improve on the best F1 so far get tagged, registered
        # and have their artifacts stored.
        if f1 > best_metric_value:
            best_metric_value = f1
            mlflow.set_tag("tag1", "Logistic regression with CountVectorizer()")
            mlflow.set_tags({"tag2":f'{name_experience}'})
            mlflow.sklearn.log_model(clf, "model", registered_model_name="Logistic_regression_Cv")

            conf_matrix = confusion_matrix(y_test_Cv, y_pred)
            conf_matrix_path = f"{artifact_path}confMat_RLCountVectorizer_{name_experience}.csv"
            pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
            mlflow.log_artifact(conf_matrix_path, "metrics")

            plt.figure(figsize=(8, 8))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
            # Diagonal reference line from (0, 0) to (1, 1).
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid(True)
            roc_curve_path = f"{artifact_path}roc_RLCountVectorizer_{name_experience}.png"
            plt.savefig(roc_curve_path)
            # Close the figure so figures do not accumulate across runs.
            plt.close()
            mlflow.log_artifact(roc_curve_path, "plots")

2024/02/15 11:25:39 INFO mlflow.tracking.fluent: Experiment with name 'DeepLearningTweet' does not exist. Creating a new experiment.
Successfully registered model 'Logistic_regression_Cv'.
Created version '1' of model 'Logistic_regression_Cv'.
Registered model 'Logistic_regression_Cv' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic_regression_Cv'.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for 

In [50]:
import os

from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix

# Train one LogisticRegression per entry of `param_sets` on the
# TfidfVectorizer features, logging each run to MLflow. Whenever a run beats
# the best F1 seen so far in this cell, the model is registered and its
# confusion matrix and ROC plot are logged as artifacts.
artifact_path = './artifacts/'
# to_csv() and savefig() do not create missing directories.
os.makedirs(artifact_path, exist_ok=True)

best_metric_value = -1
best_model_path = ""

mlflow.set_experiment("Analysis_sentiment")

for params in param_sets:
    # Every entry of param_sets holds a single key/value pair; use it as the run label.
    param_name, param_value = next(iter(params.items()))
    name_experience = f'{param_name}_{param_value}'
    with mlflow.start_run(run_name=f"reg_logistic_Tv_{name_experience}"):
        clf = LogisticRegression(**params)
        clf.fit(X_train_Tv, y_train_Tv)

        y_pred = clf.predict(X_test_Tv)
        # Score F1 against the TF-IDF split's own test labels (y_test_Tv);
        # the same value is logged below and used for best-run selection.
        f1 = f1_score(y_test_Tv, y_pred)

        mlflow.log_param("params", params)
        mlflow.log_metric("accuracy", clf.score(X_test_Tv, y_test_Tv))
        mlflow.log_metric("Precision", precision_score(y_test_Tv, y_pred))
        mlflow.log_metric("Recall", recall_score(y_test_Tv, y_pred))
        mlflow.log_metric("F1_Score", f1)

        # ROC/AUC are based on the predicted probability of the second class
        # (column index 1 of predict_proba).
        fpr, tpr, thresholds = roc_curve(y_test_Tv, clf.predict_proba(X_test_Tv)[:, 1])
        roc_auc = auc(fpr, tpr)
        mlflow.log_metric("AUC", roc_auc)

        # Only runs that improve on the best F1 so far get tagged, registered
        # and have their artifacts stored.
        if f1 > best_metric_value:
            best_metric_value = f1
            mlflow.set_tag("tag1", "Logistic regression with TfidfVectorizer()")
            mlflow.set_tags({"tag2":f'{name_experience}'})
            mlflow.sklearn.log_model(clf, "model", registered_model_name="Logistic_regression_Tv")

            conf_matrix = confusion_matrix(y_test_Tv, y_pred)
            conf_matrix_path = f"{artifact_path}confMat_RLTfidfVectorizer_{name_experience}.csv"
            pd.DataFrame(conf_matrix).to_csv(conf_matrix_path, index=False, header=False)
            mlflow.log_artifact(conf_matrix_path, "metrics")

            plt.figure(figsize=(8, 8))
            plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
            # Diagonal reference line from (0, 0) to (1, 1).
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic (ROC) Curve')
            plt.legend(loc="lower right")
            plt.grid(True)
            roc_curve_path = f"{artifact_path}roc_RLTfidfVectorizer_{name_experience}.png"
            plt.savefig(roc_curve_path)
            # Close the figure so figures do not accumulate across runs.
            plt.close()
            mlflow.log_artifact(roc_curve_path, "plots")

Successfully registered model 'Logistic_regression_Tv'.
Created version '1' of model 'Logistic_regression_Tv'.
Registered model 'Logistic_regression_Tv' already exists. Creating a new version of this model...
Created version '2' of model 'Logistic_regression_Tv'.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_opt