In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from text_analysis import stem_sentence, remove_urls, remove_users, remove_retweets, TextCleaner
from data import load_data, array_to_df
from sklearn.metrics import precision_recall_curve, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from scipy.sparse import lil_matrix, csr_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pickle
import os
import re
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from mlflow import mlflow, log_metric, log_param, log_artifacts
import mlflow.sklearn
from mlflow.tracking.client import MlflowClient
from column_selector import ColumnsSelector

In [2]:
import os
mlflow.set_tracking_uri('http://localhost:5000')
os.environ["AWS_ACCESS_KEY_ID"] = "imroot"
os.environ["AWS_SECRET_ACCESS_KEY"] = "beaconpass"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"

In [3]:
df = load_data("labelled-tweets-20-09-2021.json")
x = df[["text"]]
y = array_to_df(df["topics"])

In [4]:
svc_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('select_columns', ColumnsSelector("text")),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC(probability=True))
])

svc_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
    "classifier__kernel":["linear", "rbf"],
    "classifier__class_weight": ["balanced", None]
}

svc_search = GridSearchCV(svc_pipeline, svc_search_params, cv=4, n_jobs=-1, scoring="recall")

In [5]:
sgd_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('select_columns', ColumnsSelector("text")),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', SGDClassifier())
])

sgd_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
    "classifier__kernel":["linear", "rbf"],
}

sgd_search = GridSearchCV(sgd_pipeline, sgd_search_params, cv=4, n_jobs=-1, scoring="recall")

In [6]:
nb_pipeline = Pipeline([
    ('text_cleaner', TextCleaner(remove_urls=True)),
    ('select_columns', ColumnsSelector("text")),
    ('vectorizer', CountVectorizer(stop_words=stopwords.words("english"))),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

nb_search_params = {
    "text_cleaner__stem": [True, False],
    "vectorizer__ngram_range":[(1,1), (1,2), (2,2)],
}

nb_search = GridSearchCV(nb_pipeline, nb_search_params, cv=4, n_jobs=-1, scoring="recall")

In [7]:
searches = [
    ("SVC", svc_search), 
    #("SGD", sgd_search), 
    ("MNB", nb_search)]

def sort_by_precision(df):
    return df.sort_values("precision", ascending=False)

def get_accuracy_precision_recall_f1(labels,pred):
    acc = accuracy_score(labels,pred)
    prec = precision_score(labels,pred,average='weighted') #,average='micro'
    recall = recall_score(labels,pred,average='weighted')
    f1 = f1_score(labels,pred,average='weighted')
    return acc,prec,recall,f1

def train_test_clf(x, y, clf):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)
    accuracy, precision, recall, f1 = get_accuracy_precision_recall_f1(y_test, predictions)
    return accuracy, precision, recall, f1

def search_best(x, y):
    results_df = pd.DataFrame(columns=[ "classifier", "accuracy", "precision", "recall", "f1", "run_id"])
    for name, search in searches:
        with mlflow.start_run() as run:
            print("--- Fitting", name,"---")
            log_param("model", name)
            accuracy, precision, recall, f1 = train_test_clf(x, y, search)
            log_metric("accuracy", accuracy)
            log_metric("precision", precision)
            log_metric("recall", recall)
            log_metric("f1", f1)
            #log_param("Best params:", search.best_params_)
            mlflow.sklearn.log_model(search.best_estimator_, "model")
            results_df.loc[len(results_df.index)] = [name, accuracy, precision, recall, f1, run.info.run_id]
            print("Best params:", search.best_params_)
            mlflow.end_run()
        
    return sort_by_precision(results_df)


def get_experiment_name(topic):
    return topic+"-tweet-labelling"

def get_experiment_id(topic):
    experiment_name = get_experiment_name(topic)
    current_experiment=dict(mlflow.get_experiment_by_name(experiment_name))
    return current_experiment['experiment_id']
    
def get_experiment_best_metrics(topic):
    query = "metric"
    results = MlflowClient().search_runs(
        experiment_ids=[get_experiment_id(topic)], 
        max_results=1,
        order_by=["metrics.precision DESC"]
    )
    return results[0].data.metrics

def save_best(topic):
    experiment_name = get_experiment_name(topic)
    mlflow.set_experiment(experiment_name)
    results = search_best(x, y[[topic]])
    best_result = results.loc[0]
    for classifier, gs in searches:
        if classifier is best_result["classifier"]:
            print("Best classifier if %s" % (best_result["classifier"]))
            print("Accuracy: %.3f, precision: %.3f, recall: %.3f, f1: %.3f"%(best_result["accuracy"], best_result["precision"], best_result["recall"], best_result["f1"]))
            best_experiment = get_experiment_best_metrics(topic)
            print("Current best experiment is ", best_experiment)
            if best_experiment["precision"] <= best_result["precision"]:
                result = mlflow.register_model(
                    "runs:/"+best_result["run_id"]+"/model",
                    experiment_name
                )

In [8]:
save_best("crypto")

--- Fitting SVC ---


  y = column_or_1d(y, warn=True)


Best params: {'classifier__class_weight': 'balanced', 'classifier__kernel': 'linear', 'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 2)}
--- Fitting MNB ---


  y = column_or_1d(y, warn=True)


Best params: {'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 1)}
Best classifier if SVC
Accuracy: 0.847, precision: 0.846, recall: 0.847, f1: 0.846
Current best experiment is  {'accuracy': 0.847457627118644, 'f1': 0.8463140594817694, 'precision': 0.8459599308260222, 'recall': 0.847457627118644}


Registered model 'crypto-tweet-labelling' already exists. Creating a new version of this model...
2021/10/24 23:54:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: crypto-tweet-labelling, version 3
Created version '3' of model 'crypto-tweet-labelling'.


In [9]:
save_best("defi")

--- Fitting SVC ---


  y = column_or_1d(y, warn=True)


Best params: {'classifier__class_weight': 'balanced', 'classifier__kernel': 'linear', 'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 1)}
--- Fitting MNB ---


  y = column_or_1d(y, warn=True)


Best params: {'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 1)}
Best classifier if SVC
Accuracy: 0.898, precision: 0.889, recall: 0.898, f1: 0.893
Current best experiment is  {'accuracy': 0.8945386064030132, 'f1': 0.8482703143618602, 'precision': 0.905702761490785, 'recall': 0.8945386064030132}


In [10]:
save_best("NFT")

--- Fitting SVC ---


  y = column_or_1d(y, warn=True)


Best params: {'classifier__class_weight': 'balanced', 'classifier__kernel': 'linear', 'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 1)}
--- Fitting MNB ---


  y = column_or_1d(y, warn=True)


Best params: {'text_cleaner__stem': True, 'vectorizer__ngram_range': (1, 1)}
Best classifier if SVC
Accuracy: 0.977, precision: 0.977, recall: 0.977, f1: 0.977
Current best experiment is  {'accuracy': 0.9774011299435028, 'f1': 0.9767402035709837, 'precision': 0.9765170224522707, 'recall': 0.9774011299435028}


Registered model 'NFT-tweet-labelling' already exists. Creating a new version of this model...
2021/10/25 00:02:51 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: NFT-tweet-labelling, version 2
Created version '2' of model 'NFT-tweet-labelling'.
