# Author: Mihir Gajjar, Data Science Division, Statistics Canada.

# Goal: Demonstrate Mlflow features.

## Imports

In [242]:
import time

import pandas as pd

from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer

import mlflow

In [243]:
run_name = str(int(time.time()))
print('Run name: ', run_name)

Run name:  1626200071


## Reading the data

In [244]:
# Kaggle competition data download link: https://www.kaggle.com/c/nlp-getting-started/data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

In [245]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [246]:
print('The length of the training data is %d' % len(train_data))
print('The length of the test data is %d' % len(test_data))

The length of the training data is 7613
The length of the test data is 3263


## Text preprocessing

In [247]:
def clean_text(text):
    # split into words
    tokens = word_tokenize(text)
    # remove all tokens that are not alphanumeric. Can also use .isalpha() here if do not want to keep numbers.
    words = [word for word in tokens if word.isalnum()]
    # remove stopwords
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    # performing lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    # Converting list of words to string
    words = ' '.join(words)
    return words

In [248]:
train_data['cleaned_text'] = train_data['text'].apply(clean_text)

In [249]:
train_data['text'].iloc[100]

'.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad'

In [250]:
train_data['cleaned_text'].iloc[100]

'NorwayMFA Bahrain police previously died road accident killed explosion http'

In [251]:
train_data['target'].iloc[100]

1

## Tf-idf features

In [252]:
ngram_range=(1,1)
max_features=100
norm='l2'

In [253]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features, norm=norm)

In [254]:
train_data_tfidf = tfidf_vectorizer.fit_transform(train_data['cleaned_text'])
train_data_tfidf

<7613x100 sparse matrix of type '<class 'numpy.float64'>'
	with 15838 stored elements in Compressed Sparse Row format>

In [255]:
tfidf_vectorizer.get_feature_names()[:10]

['accident',
 'amp',
 'and',
 'as',
 'attack',
 'back',
 'best',
 'body',
 'bomb',
 'building']

## Baseline model

In [256]:
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [257]:
scoring_function_f1 = make_scorer(f1_score, pos_label=1, average='binary')

## SVC cross validation.

In [258]:
C = 1.0
kernel='poly'
max_iter=-1
random_state=42

In [259]:
svc = SVC(C=C, kernel=kernel, max_iter=max_iter, random_state=random_state)

In [260]:
cv_results = cross_validate(estimator=svc, X=train_data_tfidf, y=train_data['target'], scoring=scoring_function_f1, cv=strat_k_fold, n_jobs=-1, return_train_score=True)
cv_results

{'fit_time': array([0.99043322, 0.99829006, 0.94024873, 0.97373009, 0.96771407]),
 'score_time': array([0.13656974, 0.1343472 , 0.13345313, 0.13198996, 0.13271189]),
 'test_score': array([0.60486891, 0.65035517, 0.5557656 , 0.5426945 , 0.63071895]),
 'train_score': array([0.71281362, 0.76168757, 0.71334394, 0.7291713 , 0.75554698])}

In [261]:
def mean_sd_cv_results(cv_results, metric='F1'):
    print(f"{metric} Train CV results: {cv_results['train_score'].mean().round(3)} +- {cv_results['train_score'].std().round(3)}")
    print(f"{metric} Val CV results: {cv_results['test_score'].mean().round(3)} +- {cv_results['test_score'].std().round(3)}")

In [262]:
mean_sd_cv_results(cv_results)

F1 Train CV results: 0.735 +- 0.021
F1 Val CV results: 0.597 +- 0.042


In [263]:
! jupyter nbconvert --to html mlflow-example-real-or-not-disaster-tweets-modeling-SVC.ipynb

[NbConvertApp] Converting notebook mlflow-example-real-or-not-disaster-tweets-modeling-SVC.ipynb to html
[NbConvertApp] Writing 610630 bytes to mlflow-example-real-or-not-disaster-tweets-modeling-SVC.html


## Mlflow logging

In [264]:
server_uri = 'http://127.0.0.1:5000'
mlflow.set_tracking_uri(server_uri)

In [265]:
mlflow.set_experiment('nlp_with_disaster_tweets')

In [266]:
# MLflow logging.
with mlflow.start_run(run_name=run_name) as run:

    # Logging tags
    # run_name.
    mlflow.set_tag(key='Run name', value=run_name)
    # Goal.
    mlflow.set_tag(key='Goal', value='Check model performance and decide whether we require further pre-processing/hyper-parameter tuning.')
    # Modeling exp.
    mlflow.set_tag(key='Modeling technique', value='SVC')

    # Logging parameters
    mlflow.log_param(key='ngram_range', value=ngram_range)
    mlflow.log_param(key='max_features', value=max_features)
    mlflow.log_param(key='norm', value=norm)
    mlflow.log_param(key='C', value=C)
    mlflow.log_param(key='kernel', value=kernel)
    mlflow.log_param(key='max_iter', value=max_iter)
    mlflow.log_param(key='random_state', value=random_state)

    # Logging the SVC model.
    mlflow.sklearn.log_model(sk_model=svc, artifact_path='svc_model')
   
    # Logging metrics.
    # mean F1-score - train.
    mlflow.log_metric(key='mean F1-score - train', value=cv_results['train_score'].mean().round(3))
    # mean F1-score - val.
    mlflow.log_metric(key='mean F1-score - val', value=cv_results['test_score'].mean().round(3))
    # std F1-score - train.
    mlflow.log_metric(key='std F1-score - train', value=cv_results['train_score'].std().round(3))
    # std F1-score - val.
    mlflow.log_metric(key='std F1-score - val', value=cv_results['test_score'].std().round(3))
   
    # Logging the notebook.
    # Nb.
    mlflow.log_artifact(local_path='real-or-not-disaster-tweets-modeling-SVC.ipynb', artifact_path='Notebook')
    # Nb in HTML.
    mlflow.log_artifact(local_path='real-or-not-disaster-tweets-modeling-SVC.html', artifact_path='Notebook')