In [1]:
import os
import sys
import warnings
import pprint
import json

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

MLFLOW_SERVER_URL = 'http://127.0.0.1:5000/'
experiment_name = 'soccer-news-experiment'

warnings.filterwarnings("ignore")


In [15]:
data = pd.read_csv("ge_news.csv")

In [16]:
data["split_club"] = data.club.str.split('[^a-zA-Z]')

In [17]:
data["tokenized"] = data.text.str.lower().str.split('[^a-zA-Z0-9µùàçéèçÇ]')

In [18]:
for row in data.iterrows():
    row[1].tokenized = ' '.join([word for word in row[1].tokenized if word not in row[1].split_club])

In [19]:
data.drop(["date", "time", "title", "text", "link", "split_club"], axis = 1, inplace = True)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
vectorizer = TfidfVectorizer(min_df=3, max_features=500)

In [22]:
df = pd.DataFrame(vectorizer.fit_transform(data.tokenized).toarray().tolist())

In [24]:
df["y"] = data.club

In [25]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,y
0,0.057972,0.063357,0.065457,0.0,0.070633,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.152227,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.041087,athletico-pr
1,0.000000,0.053242,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.060013,0.053543,...,0.000000,0.042642,0.000000,0.0,0.0,0.099440,0.0,0.000000,0.034528,athletico-pr
2,0.000000,0.000000,0.000000,0.0,0.103110,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.074074,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.059979,athletico-pr
3,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.133025,0.113686,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,athletico-pr
4,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.118290,athletico-pr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139929,0.048024,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.049185,0.000000,0.027494,0.0,0.0,0.024506,0.0,0.098722,0.017018,cruzeiro
139930,0.000000,0.130793,0.000000,0.0,0.000000,0.0,0.144301,0.0,0.000000,0.000000,...,0.000000,0.000000,0.137033,0.0,0.0,0.000000,0.0,0.000000,0.084820,cruzeiro
139931,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,cruzeiro
139932,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,0.255854,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.128384,0.000000,cruzeiro


In [26]:
train, test = train_test_split(df)

train_x = train.drop("y", axis=1)
test_x = test.drop("y", axis=1)
train_y = train[["y"]]
test_y = test[["y"]]

In [3]:
client = mlflow.tracking.MlflowClient(MLFLOW_SERVER_URL)

mlflow.set_tracking_uri(MLFLOW_SERVER_URL)

mlflow.set_experiment(experiment_name)


In [5]:
os.system('MLFLOW_TRACKING_URI=http://0.0.0.0:5000 mlflow models serve -m "models:/soccer-model/Production" -p 5005 --no-conda &')


In [6]:
sorted(client.list_run_infos(experiment.experiment_id), key=lambda x: x.start_time)

[<RunInfo: artifact_uri='./artifacts/1/6bceb36bb95f4f89b41f437b84ec62ea/artifacts', end_time=1630852228587, experiment_id='1', lifecycle_stage='active', run_id='6bceb36bb95f4f89b41f437b84ec62ea', run_uuid='6bceb36bb95f4f89b41f437b84ec62ea', start_time=1630852136072, status='FINISHED', user_id='ijontichy42'>,
 <RunInfo: artifact_uri='./artifacts/1/b1133b4bf5554b4191404a7aa0ebb2d7/artifacts', end_time=1630852253775, experiment_id='1', lifecycle_stage='active', run_id='b1133b4bf5554b4191404a7aa0ebb2d7', run_uuid='b1133b4bf5554b4191404a7aa0ebb2d7', start_time=1630852228597, status='FINISHED', user_id='ijontichy42'>,
 <RunInfo: artifact_uri='./artifacts/1/eff3dd30b5854fad922981625251c0f9/artifacts', end_time=1630852255882, experiment_id='1', lifecycle_stage='active', run_id='eff3dd30b5854fad922981625251c0f9', run_uuid='eff3dd30b5854fad922981625251c0f9', start_time=1630852253787, status='FINISHED', user_id='ijontichy42'>,
 <RunInfo: artifact_uri='./artifacts/1/81b6d954fc254aa6b8612b683b4fa31

In [43]:
client.list_run_infos(experiment.experiment_id)[0]

<RunInfo: artifact_uri='./artifacts/3/514c28e35377483b93c670b67c1dcc86/artifacts', end_time=1630843672451, experiment_id='3', lifecycle_stage='active', run_id='514c28e35377483b93c670b67c1dcc86', run_uuid='514c28e35377483b93c670b67c1dcc86', start_time=1630843653436, status='FINISHED', user_id='ijontichy42'>

In [45]:
sorted(client.list_run_infos(experiment.experiment_id), key=lambda x: x.end_time)[-1]

<RunInfo: artifact_uri='./artifacts/3/514c28e35377483b93c670b67c1dcc86/artifacts', end_time=1630843672451, experiment_id='3', lifecycle_stage='active', run_id='514c28e35377483b93c670b67c1dcc86', run_uuid='514c28e35377483b93c670b67c1dcc86', start_time=1630843653436, status='FINISHED', user_id='ijontichy42'>

In [47]:
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
    ("Linear SVC", LinearSVC()),
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(random_state=0)),
]

staging_acc = 0
staging_version = 0

client.delete_registered_model("soccer-model")
client.create_registered_model("soccer-model")

for model in models:
    with mlflow.start_run():

        model[1].fit(train_x, train_y)

        y_pred = model[1].predict(test_x)
        accuracy = accuracy_score(test_y, y_pred)

        print("%s" % (model[0]))
        print("  Accuracy: %s" % accuracy)

        mlflow.log_param("accuracy", accuracy)

        mlflow.sklearn.log_model(model[1], "model")
        
        experiment = client.get_experiment_by_name(experiment_name)
        run_info = sorted(client.list_run_infos(experiment.experiment_id), key=lambda x: x.start_time)[-1]
        
        result = client.create_model_version(
            name="soccer-model",
            source=f"{run_info.artifact_uri}/model",
            run_id=run_info.run_id
        )
        
        if accuracy > staging_acc:
            client.transition_model_version_stage(
                name="soccer-model",
                version=result.version,
                stage="Staging"
            )
            staging_version = result.version
            staging_acc = accuracy

Random Forest
  Accuracy: 0.5762062657214727


2021/09/05 15:19:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: soccer-model, version 1


Linear SVC
  Accuracy: 0.701577864166476


2021/09/05 15:19:59 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: soccer-model, version 2


Multinomial Naive Bayes
  Accuracy: 0.5087754402012349


2021/09/05 15:20:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: soccer-model, version 3


Logistic Regression
  Accuracy: 0.6975188657672079


2021/09/05 15:20:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: soccer-model, version 4


In [48]:
client.transition_model_version_stage(
    name="soccer-model",
    version=staging_version,
    stage="Production"
)

<ModelVersion: creation_timestamp=1630844399300, current_stage='Production', description='', last_updated_timestamp=1630844451598, name='soccer-model', run_id='75078b66870b465bab5eeaec3cc31356', run_link='', source='./artifacts/3/75078b66870b465bab5eeaec3cc31356/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [49]:
os.system('MLFLOW_TRACKING_URI=http://0.0.0.0:5000 mlflow models serve -m "models:/soccer-model/Production" -p 5005 --no-conda &')


0

2021/09/05 15:20:53 INFO mlflow.models.cli: Selected backend for flavor 'python_function'
2021/09/05 15:20:53 INFO mlflow.pyfunc.backend: === Running command 'gunicorn --timeout=60 -b 127.0.0.1:5005 -w 1 ${GUNICORN_CMD_ARGS} -- mlflow.pyfunc.scoring_server.wsgi:app'
[2021-09-05 15:20:54 +0300] [21639] [INFO] Starting gunicorn 20.1.0
[2021-09-05 15:20:54 +0300] [21639] [INFO] Listening at: http://127.0.0.1:5005 (21639)
[2021-09-05 15:20:54 +0300] [21639] [INFO] Using worker: sync
[2021-09-05 15:20:54 +0300] [21640] [INFO] Booting worker with pid: 21640


In [35]:
data

Unnamed: 0,club,tokenized
0,athletico-pr,"[0.0579724562245176, 0.06335652176413933, 0.06..."
1,athletico-pr,"[0.0, 0.05324224711869487, 0.0, 0.0, 0.0, 0.0,..."
2,athletico-pr,"[0.0, 0.0, 0.0, 0.0, 0.10310967135352692, 0.0,..."
3,athletico-pr,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,athletico-pr,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
139929,cruzeiro,"[0.04802359185840033, 0.0, 0.0, 0.0, 0.0, 0.0,..."
139930,cruzeiro,"[0.0, 0.1307929344891117, 0.0, 0.0, 0.0, 0.0, ..."
139931,cruzeiro,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
139932,cruzeiro,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [53]:
import requests
url = f'http://127.0.0.1:5005/invocations'

http_data = test_x[0:6].to_json(orient='split')

response = requests.post(url=url, headers={'Content-Type': 'application/json'}, data=http_data)
prod_pred = json.loads(response.text)
print(prod_pred)
print(test_y[:6])

['atletico-mg', 'botafogo', 'botafogo', 'palmeiras', 'botafogo', 'gremio']
                  y
15485   atletico-mg
25901      botafogo
58711      flamengo
106467    palmeiras
31546      botafogo
114912       santos
