In [3]:
import pandas as pd

In [4]:
# Load the cleaned dataset from the local data folder and preview its first
# five rows to check the column layout.
df = pd.read_csv(filepath_or_buffer="./data/cleaned_data.csv")
df.head(n=5)

Unnamed: 0,full_text,aac,aapt,aar,abaddressbook,abi,abort,absolutepath,absolutevalue,abstract,...,zooming,zorder,zpl,zplii,zsh,zshcompletion,zshrc,zsi,zurbfoundation,zxing
0,Pan & Zoom Image\nI want to create a simple im...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,Is a bool read/write atomic in C#\nIs accessin...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,What is the advantage of storing schema in avr...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,How do you get JavaScript/jQuery Intellisense ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,XmlSerializer - There was an error reflecting ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Number of label columns to keep: passed to get_top_n() as `top_n` and to
# BertForSequenceClassification.from_pretrained() as `num_labels`.
num_labels = 10

In [6]:
def get_top_n(dataframe, top_n):
    """
    Keep only the `top_n` most frequent label columns of `dataframe`.

    Every column after the first one is treated as a label column and ranked
    by its column sum. The `top_n` highest-ranked labels are kept next to the
    'full_text' column, and rows that carry none of the kept labels are
    dropped. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose first column is skipped and whose remaining columns are
        summable label indicators. It must contain a 'full_text' column
        (presumably the first one -- verify against the CSV layout).
    top_n : int
        Number of label columns to keep.

    Returns
    -------
    (pandas.DataFrame, list)
        The filtered frame with a fresh 0..n-1 index, and the kept label
        names ordered by descending column sum.
    """
    categories = list(dataframe.columns[1:])

    # One vectorised column-wise sum instead of one Python-level sum per
    # column; mergesort is stable, so equal counts are ranked deterministically.
    totals = dataframe[categories].sum()
    ranked = totals.sort_values(ascending=False, kind="mergesort")
    top_ = ranked.index[:top_n].tolist()

    df_top = dataframe[['full_text'] + top_]

    # Remove rows without any of the kept labels (row-wise check, no transpose).
    has_label = (df_top[top_] != 0).any(axis=1)
    df_top = df_top[has_label].reset_index(drop=True)
    return df_top, top_

In [7]:
# Keep the `num_labels` most frequent tag columns and attach one label vector
# per row.
# 'one_hot_labels' is dropped first (errors='ignore' makes this a no-op on the
# first run) so that re-running this cell does not feed the previously added
# array column back into get_top_n() as if it were a tag column.
df, label_cols = get_top_n(df.drop(columns=['one_hot_labels'], errors='ignore'), num_labels)
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,full_text,java,python,javascript,ios,android,iphone,objectivec,html,jquery,php,one_hot_labels
0,What is the advantage of storing schema in avr...,1,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,How do you get JavaScript/jQuery Intellisense ...,0,0,1,0,0,0,0,0,1,0,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0]"
2,How can I prevent SQL injection in PHP?\nIf us...,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,Mocking Static Blocks in Java\nMy motto for Ja...,1,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Getting random row through SQLAlchemy\nHow do ...,0,1,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [1]:
import os
from transformers import BertConfig
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

2022-08-28 17:22:33.369661: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-28 17:22:33.369716: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [10]:
import torch

In [11]:
# Tokenizer matching the lower-cased base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Build a BERT sequence classifier with `num_labels` outputs, then overwrite
# its weights with the state dict stored in the local file
# "bert_model_stackoverflow10" (tensors are mapped onto the CPU).
model_eval = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
model_eval.load_state_dict(
    torch.load("bert_model_stackoverflow10", map_location='cpu')
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# First five question texts and their matching label vectors.
# NOTE(review): `output` is not referenced by any later cell visible here.
sample = df['full_text'].iloc[:5].tolist()
output = df['one_hot_labels'].iloc[:5].tolist()

In [13]:
# Run the classifier on the sample texts.
# truncation=True caps every input at the tokenizer's maximum length so long
# posts cannot overflow BERT's position embeddings, and no_grad() skips
# autograd bookkeeping; both match BertTextClassifier._predict_batch.
with torch.no_grad():
    inputs = tokenizer(sample, padding=True, truncation=True, return_tensors='pt')
    predictions = model_eval(**inputs)
    probs = torch.nn.Softmax(dim=1)(predictions.logits)

probs = probs.detach().numpy()

# Most likely class index per text and the probability assigned to it.
classes = probs.argmax(axis=1)
confidences = probs.max(axis=1)

In [18]:
from mlflow.models.signature import infer_signature

# Example input/output frames used only to infer the model signature.
# The input frame gets its own name instead of rebinding `sample`: `sample`
# stays a list of strings, so this cell and the inference cell that tokenizes
# `sample` can both be re-run safely.
sample_df = pd.DataFrame({'text': sample})
outputs = pd.DataFrame({ 'rating': [label_cols[c] for c in classes], 'confidence': confidences })

signature = infer_signature(sample_df, outputs)
signature

inputs: 
  ['text': string]
outputs: 
  ['rating': string, 'confidence': float]

In [23]:
from mlflow.pyfunc import PythonModel, PythonModelContext
from typing import Dict
import pandas as pd

class BertTextClassifier(PythonModel):
    """
    MLflow pyfunc wrapper around a BERT sequence classifier.

    `predict` takes a DataFrame with a 'text' column and returns a DataFrame
    holding, for every row, the predicted label ('rating') and the softmax
    probability of that label ('confidence').

    NOTE(review): softmax + argmax yields exactly one label per text, while
    the notebook's label vectors can contain several 1s for one row (e.g.
    javascript and jquery together) -- confirm this matches how the weights
    were trained.
    """

    # Label names, indexed by the classifier's output position.
    LABELS = ['java','python','javascript','ios','android','iphone','objectivec','html','jquery','php']
    # Fallback location of the state dict, resolved against the working directory.
    WEIGHTS_PATH = "bert_model_stackoverflow10"
    # Optional key in `context.artifacts` that points at the state dict file.
    WEIGHTS_ARTIFACT = "weights"
    # Number of rows sent through the model per forward pass.
    BATCH_SIZE = 64

    def load_context(self, context: PythonModelContext):
        """
        Build the tokenizer and the classifier and load the trained weights.

        If the model was logged with an artifact named 'weights'
        (``artifacts={'weights': <path>}``), the state dict is read from that
        artifact; otherwise it is read from the relative path
        "bert_model_stackoverflow10", exactly as before.

        The model is moved to the GPU when CUDA is available and is put into
        evaluation mode.
        """
        # Imported here (as in `_predict_batch`) so this method does not depend
        # on a module-level `import torch` being in scope.
        import torch
        from transformers import BertForSequenceClassification
        from transformers import BertTokenizer

        self.labels = list(self.LABELS)

        # `artifacts` is empty/None when the model was logged without artifacts.
        artifacts = getattr(context, "artifacts", None) or {}
        weights_path = artifacts.get(self.WEIGHTS_ARTIFACT, self.WEIGHTS_PATH)

        # The output size is derived from the label list so the two cannot drift apart.
        self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(self.labels))
        self.model.load_state_dict(torch.load(weights_path, map_location='cpu'))
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        if torch.cuda.is_available():
            print('[INFO] Model is being sent to CUDA device as GPU is available')
            self.model = self.model.cuda()
        else:
            print('[INFO] Model will use CPU runtime')

        _ = self.model.eval()

    def _predict_batch(self, data):
        """
        Classify one batch of texts.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows to classify; only the 'text' column is read.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Index of the most probable class per row, and the softmax
            probability of that class.
        """
        import torch

        texts = list(data['text'].values)
        device = self.model.device

        with torch.no_grad():
            inputs = self.tokenizer(texts, padding=True, return_tensors='pt', truncation=True)

            if device.type == 'cuda':
                torch.cuda.empty_cache()
                # Move every input tensor to the device the model lives on.
                for key in inputs.keys():
                    inputs[key] = inputs[key].to(device)

            predictions = self.model(**inputs)

            probs = torch.nn.Softmax(dim=1)(predictions.logits)
            probs = probs.detach().cpu().numpy()

            classes = probs.argmax(axis=1)
            confidences = probs.max(axis=1)

            return classes, confidences

    def predict(self, context: PythonModelContext, data: pd.DataFrame) -> pd.DataFrame:
        """
        Classify every row of `data` in batches of `BATCH_SIZE`.

        Parameters
        ----------
        context : PythonModelContext
            Not used by this method.
        data : pandas.DataFrame
            Frame with a 'text' column.

        Returns
        -------
        pandas.DataFrame
            One row per input row with the columns 'rating' (label name) and
            'confidence' (softmax probability of that label). An empty input
            produces an empty frame.
        """
        import numpy as np

        batch_size = self.BATCH_SIZE
        sample_size = len(data)

        # Integer dtype: the values are used directly as indices into self.labels.
        classes = np.zeros(sample_size, dtype=int)
        confidences = np.zeros(sample_size)

        for bfrom in range(0, sample_size, batch_size):
            bto = bfrom + batch_size

            c, p = self._predict_batch(data.iloc[bfrom:bto])
            # Slices past the end are clipped, so the last partial batch fits.
            classes[bfrom:bto] = c
            confidences[bfrom:bto] = p

        return pd.DataFrame({'rating': [self.labels[c] for c in classes],
                             'confidence': confidences})

In [24]:
import mlflow

# Point the client at the local tracking server and select the experiment the
# run is recorded under.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('bert-classification')

# Log the pyfunc wrapper under the artifact path 'classifier' and register it
# as 'bert-stackoverflow-classification' in the same call.
# NOTE(review): artifacts=None means the weights file that load_context()
# reads ("bert_model_stackoverflow10") is not stored with the model; loading
# presumably only works where that file exists in the working directory --
# confirm before serving the model elsewhere.
with mlflow.start_run() as run:
    mlflow.pyfunc.log_model('classifier', 
                            python_model=BertTextClassifier(), 
                            artifacts=None, 
                            signature=signature,
                            registered_model_name='bert-stackoverflow-classification')

Registered model 'bert-stackoverflow-classification' already exists. Creating a new version of this model...
2022/08/28 17:29:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bert-stackoverflow-classification, version 4
Created version '4' of model 'bert-stackoverflow-classification'.


In [25]:
import mlflow
# Load the 'latest' version of the registered model back as a generic pyfunc
# model; this runs BertTextClassifier.load_context().
model = mlflow.pyfunc.load_model('models:/bert-stackoverflow-classification/latest')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[INFO] Model will use CPU runtime


In [28]:
# Smoke-test the reloaded model on two ad-hoc questions.
# NOTE(review): the recorded output rates both texts 'java' at ~0.14
# confidence, close to the uniform 1/10 -- check that the loaded weights and
# the softmax post-processing are what was intended.
model.predict({
    'text': [
        'How do you get JavaScript Query Intellisense',
        'What do you think about the recent advancement in nlp ?',
    ],
})

Unnamed: 0,rating,confidence
0,java,0.14425
1,java,0.139777


In [None]:
# Serve the registered model over REST (this call blocks the kernel until it
# is interrupted).
# - `!` runs the command in a shell; a bare command is a syntax error in a
#   Python cell.
# - The model URI needs a version or stage; 'latest' matches the load_model()
#   call used earlier in this notebook.
# - The CLI runs in a subprocess, so the tracking server is passed through the
#   environment to let it resolve the models:/ URI.
# - `mlflow models serve` listens on port 5000 by default, which the tracking
#   server at http://localhost:5000 already occupies, so another port is used.
%env MLFLOW_TRACKING_URI=http://localhost:5000
!mlflow models serve -m "models:/bert-stackoverflow-classification/latest" -p 5001