In [4]:
import pandas as pd
import numpy as np
import json
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string
import sklearn.metrics as metrics


In [20]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [7]:
def preprocess_text(message):

    #stopwords
    new_stopwords=['Hi','Hello','Team','Thanks','Hey','regards','please','jira','@uber.com','@ext.uber.com',' !image.png|thumbnail!','!Capture.PNG|thumbnail!']
    stpwrd = nltk.corpus.stopwords.words('english')
    stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>1])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #message = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation]
    #tagged_sentence = nltk.tag.pos_tag(message.split())
    #edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' or tag != 'NNPS']
    #message =  " ".join(edited_sentence)
    return message

In [21]:
#read input from historical data into dataframe
data_df = pd.read_excel('/Users/jghosh2/Documents/my-notebook/L1 triage POC/data/Updated-JIRA_DUMP_Mar_2020_March_2022_Sourabh.xlsx',usecols=['Summary','Description','Component'])
data_df['Summary']=data_df['Summary'].astype(str)
data_df['Description']=data_df['Description'].astype(str)
data_df.dropna()
#choose sample data from entire data
data_df = data_df.sample(frac=1, random_state=42)
data_df['combined_text'] = data_df[['Summary','Description']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
data_df['processed_text'] = data_df['combined_text'].map(lambda s:preprocess_text(s)) 
data_df.dropna()
print(data_df.shape)
data_df=data_df.drop_duplicates(subset=['processed_text','Component'],keep='first')
#data_df_duplicated=data_df[data_df.duplicated(subset=['processed_text','target'],keep=False)]
#print(data_df_duplicated.shape)
print(data_df.shape)
data_df = data_df.reset_index(drop=True)

(44513, 5)
(40025, 5)


In [33]:
from sklearn import preprocessing
 
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'species'.
data_df['Component_label']= label_encoder.fit_transform(data_df['Component'])

In [34]:
data_df['Component_label'] = data_df['Component_label'].astype(int)

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [25]:
token = tokenizer.encode_plus(
    data_df['processed_text'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [27]:
X_input_ids = np.zeros((len(data_df), 256))
X_attn_masks = np.zeros((len(data_df), 256))

In [28]:
def generate_training_data(data_df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(data_df['processed_text'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [29]:
X_input_ids, X_attn_masks = generate_training_data(data_df, X_input_ids, X_attn_masks, tokenizer)


0it [00:00, ?it/s]

In [36]:
labels = np.zeros((len(data_df), 61))
labels.shape

(40025, 61)

In [37]:
labels[np.arange(len(data_df)), data_df['Component_label'].values] = 1 # one-hot encoded target tensor


In [38]:
#creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(61,), dtype=tf.float64, name=None))>

In [39]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [40]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset


In [41]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(61,), dtype=tf.float64, name=None))>

In [42]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor


In [44]:
p = 0.8
train_size = int((len(data_df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.


In [45]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [46]:
from transformers import TFBertModel

In [47]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights


Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [52]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(61, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [53]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [54]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])


In [55]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2

KeyboardInterrupt: 

In [None]:
sentiment_model.save('sentiment_model')


In [5]:
X_train, X_test = train_test_split(data_df, test_size=0.2, random_state=111)

In [6]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = list(compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(data_df['Component']),
                                        y = data_df['Component']                                                    
                                    ))


In [7]:
class_weights.sort()
weights={}
for index, weight in enumerate(class_weights) :
    weights[index]=weight

In [8]:
dataset_train = tf.data.Dataset.from_tensor_slices((X_train['processed_text'].values, X_train['Component'].values))
dataset_test = tf.data.Dataset.from_tensor_slices((X_test['processed_text'].values, X_test['Component'].values))

In [9]:
list(range(0,len(data_df['Component'].unique())))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60]

In [31]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(data_df['Component'].unique()),
        values=tf.constant(list(range(0,len(data_df['Component'].unique())))),
    ),
    default_value=tf.constant(-1),
    name="target_encoding"
)

@tf.function
def target(x):
    return table.lookup(x)

In [32]:
def show_batch(dataset, size=5):
      for batch, label in dataset.take(size):
          print(batch.numpy())
          print(target(label).numpy())

In [33]:
#pd.set_option('max_rows',None)

In [34]:
def fetch(text, labels):
    return text, tf.one_hot(target(labels),6)

In [35]:
train_data_f=dataset_train.map(fetch)
test_data_f=dataset_test.map(fetch)

In [36]:
train_data, train_labels = next(iter(train_data_f.batch(5)))
train_data, train_labels

(<tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'need help split invoice hi team need help splitting invoice prepare mass addition',
        b'request create po another entity po creator nangulo uber com manager detailed justification benefit manager latam responsible vendor management benefit granted employee costa rica creation po entity require access entity portier costa rica srl',
        b'access oracle apex hi recently joined strategic finance team london would possible receive access oracle apex fp permission thank help best shaked',
        b'clone delegate request aj greulich hi fintech team set michael huaco delegate aj greulich coupa oracle approval date range start date end date reason paternity leave let know question thanks blaine http uber box com shared static xdf wm te bi pior dc gif blaine milner finops procure pay gpo bmilner uber com mailto bmilner uber com uber com http www uber com twitter http twitter com uber facebook http www facebook com uber blog http

In [37]:
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, output_shape=[128], input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(train_data[:1])

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[ 0.14832334,  0.11642308, -0.00259464, -0.2741825 , -0.03419008,
         0.05448474,  0.09951627,  0.13680217, -0.21613999,  0.45808873,
        -0.10294521, -0.11740037, -0.03200293,  0.08250536, -0.05801883,
        -0.20927465, -0.0221511 ,  0.09924161, -0.09119239,  0.0615944 ,
         0.13973267,  0.16179985, -0.08059154, -0.08594684,  0.0673902 ,
        -0.00145557, -0.00130037,  0.11433198, -0.23935963, -0.00845843,
        -0.05642676, -0.01102808,  0.10869243,  0.01076306, -0.06180575,
         0.00575203, -0.19492295,  0.08626429,  0.04200781,  0.01668094,
         0.08677208, -0.13424514,  0.04854997, -0.10535169, -0.09532416,
         0.05975296, -0.10966925,  0.21322154,  0.14673808, -0.08014761,
        -0.00512229, -0.16650414, -0.02921075,  0.08152414, -0.18004051,
         0.16847077, -0.02556576, -0.30985662, -0.05359058,  0.14351867,
         0.10968975,  0.2667437 ,  0.01079381, -0.16184346, -0.15252608,
 

In [38]:
model = tf.keras.Sequential()
model.add(hub_layer)
for units in [128, 128, 64 , 32]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(6, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 128)               124642688 
                                                                 
 dense_10 (Dense)            (None, 128)               16512     
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 128)               16512     
                                                                 
 dropout_9 (Dropout)         (None, 128)               0         
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dropout_10 (Dropout)        (None, 64)               

In [39]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [40]:
train_data_f=train_data_f.shuffle(70000).batch(512)
test_data_f=test_data_f.batch(512)


In [41]:
history = model.fit(train_data_f,
                    epochs=4,
                    validation_data=test_data_f,
                    verbose=1,
                    class_weight=weights)

Epoch 1/4


  return dispatch_target(*args, **kwargs)


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
results = model.evaluate(dataset_test.map(fetch).batch(11491), verbose=2)

print(results)

1/1 - 0s - loss: 811630.8125 - accuracy: 0.1026 - 414ms/epoch - 414ms/step
[811630.8125, 0.10256090015172958]


In [22]:
test_data, test_labels = next(iter(dataset_test.map(fetch).batch(45963)))


In [23]:
y_pred=model.predict(test_data)


In [24]:
from sklearn.metrics import classification_report


In [25]:
print(classification_report(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1)))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5837
           1       0.00      0.00      0.00        95
           2       0.00      0.00      0.00       515
           3       0.00      0.00      0.00       469
           4       0.00      0.00      0.00       268
           5       0.10      1.00      0.19       821

    accuracy                           0.10      8005
   macro avg       0.02      0.17      0.03      8005
weighted avg       0.01      0.10      0.02      8005



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
import pickle
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
pickle.dump(shallow_rf, open(filename_primary, 'wb'))

NameError: name 'shallow_rf' is not defined