In [1]:
import pandas as pd
import numpy as np
import pickle

import tensorflow_hub as hub
import tensorflow as tf
import keras.backend as K
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [2]:
def fetch_training_data(customer, data_dir="D:/"):
    """Load a customer's training tickets from ``<data_dir><customer>_data.parquet``.

    Parameters
    ----------
    customer : str
        Customer identifier; used as the file-name prefix.
    data_dir : str, optional
        Directory that holds the parquet files. Defaults to ``"D:/"`` (the
        location this notebook originally hard-coded), so existing callers
        are unaffected. A missing trailing separator is added automatically.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_parquet`` for that file.
    """
    base_dir = str(data_dir)
    # Tolerate directories passed without a trailing separator ("data" -> "data/").
    if base_dir and not base_dir.endswith(("/", "\\")):
        base_dir += "/"
    return pd.read_parquet(base_dir + customer + "_data.parquet")

In [3]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE["english"]


def clean_text(ticket_data):
    """Normalise one ticket description for embedding.

    Every run of characters outside ``A-Z``, ``a-z`` and ``0-9`` is replaced
    by a single space, the result is tokenised with ``word_tokenize``, and
    tokens whose lower-cased form is an English stop word are dropped. The
    original casing of the surviving tokens is kept.

    Parameters
    ----------
    ticket_data : str
        Raw ticket text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # This function is applied once per row during preprocessing, so the
    # stop-word corpus is loaded once and reused instead of rebuilt per call.
    stop_words = _english_stop_words()
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', ticket_data)
    kept_tokens = [w for w in word_tokenize(alnum_only) if w.lower() not in stop_words]
    return ' '.join(kept_tokens)

In [4]:
def preprocess_data(training_data,description_col,target_col,customer,imbalance_percent_remove):
    """Filter, clean and label-encode the raw ticket data.

    Steps, in order:
      1. Drop rows where ``description_col`` or ``target_col`` is missing.
      2. Keep only classes whose record count is strictly greater than
         ``imbalance_percent_remove`` percent of the remaining rows.
      3. Keep only the two columns of interest and run ``clean_text`` on
         every description.
      4. Fit a ``LabelEncoder`` on the target column, pickle it to
         ``<customer>_label_encoder.pkl`` and replace the target column with
         the encoded labels.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Raw data; it is not modified.
    description_col : str
        Name of the free-text column.
    target_col : str
        Name of the class-label column.
    customer : str
        Prefix for the pickled label-encoder file name.
    imbalance_percent_remove : int or float
        Minimum class size, as a percentage of the non-null rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly ``description_col`` (cleaned text) and
        ``target_col`` (encoded labels).
    """
    training_data = training_data.dropna(subset=[description_col,target_col])

    # Threshold expressed as an absolute number of records.
    records_remove = (len(training_data)/100)*imbalance_percent_remove
    print("Class imbalance control factor is set to "+str(imbalance_percent_remove)+"% of total data. Classes containing number of records below "+ str(records_remove)+" are removed.")

    class_counts = training_data[target_col].value_counts()
    required_feature = class_counts[class_counts > records_remove].index

    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a view of the caller's data.
    keep_rows = training_data[target_col].isin(required_feature)
    training_data = training_data.loc[keep_rows, [description_col, target_col]].copy()

    training_data[description_col] = training_data[description_col].apply(clean_text)

    le = preprocessing.LabelEncoder()
    le.fit(training_data[target_col])
    filename = customer+"_label_encoder.pkl"
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(filename, 'wb') as encoder_file:
        pickle.dump(le, encoder_file)
    training_data[target_col] = le.transform(training_data[target_col])

    return training_data

In [5]:
def test_train_split(training_data,description_col,target_col):
    
    X_train, X_test, y_train, y_test = train_test_split(training_data[description_col],training_data[target_col],test_size=0.3,random_state=42,stratify=training_data[target_col])
    return X_train, X_test, y_train, y_test

In [6]:
def create_embeddings(X_train, X_test):
    """Embed the train and test descriptions with Universal Sentence Encoder v4.

    The TF-Hub module is loaded once and wrapped in a frozen
    (``trainable=False``) ``hub.KerasLayer``; each input is converted to a
    NumPy array, then to a tensor, and passed through that layer in one call.

    Parameters
    ----------
    X_train, X_test
        Objects exposing ``to_numpy()`` (the caller passes pandas Series of
        text).

    Returns
    -------
    tuple
        ``(embeddings_train, embeddings_test)`` as returned by the layer.
    """
    use_module = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    encoder = hub.KerasLayer(use_module, input_shape=[], dtype=tf.string, trainable=False)

    def _encode(texts):
        # Series -> ndarray -> tensor -> sentence embeddings
        as_array = texts.to_numpy()
        as_tensor = tf.convert_to_tensor(as_array)
        return encoder(as_tensor)

    embeddings_train = _encode(X_train)
    print("train embeddings created")

    embeddings_test = _encode(X_test)
    print("test embeddings created")

    return embeddings_train, embeddings_test

In [7]:
def create_model(total_class):
    """Build and compile the dense classifier that sits on top of the embeddings.

    Architecture: five ReLU ``Dense`` layers of 120, 64, 32, 64 and 120
    units, a ``Dropout(0.2)`` layer, and a softmax output with
    ``total_class`` units. No input shape is declared on the first layer.

    Parameters
    ----------
    total_class : int
        Number of target classes (width of the softmax layer).

    Returns
    -------
    Sequential
        Model compiled with Adam (learning rate 0.01), categorical
        cross-entropy loss and the accuracy metric.
    """
    model = Sequential()
    for units in (120, 64, 32, 64, 120):
        model.add(Dense(units, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(total_class, activation="softmax"))
    # `lr` is a deprecated alias that newer Keras releases reject;
    # `learning_rate` is the supported keyword.
    model.compile(
        optimizer=Adam(learning_rate=0.01),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [8]:
def fiting_model(model,embeddings_train,y_train,y_test,total_class):
    """Fit ``model`` on the training embeddings with early stopping.

    Training runs for at most 150 epochs and stops once the training loss
    has not improved for 5 consecutive epochs.

    Parameters
    ----------
    model
        Compiled Keras model; fitted in place.
    embeddings_train
        Training inputs passed straight to ``model.fit``.
    y_train
        Integer class labels for ``embeddings_train``; one-hot encoded here.
    y_test
        Accepted only to keep the existing call signature. It is not used:
        no validation data is passed to ``fit``.
    total_class : int
        Number of classes, i.e. the width of the one-hot encoding.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    y_train_onehot = to_categorical(y_train, total_class)

    # Monitors the *training* loss, since fit() receives no validation data.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
    model.fit(embeddings_train, y_train_onehot, epochs=150, callbacks=[early_stopping])
    return model

In [9]:
def save_model(model,customer):
    """Persist ``model`` under ``<customer>_classifier_model``.

    Returns
    -------
    str
        The fixed status string ``"Model saved"``.
    """
    target_path = customer + '_classifier_model'
    model.save(target_path)
    status = "Model saved"
    return status

In [10]:
def train_classifier(customer,description_col,target_col,imbalance_percent_remove):
    """Run the full training pipeline for one customer and save the model.

    Pipeline: load the customer's data, preprocess it (which also pickles
    the label encoder), split it, embed the text, build the classifier, fit
    it and save it.

    Parameters
    ----------
    customer : str
        Customer identifier; selects the input file and prefixes the saved
        artefacts.
    description_col : str
        Name of the free-text column.
    target_col : str
        Name of the class-label column.
    imbalance_percent_remove : int or float
        Minimum class size as a percentage of the data; forwarded to
        ``preprocess_data``.

    Returns
    -------
    str
        The status string returned by ``save_model``.
    """
    training_data = fetch_training_data(customer)
    training_data = preprocess_data(training_data,description_col,target_col,customer,imbalance_percent_remove)

    # Count classes on the complete encoded label column rather than on the
    # training split, so the softmax / one-hot width always covers every
    # label the encoder produced.
    total_class = int(training_data[target_col].nunique())

    X_train, X_test, y_train, y_test = test_train_split(training_data,description_col,target_col)

    # The test embeddings are produced by the shared helper but not consumed here.
    embeddings_train, embeddings_test = create_embeddings(X_train, X_test)

    model = create_model(total_class)
    model = fiting_model(model,embeddings_train,y_train,y_test,total_class)

    return save_model(model,customer)

In [11]:
# Train and save the "Priority" classifier for the demo customer; classes
# holding no more than 2% of the rows are dropped before training.
train_classifier(
    customer="dummy_customer",
    description_col="DetailedDescription",
    target_col="Priority",
    imbalance_percent_remove=2,
)

Class imbalance control factor is set to 2% of total data. Classes containing number of records below 119.4 are removed.
train embeddings created
test embeddings created
Epoch 1/150


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
INFO:tensorflow:Assets written to: nexops_classifier_model\assets


INFO:tensorflow:Assets written to: nexops_classifier_model\assets


In [12]:
def predict_group(customer,test_data):
    """Predict the target class for one or more ticket descriptions.

    Loads ``<customer>_classifier_model`` and ``<customer>_label_encoder.pkl``
    (both written by the training pipeline), embeds the cleaned text with
    Universal Sentence Encoder v4 and maps the highest-scoring class index
    back to its original label.

    Parameters
    ----------
    customer : str
        Customer identifier used as the prefix of the saved artefacts.
    test_data : str or iterable of str
        A single ticket description, or several of them.

    Returns
    -------
    The output of ``LabelEncoder.inverse_transform``: one decoded label per
    input text, in input order (index ``[0]`` for a single string).
    """
    texts = [test_data] if isinstance(test_data, str) else list(test_data)
    # The classifier was trained on clean_text() output, so inference has to
    # apply the same normalisation or the embeddings will not match.
    cleaned_texts = [clean_text(text) for text in texts]

    model = load_model(customer+'_classifier_model')

    embedding = "https://tfhub.dev/google/universal-sentence-encoder/4"
    hub_layer = hub.KerasLayer(embedding, input_shape=[],dtype=tf.string, trainable=False)
    embeddings = hub_layer(tf.convert_to_tensor(cleaned_texts, dtype=tf.string))

    predicted_indices = np.argmax(model.predict(embeddings), axis=1)
    print(predicted_indices)

    filename = customer+"_label_encoder.pkl"
    # NOTE: unpickling can execute arbitrary code; only load encoder files
    # produced by this notebook's own training run.
    with open(filename, 'rb') as encoder_file:
        loaded_encoder = pickle.load(encoder_file)
    output_class = loaded_encoder.inverse_transform(predicted_indices)
    return output_class

In [13]:
# Classify one sample ticket with the saved model and show its decoded label.
output = predict_group(
    customer="dummy_customer",
    test_data="Delete batch #190161072 library INFILIB",
)
output[0]

[2]


'4 - Low'