## Install the required libraries

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame 
import nltk

In [2]:
alerts = pd.read_csv("classified_alerts.csv")

In [3]:
cls = set(alerts['class'])

In [4]:
print(cls)

{'c.f.j.d.e.unrecognizedpropertyexception.from', 'a.c.t.b.r.s.e.customexceptionhandler.handlehttpstatuscodeexception', 'a.c.t.b.r.c.l.loggingaspect', 'a.c.t.b.r.c.u.commonutils', 'o.s.w.c.httpservererrorexception.badgateway', 'o.s.w.c.httpservererrorexception.internalservererror', 'a.c.t.b.r.s.e.customexceptionhandler', 'j.lang.nullpointerexception', 'o.json.jsonexception', 'o.s.w.c.httpservererrorexception.gatewaytimeout', 'a.c.t.b.r.s.m.fromtarget.targetresponsedata'}


In [5]:
class_lst = list(cls)
class_lst.index(class_lst[3])

3

In [6]:
len(alerts)

10912

In [7]:
alerts.columns

Index(['appname', 'alertid', 'class', 'words'], dtype='object')

In [8]:
# Convert class to number
def category2Num(cat):
    global class_lst
    return class_lst.index(cat)
    
alerts['category_num'] = alerts['class'].apply(category2Num)

In [9]:
alerts.head()

Unnamed: 0,appname,alertid,class,words,category_num
0,spg-service-request-inbound-consumer,a1642502490828,a.c.t.b.r.s.e.customexceptionhandler,info actbrsecustomexceptionhandler requestid ...,6
1,spg-service-request-inbound-consumer,a1642502490829,a.c.t.b.r.s.e.customexceptionhandler,info actbrsecustomexceptionhandler customexce...,6
2,spg-service-request-inbound-consumer,a1642502490838,o.s.w.c.httpservererrorexception.badgateway,oswchttpservererrorexceptionbadgateway 502 bad...,4
3,spg-service-request-inbound-consumer,a1642502490888,a.c.t.b.r.s.e.customexceptionhandler,info actbrsecustomexceptionhandler customexce...,6
4,spg-service-request-inbound-consumer,a1642502490889,a.c.t.b.r.s.e.customexceptionhandler,info actbrsecustomexceptionhandler requestid ...,6


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [11]:
from string import punctuation

In [12]:
from imblearn.over_sampling import SMOTE

In [13]:
alerts = alerts.drop(['appname', 'alertid', 'class'], axis=1)
alerts.columns

Index(['words', 'category_num'], dtype='object')

In [14]:
alerts['category_num'].value_counts()

10    4262
1     3595
0     1287
6      544
5      340
3      272
4      204
7      136
9      136
2       68
8       68
Name: category_num, dtype: int64

In [15]:
X = alerts['words']
y = alerts['category_num']

In [16]:
print(X)

0        info actbrsecustomexceptionhandler  requestid ...
1        info actbrsecustomexceptionhandler  customexce...
2        oswchttpservererrorexceptionbadgateway 502 bad...
3        info actbrsecustomexceptionhandler  customexce...
4        info actbrsecustomexceptionhandler  requestid ...
                               ...                        
10907    cfjdexcunrecognizedpropertyexception unrecogni...
10908    warn oswsmmaexceptionhandlerexceptionresolver ...
10909    cfjdexcunrecognizedpropertyexception unrecogni...
10910    cfjdexcunrecognizedpropertyexception unrecogni...
10911    warn oswsmmaexceptionhandlerexceptionresolver ...
Name: words, Length: 10912, dtype: object


In [17]:
alerts.head()

Unnamed: 0,words,category_num
0,info actbrsecustomexceptionhandler requestid ...,6
1,info actbrsecustomexceptionhandler customexce...,6
2,oswchttpservererrorexceptionbadgateway 502 bad...,4
3,info actbrsecustomexceptionhandler customexce...,6
4,info actbrsecustomexceptionhandler requestid ...,6


In [18]:
my_stop_words = set(['info', 'of', 'and', 'cannot', 'warn', 'is', 'not', 'out', 'doctype', 'html', \
                     'failed', 'failure', 'handle', 'request', 'response', 'throwing', 'line', 'get', \
                     'exception', 'detail', 'details', 'be', 'to', 'in', 'public', 'throws', 'known', \
                     'at', 'from', 'with', 'class', 'marked', 'as', 'it', 'this', 'return', 'returning', \
                     'notfound', 'type', 'title'])

In [19]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= my_stop_words)

In [20]:
#Handle class imbalance
X_features = tfidf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=67)
sm = SMOTE()
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
X_test_smote, y_test_smote = sm.fit_resample(X_test, y_test)

In [21]:
def class_balanced_model_fit(X_train_smote, y_train_smote, X_test_smote, y_test_smote, ml_model, coef_show):
    classification = ml_model.fit(X_train_smote, y_train_smote)
    classification_pred = classification.predict(X_test_smote)
    accuracy = classification.score(X_test_smote, y_test_smote)
    model_performance = classification_report(y_test_smote, classification_pred)
    validation_pred_proba_grad = classification.predict_proba(X_test_smote)
    print(validation_pred_proba_grad)
    #roc_auc = roc_auc_score(y_test_smote, validation_pred_proba_grad[:,1])
    
    print("***** Accuracy of the classification model: ", accuracy, " *******")
    print('')
    print(model_performance)
    #print('')
    #print("***** ROC_AUC score: ", roc_auc, " *******")
    #print("*************************************************")
    
    if coef_show == 1:
        featureNames = tfidf.get_feature_names()
        coef = classification.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : featureNames, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print("************ Top 10 positive features (variables) ************")
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print("************ Top 10 negative features (variables) ************")        
        print(coeff_df.tail(20).to_string(index=False))
    
    return classification

In [22]:
LRClassModel = class_balanced_model_fit(X_train_smote, y_train_smote, 
                                     X_test_smote, y_test_smote, 
                                     LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500),
                                     coef_show=1)

[[2.33621670e-04 9.97769839e-01 2.08749967e-04 ... 2.17464771e-04
  2.33621670e-04 2.03919809e-04]
 [2.33621670e-04 9.97769839e-01 2.08749967e-04 ... 2.17464771e-04
  2.33621670e-04 2.03919809e-04]
 [2.03880897e-04 2.03880897e-04 2.25575512e-04 ... 1.86906857e-04
  2.03880897e-04 9.97582254e-01]
 ...
 [2.33621670e-04 2.33621670e-04 2.08749967e-04 ... 2.17464771e-04
  9.97769839e-01 2.03919809e-04]
 [2.33621670e-04 2.33621670e-04 2.08749967e-04 ... 2.17464771e-04
  9.97769839e-01 2.03919809e-04]
 [2.33621670e-04 2.33621670e-04 2.08749967e-04 ... 2.17464771e-04
  9.97769839e-01 2.03919809e-04]]
***** Accuracy of the classification model:  1.0  *******

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       869
           1       1.00      1.00      1.00       869
           2       1.00      1.00      1.00       869
           3       1.00      1.00      1.00       869
           4       1.00      1.00      1.00       869
           5    

In [23]:
type(LRClassModel)

sklearn.linear_model._logistic.LogisticRegression

In [24]:
import joblib
joblib.dump(LRClassModel, 'LRClassModel.pkl')

LRClassModel_from_pkl = joblib.load('LRClassModel.pkl')
print(LRClassModel_from_pkl)

LogisticRegression(max_iter=500, multi_class='multinomial')


In [25]:
X_test_smote

<9559x62 sparse matrix of type '<class 'numpy.float64'>'
	with 61148 stored elements in Compressed Sparse Row format>

In [26]:
#Test some random text for classification - Do reviews classification as good or bad sentiment. 
Xfeatu = tfidf.transform(
['cfjdexcunrecognizedpropertyexception unrecognized field error class actbrsmfromtargettargetresponsedata not marked as ignorable 4 known properties status code message incidentid', 
'warn oswsmmaexceptionhandlerexceptionresolver  failure in exceptionhandler public oshttpresponseentity jlangobject actbrsecustomexceptionhandlerhandlehttpstatuscodeexception oswclienthttpstatuscodeexception throws jioioexception',
'info actbrsecustomexceptionhandler  customexceptionhandler handlecustomexception is returning 404 notfound not found errorresource type title badrequest status 400 message cannot deserialize instance of jlangstring out of startarray token'])


In [27]:
print(LRClassModel.predict(Xfeatu))

[10  1  6]


In [28]:
print(LRClassModel_from_pkl.predict(Xfeatu))

[10  1  6]


### END OF TELSTRA CODE