# Recommendation DataFlow Prediction Training


In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
DATA_FILE='/content/drive/MyDrive/CaseData/casedata.csv'

In [3]:
# Import python modules
import pandas as pd
#for displaying 500 results in pandas dataframe
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import numpy as np
import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report
from sklearn.metrics import auc, roc_curve, roc_auc_score 

In [4]:
df = pd.read_csv(DATA_FILE, encoding='ISO-8859-1')
#df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.rename(columns={df.columns[12]: 'resoln_res4'})

Unnamed: 0,cas_lvl1,cas_lvl2,cas_lvl3,cas_lvl4,cas_lvl5,resoln_action1,resoln_res1,resoln_action2,resoln_res2,resoln_action3,resoln_res3,resoln_action4,resoln_res4
0,Trouble Management,All Prod No Svc,CPE No Sync,Dispatched to Premise,OOS,CheckNetworkOutage,Yes,AddCaseToOutageCase,End,,,,
1,Trouble Management,All Prod No Svc,CPE No Sync,Dispatched to Premise,OOS,CheckNetworkOutage,Yes,AddCaseToOutageCase,End,,,,
2,Trouble Management,All Prod No Svc,CPE No Sync,Dispatched to Premise,OOS,CheckNetworkOutage,Yes,AddCaseToOutageCase,End,,,,
3,Trouble Management,All Prod No Svc,CPE No Sync,Dispatched to Premise,OOS,CheckNetworkOutage,Yes,AddCaseToOutageCase,End,,,,
4,Trouble Management,All Prod No Svc,CPE No Sync,Dispatched to Premise,OOS,CheckNetworkOutage,Yes,AddCaseToOutageCase,End,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12521,Trouble Management,TV No Svc,Inside Issue,Dispatched to Premise,OOS,CheckNetworkOutage,No,RestartModemAndCheck,No,DispatchTechnician,End,,
12522,Trouble Management,TV No Svc,Inside Issue,Dispatched to Premise,OOS,CheckNetworkOutage,No,RestartModemAndCheck,No,DispatchTechnician,End,,
12523,Trouble Management,TV No Svc,Inside Issue,Dispatched to Premise,OOS,CheckNetworkOutage,No,RestartModemAndCheck,No,DispatchTechnician,End,,
12524,Trouble Management,TV No Svc,Inside Issue,Dispatched to Premise,OOS,CheckNetworkOutage,No,RestartModemAndCheck,No,DispatchTechnician,End,,


In [5]:
df.shape

(12526, 13)

In [6]:
print(df.groupby(['cas_lvl2', 'cas_lvl3' , 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5']).size())

cas_lvl2               cas_lvl3         cas_lvl3         cas_lvl4               cas_lvl5
All Prod Intermittent  CPE No Sync      CPE No Sync      Dispatched to Premise  IS          2954
All Prod No Svc        CPE No Sync      CPE No Sync      Dispatched to Premise  OOS         3309
TV No Svc              Inside Issue     Inside Issue     Dispatched to Premise  OOS         2954
                       No Pic No Sound  No Pic No Sound  PC CPE                 OOS         3309
dtype: int64


In [7]:
import pandas as pd
import numpy
from sklearn.naive_bayes import GaussianNB

In [8]:
from sklearn import preprocessing
categorical = ['cas_lvl1', 'cas_lvl2', 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5' , 'resoln_action1' , 'resoln_res1', 'resoln_action2' , 'resoln_res2' , 'resoln_action3' , 'resoln_res3','resoln_action4' ]
for feature in categorical:
        le = preprocessing.LabelEncoder()
        df[feature] = le.fit_transform(df[feature])
        print(feature)
        print(list(le.classes_))

cas_lvl1
['Trouble Management']
cas_lvl2
['All Prod Intermittent', 'All Prod No Svc', 'TV No Svc']
cas_lvl3
['CPE No Sync', 'Inside Issue', 'No Pic No Sound']
cas_lvl4
['Dispatched to Premise', 'PC CPE']
cas_lvl5
['IS', 'OOS']
resoln_action1
['CheckNetworkOutage', 'CheckRecentCaseThreshold', 'RestartModemAndCheck']
resoln_res1
['No', 'Yes']
resoln_action2
['AddCaseToOutageCase', 'CheckRecentCaseThreshold', 'CloseCase', 'DispatchTechnician', 'EscalateCase', 'RestartModemAndCheck']
resoln_res2
['End', 'No', 'Yes']
resoln_action3
['CloseCase', 'DispatchTechnician', 'EscalateCase', 'RestartModemAndCheck', nan]
resoln_res3
['End', 'No', 'Yes', nan]
resoln_action4
['CloseCase', 'DispatchTechnician', nan]


In [9]:
df.head()

Unnamed: 0,cas_lvl1,cas_lvl2,cas_lvl3,cas_lvl4,cas_lvl5,resoln_action1,resoln_res1,resoln_action2,resoln_res2,resoln_action3,resoln_res3,resoln_action4,Unnamed: 12
0,0,1,0,0,1,0,1,0,0,4,3,2,
1,0,1,0,0,1,0,1,0,0,4,3,2,
2,0,1,0,0,1,0,1,0,0,4,3,2,
3,0,1,0,0,1,0,1,0,0,4,3,2,
4,0,1,0,0,1,0,1,0,0,4,3,2,


In [10]:
# split the data into training and test data
# initialise Gaussian Naive Bayes

X1 = df[['cas_lvl1', 'cas_lvl2', 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5']]
y1 = df[['resoln_action1']]
print(X1.shape)


X2 = df[['cas_lvl1', 'cas_lvl2', 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5', 'resoln_action1' , 'resoln_res1']]
y2 = df[['resoln_action2']]
print(X2.shape)


X3 = df[['cas_lvl1', 'cas_lvl2', 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5' , 'resoln_action1' , 'resoln_res1', 'resoln_action2' , 'resoln_res2']]
y3 = df[['resoln_action3']]
print(X3.shape)

X4 = df[['cas_lvl1', 'cas_lvl2', 'cas_lvl3' , 'cas_lvl4' , 'cas_lvl5' , 'resoln_action1' , 'resoln_res1', 'resoln_action2' , 'resoln_res2' , 'resoln_action3' , 'resoln_res3']]
y4 = df[['resoln_action4']]
print(X4.shape)


#Split the dataset to 80% training and 20% testing
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.20, random_state = 101)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.20, random_state = 101)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.20, random_state = 101)
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size = 0.20, random_state = 101)

(12526, 5)
(12526, 7)
(12526, 9)
(12526, 11)


In [11]:
naive_b_resoln1_model = GaussianNB()
naive_b_resoln2_model = GaussianNB()
naive_b_resoln3_model = GaussianNB()
naive_b_resoln4_model = GaussianNB()

naive_b_resoln1_model.fit(X1_train,y1_train)
y1_pred = naive_b_resoln1_model.predict(X1_test)

naive_b_resoln2_model.fit(X2_train,y2_train)
y2_pred = naive_b_resoln2_model.predict(X2_test)

naive_b_resoln3_model.fit(X3_train,y3_train)
y3_pred = naive_b_resoln3_model.predict(X3_test)

naive_b_resoln4_model.fit(X4_train,y4_train)
y4_pred = naive_b_resoln4_model.predict(X4_test)

In [12]:
import pickle
filename = 'naive_b_resoln1_model.sav'
pickle.dump(naive_b_resoln1_model, open(filename, 'wb'))

filename = 'naive_b_resoln2_model.sav'
pickle.dump(naive_b_resoln2_model, open(filename, 'wb'))

filename = 'naive_b_resoln3_model.sav'
pickle.dump(naive_b_resoln3_model, open(filename, 'wb'))

filename = 'naive_b_resoln4_model.sav'
pickle.dump(naive_b_resoln4_model, open(filename, 'wb'))