In [3]:
import pandas as pd
import numpy as np
from collections import Counter

In [4]:
date_parser = lambda c: pd.to_datetime(c, format='%d/%m/%Y %H:%M:%S', errors='coerce')
data = pd.read_csv('ITSM_data.csv', parse_dates=['Open_Time','Reopen_Time','Close_Time','Resolved_Time'])

In [5]:
data.head(2)

Unnamed: 0,CI_Name,CI_Cat,CI_Subcat,WBS,Incident_ID,Status,Impact,Urgency,Priority,Unnamed: 9,...,Reopen_Time,Resolved_Time,Close_Time,Handle_Time_hrs,Closure_Code,No_of_Related_Interactions,Related_Interaction,No_of_Related_Incidents,No_of_Related_Changes,Related_Change
0,SUB000508,subapplication,Web Based Application,WBS000162,IM0000004,Closed,4,4,4.0,0.820967,...,NaT,2013-04-11 13:50:27,2013-04-11 13:51:17,3871691111,Other,1.0,SD0000007,2.0,,
1,WBA000124,application,Web Based Application,WBS000088,IM0000005,Closed,3,3,3.0,0.936566,...,2013-02-12 12:31:02,2013-02-12 12:36:21,2013-02-12 12:36:25,4354786389,Software,1.0,SD0000011,1.0,,


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46606 entries, 0 to 46605
Data columns (total 26 columns):
CI_Name                       46606 non-null object
CI_Cat                        46495 non-null object
CI_Subcat                     46495 non-null object
WBS                           46606 non-null object
Incident_ID                   46606 non-null object
Status                        46606 non-null object
Impact                        46606 non-null object
Urgency                       46606 non-null object
Priority                      45226 non-null float64
Unnamed: 9                    46606 non-null float64
Category                      46606 non-null object
KB_number                     46606 non-null object
Alert_Status                  46606 non-null object
No_of_Reassignments           46605 non-null float64
Unnamed: 14                   46606 non-null int64
Open_Time                     46606 non-null datetime64[ns]
Reopen_Time                   2284 non-null datet

In [7]:
Counter(data.No_of_Reassignments).most_common(20)

[(0.0, 27468),
 (1.0, 7268),
 (2.0, 5378),
 (3.0, 2191),
 (4.0, 1606),
 (5.0, 721),
 (6.0, 622),
 (7.0, 329),
 (8.0, 246),
 (9.0, 170),
 (10.0, 146),
 (11.0, 101),
 (13.0, 61),
 (12.0, 58),
 (14.0, 47),
 (15.0, 43),
 (17.0, 27),
 (16.0, 25),
 (18.0, 14),
 (21.0, 11)]

In [8]:
data.No_of_Reassignments[2].dtype

dtype('float64')

In [9]:
data.loc[data.No_of_Reassignments>3.0,'reassignment'] ='high'
data.loc[data.No_of_Reassignments<=3.0,'reassignment'] ='low'

In [10]:
data.head()

Unnamed: 0,CI_Name,CI_Cat,CI_Subcat,WBS,Incident_ID,Status,Impact,Urgency,Priority,Unnamed: 9,...,Resolved_Time,Close_Time,Handle_Time_hrs,Closure_Code,No_of_Related_Interactions,Related_Interaction,No_of_Related_Incidents,No_of_Related_Changes,Related_Change,reassignment
0,SUB000508,subapplication,Web Based Application,WBS000162,IM0000004,Closed,4,4,4.0,0.820967,...,2013-04-11 13:50:27,2013-04-11 13:51:17,3871691111,Other,1.0,SD0000007,2.0,,,high
1,WBA000124,application,Web Based Application,WBS000088,IM0000005,Closed,3,3,3.0,0.936566,...,2013-02-12 12:36:21,2013-02-12 12:36:25,4354786389,Software,1.0,SD0000011,1.0,,,high
2,DTA000024,application,Desktop Application,WBS000092,IM0000006,Closed,NS,3,,0.069016,...,2014-01-13 15:12:38,2014-01-13 15:13:11,4843119444,No error - works as designed,1.0,SD0000017,,,,low
3,WBA000124,application,Web Based Application,WBS000088,IM0000011,Closed,4,4,4.0,0.936214,...,2013-11-14 09:31:09,2013-11-14 09:31:24,4321833333,Operator error,1.0,SD0000025,,,,high
4,WBA000124,application,Web Based Application,WBS000088,IM0000012,Closed,4,4,4.0,0.062957,...,2013-08-11 13:55:49,2013-08-11 13:55:52,3383903333,Other,1.0,SD0000029,,,,low


In [11]:
Counter(data.reassignment)

Counter({'high': 4300, 'low': 42305, nan: 1})

In [12]:
data = data[data.reassignment.isnull()==False]

In [13]:
Counter(data.reassignment)

Counter({'high': 4300, 'low': 42305})

In [14]:
selected = data.loc[:,['CI_Cat','CI_Subcat','WBS','Category','reassignment']]
selected.dropna(inplace=True)

In [15]:
selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46494 entries, 0 to 46605
Data columns (total 5 columns):
CI_Cat          46494 non-null object
CI_Subcat       46494 non-null object
WBS             46494 non-null object
Category        46494 non-null object
reassignment    46494 non-null object
dtypes: object(5)
memory usage: 2.1+ MB


In [16]:
X = selected.loc[:,['CI_Cat','CI_Subcat','WBS','Category']]
y = selected.reassignment


In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [18]:
enc = LabelEncoder()
X.CI_Cat = enc.fit_transform(X.CI_Cat)
X.CI_Subcat = enc.fit_transform(X.CI_Subcat)
X.WBS = enc.fit_transform(X.WBS)
X.Category = enc.fit_transform(X.Category)

In [19]:
X.shape

(46494, 4)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,random_state=0)


In [21]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [24]:
X_train = X_train.astype('float')

In [25]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_sample(X_train,y_train)

In [27]:
model=RandomForestClassifier()
model.fit(X_train_smote,y_train_smote)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))



[[ 676  397]
 [2733 7818]]
0.7307295251204404
              precision    recall  f1-score   support

        high       0.20      0.63      0.30      1073
         low       0.95      0.74      0.83     10551

    accuracy                           0.73     11624
   macro avg       0.57      0.69      0.57     11624
weighted avg       0.88      0.73      0.78     11624



### Trying one hot encoding for better efficiency

In [24]:
selected = data.loc[:,['CI_Cat','CI_Subcat','WBS','Category','reassignment']]
selected.dropna(inplace=True)
X = selected.loc[:,['CI_Cat','CI_Subcat','WBS','Category']]
y = selected.reassignment
X = pd.get_dummies(X)

In [25]:
X.shape

(46494, 351)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,random_state=0)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [27]:
model=RandomForestClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
accuracy_score(y_test,y_pred)

[[   7  666    0]
 [   3 9987   14]
 [   3  939    5]]


0.86020302821748107

#### Not much improvement through one hot encoding