In [1]:
import pandas as pd
import string
from pprint import pprint
from time import time
from string import punctuation
import sklearn.preprocessing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

In [2]:
data = pd.read_csv('TicketIncident.csv', encoding='latin-1')
data.columns = ['v2','v1']
data.shape

(48574, 2)

In [3]:
data.head()

Unnamed: 0,v2,v1
0,JOB Status from CA Workload Automation,N
1,clients need to be restarted and/or powered on,N
2,userLogin : userLogin Password Expired,Y
3,Service Catalog - Mobile Registration and Admi...,Y
4,[Boss] Can't found Income Certificate Request ...,N


In [6]:
data['v2'].replace(":"," ").head()
#data['v2'].replace([^a-zA-Z ]/g, "")


0               JOB Status from CA Workload Automation
1      clients need to be restarted and/or powered  on
2               userLogin : userLogin Password Expired
3    Service Catalog - Mobile Registration and Admi...
4    [Boss] Can't found Income Certificate Request ...
Name: v2, dtype: object

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('clf',SGDClassifier())
])

parameter = {'tfidf_use_idf':(True,False)}

grid_search = GridSearchCV(pipeline,parameter,n_jobs=-1,verbose=1)
print ('Performing grid search now..')
print ('Parameters:')
pprint(parameter)
t0=time()
grid_search.fit(df_incident_collection['short_description'],df_incident_collection['Automatable (Y/N)'])
print ('done in %0.3fs'% (time()-t0))
print()

# Sample code to remove noisy words from a text

noise_list = ["is", "a", "this", "from" "..."] 
def _remove_noise(input_text):
    words = input_text.split() 
    noise_free_words = [  ] 
    print(word)
    noise_free_text = " ".join(noise_free_words) 
    return noise_free_text

_remove_noise("this is a sample text")

import nltk
from nltk import sent_tokenize
s=sent_tokenize(df_incident_collection)
for i in s:
    print(i)


from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_incident_collection)
X_train_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, df_incident_collection)

In [4]:
# Split into train and test
from sklearn import cross_validation
data_train, data_test, labels_train, labels_test = cross_validation.train_test_split(
    data.v2,
    data.v1, 
    test_size = 0.2, 
    random_state = 42)




In [5]:
data_train.head()

40               JOB Status from CA Workload Automation
67    Service Catalog - Mobile Registration and Admi...
15                       Chain of Custody account reset
68    Can't found Income Certificate Request Form in...
88               JOB Status from CA Workload Automation
Name: v2, dtype: object

In [6]:
# Display train labels
labels_train.head()

40    N
67    Y
15    N
68    N
88    N
Name: v1, dtype: object

In [7]:
# Display test data
data_test.head()

80               JOB Status from CA Workload Automation
77    lients need to have the proper group added to ...
73         clients need to be restarted and powered  on
94    User Receiving Proofing Oversight Task with Pr...
33         clients need to be restarted and powered  on
Name: v2, dtype: object

In [8]:
# Display test labels
labels_test.head()

80    N
77    N
73    N
94    N
33    N
Name: v1, dtype: object

In [9]:
# Generate Tf-idf vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.5)
data_train_transformed = vectorizer.fit_transform(data_train)
data_test_transformed  = vectorizer.transform(data_test)

selector = SelectPercentile(f_classif, percentile=10)
selector.fit(data_train_transformed, labels_train)
data_train_transformed = selector.transform(data_train_transformed).toarray()
data_test_transformed  = selector.transform(data_test_transformed).toarray()

In [10]:
# Display the transformed data
print(data_train_transformed[:10])

[[ 0.         0.         0.         0.         0.       ]
 [ 0.4213928  0.4213928  0.4213928  0.4213928  0.4213928]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.4213928  0.4213928  0.4213928  0.4213928  0.4213928]
 [ 0.         0.         0.         0.         0.       ]
 [ 0.         0.         0.         0.         0.       ]]


In [11]:
data_train_transformed.shape

(76, 5)

In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

clf = GaussianNB()
clf.fit(data_train_transformed, labels_train)
predictions = clf.predict(data_test_transformed)

In [13]:
print (accuracy_score(labels_test, predictions))

0.8


In [14]:
import numpy as np
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 20
top_features = [features[i] for i in indices[:top_n]]
print(top_features)

['userlogin', 'added', 'administrators', 'their', 'the', 'local', 'proper', 'password', 'expired', 'lients', 'have', 'workload', 'with', 'email', 'from', 'ca', 'automation', 'oversight', 'job', 'access']
