# Document classification Problem

In [46]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
#reading the Input Data
data_frame = pd.read_csv("shuffled-full-set-hashed.csv",header=None);
data_frame.head()

Unnamed: 0,0,1
0,DELETION OF INTEREST,e04a09c87692 d6b72e591b91 5d066f0246f1 ed41171...
1,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
2,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
3,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
4,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...


In [49]:
# getting the length of document including NAN values in the main CSV File
len(data_frame)

62204

In [50]:
# Dropping all the NAN values in CSV File
data_frame.dropna(inplace = True)
#Printing the updated length
len(data_frame)

62159

In [51]:
# creating a List of Labels in CSV
unique_labels = set(data_frame[0].values.tolist())
unique_labels

{'APPLICATION',
 'BILL',
 'BILL BINDER',
 'BINDER',
 'CANCELLATION NOTICE',
 'CHANGE ENDORSEMENT',
 'DECLARATION',
 'DELETION OF INTEREST',
 'EXPIRATION NOTICE',
 'INTENT TO CANCEL NOTICE',
 'NON-RENEWAL NOTICE',
 'POLICY CHANGE',
 'REINSTATEMENT NOTICE',
 'RETURNED CHECK'}

In [52]:
# All labels in Document
labels = data_frame[0].values.tolist()

In [53]:
# All encoded text keyword in document
text = data_frame[1].values.tolist()

In [54]:
# Creating training and testing using train_test_split model
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.3)

In [55]:
# Append all the encoded words together
all_words = []
for string in X_train:
    all_words+= (string.split(" "))

In [56]:
# Frequency Calculation 
freq = nltk.FreqDist(all_words)

In [57]:
# Picking the top 250 frequent Words
common = freq.most_common(250)

In [58]:
common


[('586242498a88', 241930),
 ('d38820625542', 213243),
 ('6ce6cc5a3203', 166241),
 ('b9699ce57810', 126296),
 ('21e314d3afcc', 122579),
 ('25c57acdf805', 110253),
 ('1015893e384a', 102507),
 ('133d46f7ed38', 91357),
 ('036087ac04f9', 88680),
 ('641356219cbc', 87977),
 ('54709b24b45f', 86518),
 ('6ca2dd348663', 82418),
 ('c337a85b8ef9', 80908),
 ('1b6d0614f2c7', 78595),
 ('6bf9c0cb01b4', 78458),
 ('f95d0bea231b', 75182),
 ('0562c756a2f2', 74764),
 ('f7ae6f8257da', 74651),
 ('6b304aabdcee', 73643),
 ('7d9e333a86da', 72994),
 ('97b6014f9e50', 71718),
 ('b136f6349cf3', 70736),
 ('eeb86a6a04e4', 70458),
 ('10e45001c2f2', 68106),
 ('422068f04236', 67091),
 ('b208ae1e8232', 66348),
 ('b73e657498f2', 62479),
 ('1068682ce752', 59892),
 ('6b343f522f78', 58202),
 ('8f75273e5510', 57265),
 ('6af770640118', 57127),
 ('9bc65adc033c', 55661),
 ('ce1f034abb5d', 52389),
 ('26f768da5068', 51799),
 ('e943e5e5b779', 51193),
 ('04503bc22789', 51051),
 ('5e99d31d8fa4', 46491),
 ('9cdf4a63deb0', 45362),
 ('98

In [59]:
# Features Extraction
features = [i[0] for i in common]
features

['586242498a88',
 'd38820625542',
 '6ce6cc5a3203',
 'b9699ce57810',
 '21e314d3afcc',
 '25c57acdf805',
 '1015893e384a',
 '133d46f7ed38',
 '036087ac04f9',
 '641356219cbc',
 '54709b24b45f',
 '6ca2dd348663',
 'c337a85b8ef9',
 '1b6d0614f2c7',
 '6bf9c0cb01b4',
 'f95d0bea231b',
 '0562c756a2f2',
 'f7ae6f8257da',
 '6b304aabdcee',
 '7d9e333a86da',
 '97b6014f9e50',
 'b136f6349cf3',
 'eeb86a6a04e4',
 '10e45001c2f2',
 '422068f04236',
 'b208ae1e8232',
 'b73e657498f2',
 '1068682ce752',
 '6b343f522f78',
 '8f75273e5510',
 '6af770640118',
 '9bc65adc033c',
 'ce1f034abb5d',
 '26f768da5068',
 'e943e5e5b779',
 '04503bc22789',
 '5e99d31d8fa4',
 '9cdf4a63deb0',
 '98d0d51b397c',
 '4357c81e10c1',
 'de9738ee8b24',
 '4e5019f629a9',
 '93790ade6682',
 '6a01047db3ab',
 'a31962fbd5f3',
 'f0666bdbc8a5',
 '48d657cd9861',
 '6365c4563bd1',
 '5c02c2aaa67b',
 '4ad52689d690',
 '46c88d9303da',
 '36e7aa72ffe1',
 'fe3fe35491b4',
 '5ee06767bc0f',
 '557ec6c63cf9',
 '6bff0c8c1185',
 '1ab34730c1e0',
 'cbfb3eb99bea',
 'ba02159e05b1

In [60]:
# Write the features set to disk using pickle which will be used to send the code via flask to AWS EC2 server
import pickle
# save the features
with open('features.pkl', 'wb') as fid:
    pickle.dump(features, fid)

In [61]:
# Function to Get Features of every Row and returns Dictionary which represents, how many frequent words are present in that row 
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [62]:
# Recreating the training set and testing set in forms of an array
training_set = []
for i in range(len(X_train)):
    training_set.append(((X_train[i]),y_train[i]))
testing_set = []
for i in range(len(X_test)):
    testing_set.append(((X_test[i]),y_test[i]))

In [63]:
# An Array of Dictionary of Training Set
# Dictionary has the key of features where values represent if the encoded text is present in this row or not.
# With correct Labels append on the array
training_data = [(get_feature_dict(string.split(" ")), category) for string,category in training_set]

In [64]:
# An Array of Dictionary of Testing Set
# Dictionary has the key of features where values represent if the encoded text is present in this row or not.
# With correct Labels append on the array
testing_data = [(get_feature_dict(string.split(" ")), category) for string, category in testing_set]

Random Search Training For Best Parameters

In [65]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
from pprint import pprint
print('Parameters currently in use:\n')
pprint(rfc.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


Random Hyperparameter Grid

In [66]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}


Random Search Training

In [67]:
# Function to get Feature Array of Any Row in document
def get_features_Array(words):
    current_features = []
    words_set = set(words)
    for w in features:
        current_features.append(w in words_set)
    return current_features

In [68]:
All_Features_X = [(get_features_Array(string.split(" "))) for string in X_train]

In [69]:
# from sklearn.model_selection import RandomizedSearchCV
# # Use the random grid to search for best hyperparameters

In [70]:
from nltk.classify.scikitlearn import SklearnClassifier
from time import time

In [71]:
# Starting Time
start = time()
# Classifier with Best Params Value found in 
rfc = RandomForestClassifier(n_estimators=90,min_samples_split=5,min_samples_leaf=2,max_features='auto',bootstrap=False)
RFC_classifier = SklearnClassifier(rfc)

In [72]:
RFC_classifier.train(training_data)
print((time() - start))

308.90891313552856


In [73]:
start = time()
print("Random Forest accuracy percent:", (nltk.classify.accuracy(RFC_classifier, testing_data))*100)
print((time() - start))

Random Forest accuracy percent: 86.72779922779922
3.0430431365966797


In [74]:
start = time()
rfc_without_params = RandomForestClassifier()
RFC_classifier_no_params = SklearnClassifier(rfc_without_params)
RFC_classifier_no_params.train(training_data)
print((time() - start))
start = time()
print("Random Forest accuracy without params percent:", (nltk.classify.accuracy(RFC_classifier_no_params, testing_data))*100)
print((time() - start))

337.41733407974243
Random Forest accuracy without params percent: 86.44894894894894
3.389662027359009


Saving Trained Classifier To Disk so that it can be trnsferred to EC2

In [75]:
# save the classifier
with open('rfc.pkl', 'wb') as fid:
    pickle.dump(RFC_classifier, fid)
with open('rfc1.pkl', 'wb') as fid:
    pickle.dump(RFC_classifier_no_params, fid)

In [76]:
# Ignoring SVM since it is taking to much time on testing and Training
#from sklearn.svm import SVC
#from nltk.classify.scikitlearn import SklearnClassifier

In [77]:
#start = time()
#svc = SVC(gamma=0.001, C=10)
#SVC_classifier = SklearnClassifier(svc)

In [78]:
#Ignorong SVM since it is taking 
#SVC_classifier.train(training_data)

In [79]:
#print("SVM accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_data))*100)
#print((time() - start))

In [83]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [84]:
start = time()
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_data)
print((time() - start))
start = time()
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_data))*100)
print((time() - start))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


21.168360233306885
LogisticRegression_classifier accuracy percent: 83.59073359073359
2.1799089908599854


In [85]:
start = time()
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_data)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_data))*100)
print((time() - start))



LinearSVC_classifier accuracy percent: 82.86679536679536
125.66774773597717


Confusion Matrix

In [89]:
# Getting Array of Dictionary of Testing Data Set
# Dictionary Represent the Features of Testing Set
testing_dict_X = [(get_feature_dict(string.split(" "))) for string, category in testing_set]

In [95]:
# Getting Array of Labels on Testing Set
testing_Y = [category for string, category in testing_set]

In [99]:
# Predicted Value From Classifier
y_pred = RFC_classifier.classify_many(testing_dict_X)

# Printing the values of output which are predicted as Y_pred on test data

In [98]:
y_pred

['DELETION OF INTEREST',
 'POLICY CHANGE',
 'BILL',
 'DELETION OF INTEREST',
 'BILL',
 'DELETION OF INTEREST',
 'POLICY CHANGE',
 'CANCELLATION NOTICE',
 'CANCELLATION NOTICE',
 'POLICY CHANGE',
 'CANCELLATION NOTICE',
 'POLICY CHANGE',
 'CANCELLATION NOTICE',
 'BILL',
 'POLICY CHANGE',
 'REINSTATEMENT NOTICE',
 'BILL',
 'BILL',
 'BILL',
 'CANCELLATION NOTICE',
 'BINDER',
 'BILL',
 'POLICY CHANGE',
 'BINDER',
 'BILL',
 'BILL',
 'POLICY CHANGE',
 'CANCELLATION NOTICE',
 'EXPIRATION NOTICE',
 'BILL',
 'BILL',
 'BILL',
 'BILL',
 'BINDER',
 'POLICY CHANGE',
 'CANCELLATION NOTICE',
 'BILL',
 'BILL',
 'BILL',
 'DELETION OF INTEREST',
 'CANCELLATION NOTICE',
 'CANCELLATION NOTICE',
 'BILL',
 'BILL',
 'CANCELLATION NOTICE',
 'CANCELLATION NOTICE',
 'EXPIRATION NOTICE',
 'CANCELLATION NOTICE',
 'CANCELLATION NOTICE',
 'BILL',
 'POLICY CHANGE',
 'BILL',
 'BILL',
 'POLICY CHANGE',
 'BILL',
 'BILL',
 'BILL',
 'POLICY CHANGE',
 'BILL',
 'CANCELLATION NOTICE',
 'DELETION OF INTEREST',
 'BILL',
 'CAN