In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re

In [4]:
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Module 3 Mentor deck - NLP/Module 3 Week 2 Mentor deck NLP/case study/complaints.csv')
df.head()

Unnamed: 0,Consumer complaint narrative,Product
0,I have outdated information on my credit repor...,Credit reporting
1,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
2,An account on my credit report has a mistaken ...,Credit reporting
3,This company refuses to provide me verificatio...,Debt collection
4,This complaint is in regards to Square Two Fin...,Debt collection


### Tokenizer
Regular expression based tokenizers to consider only alphabetical sequences and ignore numerical sequences.

In [0]:
def complaint_to_words(comp):
    
    words = RegexpTokenizer('\w+').tokenize(comp)
    words = [re.sub(r'([xx]+)|([XX]+)|(\d+)', '', w).lower() for w in words]
    words = list(filter(lambda a: a != '', words))
    
    return words

### Vocabulary

In [0]:
all_words = list()
for comp in df['Consumer complaint narrative']:
    for w in complaint_to_words(comp):
        all_words.append(w)

In [7]:
print('Size of vocabulary: {}'.format(len(set(all_words))))

Size of vocabulary: 76908


In [8]:
print('Complaint\n', df['Consumer complaint narrative'][10], '\n')
print('Tokens\n', complaint_to_words(df['Consumer complaint narrative'][10]))

Complaint
 Without provocation, I received notice that my credit line was being decreased by nearly 100 %. My available credit was reduced from $ XXXX to XXXX ( the rough amount of my available balance ). 

When I called to question the change, I was provided a nob-descript response referencing my XXXX report. It was my understanding that under the FCRA I was entitled to a copy of this report, but was refused by Citi and have been given no further explanation. 

This is predatory in that it affects my utilization of credit, further subjecting me to increase in APrs, etc and a higher cost of credit without any reason. 

Tokens
 ['without', 'provocation', 'i', 'received', 'notice', 'that', 'my', 'credit', 'line', 'was', 'being', 'decreased', 'by', 'nearly', 'my', 'available', 'credit', 'was', 'reduced', 'from', 'to', 'the', 'rough', 'amount', 'of', 'my', 'available', 'balance', 'when', 'i', 'called', 'to', 'question', 'the', 'change', 'i', 'was', 'provided', 'a', 'nob', 'descript', 'resp

### Indexing
Indexing each word by assigning it a unique number

In [0]:
index_dict = dict()
count = 1
index_dict['<unk>'] = 0
for word in set(all_words):
    index_dict[word] = count
    count += 1

### Dataset
Utilizing indexed words to replace words by index. This makes the dataset numerical and keras readable.

In [0]:
embeddings_index = {}
f = open('/content/drive/My Drive/Module 3 Mentor deck - NLP/Module 3 Week 2 Mentor deck NLP/case study/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [0]:
data_list = list()
for comp in df['Consumer complaint narrative']:
    sentence = np.zeros(300)
    count = 0
    for w in complaint_to_words(comp):
        try:
            sentence += embeddings_index[w]
            count += 1
        except KeyError:
            continue
    data_list.append(sentence / count)

In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['Product'])
df['Target'] = le.transform(df['Product'])
df.head()

Unnamed: 0,Consumer complaint narrative,Product,Target
0,I have outdated information on my credit repor...,Credit reporting,5
1,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan,2
2,An account on my credit report has a mistaken ...,Credit reporting,5
3,This company refuses to provide me verificatio...,Debt collection,7
4,This complaint is in regards to Square Two Fin...,Debt collection,7


### One hot Encoding

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(np.array(data_list), df.Target.values, 
    test_size=0.15, random_state=0)

In [18]:
print(X_train.shape)

(152809, 300)


In [19]:
print(y_train.shape)

(152809,)


In [20]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
clf = BernoulliNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(accuracy_score(y_test, pred))

0.4839618793340008
