In [None]:
import os
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

!pip install kaggle



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/TextClassificationAttention
#!kaggle datasets download adityajn105/glove6b50d
#!kaggle datasets download dushyantv/consumer_complaints

/content/drive/MyDrive/TextClassificationAttention


In [None]:
#!unzip consumer_complaints.zip
#!unzip glove6b50d.zip

In [None]:
data = pd.read_csv("/content/drive/MyDrive/TextClassificationAttention/Consumer_Complaints.csv")
print(data.columns)
print(data.head())


Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer Complaint', 'Company Public Response', 'Company', 'State',
       'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via',
       'Date Sent to Company', 'Company Response to Consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID',
       'Unnamed: 18'],
      dtype='object')
  Date received           Product     Sub-product  \
0    03-12-2014          Mortgage  Other mortgage   
1    10-01-2016  Credit reporting             NaN   
2    10/17/2016     Consumer Loan    Vehicle loan   
3    06-08-2014       Credit card             NaN   
4    09/13/2014   Debt collection     Credit card   

                                      Issue                   Sub-issue  \
0  Loan modification,collection,foreclosure                         NaN   
1    Incorrect information on credit report              Account status   
2                Managing the loan or lease                  

# Pre processing

In [None]:
lr = 0.0005
vec_len = 50
seq_len = 20
num_epochs = 50
label_col = "Product"
tokens_path = "/content/drive/MyDrive/TextClassificationAttention/tokens.pkl"
labels_path = "/content/drive/MyDrive/TextClassificationAttention/labels.pkl"
data_path = "/content/drive/MyDrive/TextClassificationAttention/Consumer_Complaints.csv"
model_path = "/content/drive/MyDrive/TextClassificationAttention/attention.pth"
vocabulary_path = "/content/drive/MyDrive/TextClassificationAttention/vocabulary.pkl"
embeddings_path = "/content/drive/MyDrive/TextClassificationAttention/embeddings.pkl"
glove_vector_path = "/content/drive/MyDrive/TextClassificationAttention/glove.6B.50d.txt"
text_col_name = "Consumer Complaint"
label_encoder_path = "/content/drive/MyDrive/TextClassificationAttention/label_encoder.pkl"
product_map = {'Vehicle loan or lease': 'vehicle_loan',
               'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
               'Credit card or prepaid card': 'card',
               'Money transfer, virtual currency, or money service': 'money_transfer',
               'virtual currency': 'money_transfer',
               'Mortgage': 'mortgage',
               'Payday loan, title loan, or personal loan': 'loan',
               'Debt collection': 'debt_collection',
               'Checking or savings account': 'savings_account',
               'Credit card': 'card',
               'Bank account or service': 'savings_account',
               'Credit reporting': 'credit_report',
               'Prepaid card': 'card',
               'Payday loan': 'loan',
               'Other financial service': 'others',
               'Virtual currency': 'money_transfer',
               'Student loan': 'loan',
               'Consumer Loan': 'loan',
               'Money transfers': 'money_transfer'}

In [None]:
def save_file(name, obj):
    """
    Serialize an object with pickle and write it to a file.

    :param name: Destination path; the file is opened in binary write mode
    :param obj: Any picklable Python object
    """
    with open(name, mode='wb') as handle:
        pickle.Pickler(handle).dump(obj)


def load_file(name):
    """
    Load and return the object stored in a pickle file.

    :param name: Path of the pickle file to read
    :return: The unpickled Python object
    """
    # NOTE: unpickling can execute arbitrary code; only open files written by
    # this notebook (see save_file), never files from an untrusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(name, "rb") as f:
        return pickle.load(f)

# GloVe embeddings

In [None]:
# The GloVe vocabulary contains non-ASCII tokens; name the encoding explicitly
# so the read does not depend on the platform's default locale.
with open(glove_vector_path, "rt", encoding="utf-8") as f:
    emb = f.readlines()

In [None]:
# Each line is whitespace separated: the first field is the word, the
# remaining fields are the components of its vector (kept as strings here).
vocabulary, embeddings = [], []

for line in emb:
    fields = line.split()
    vocabulary.append(fields[0])
    embeddings.append(fields[1:])

In [None]:
embeddings = np.array(embeddings, dtype=np.float32)

In [None]:
vocabulary = ["<pad>", "<unk>"] + vocabulary

In [None]:
# Prepend the vectors of the two special tokens placed at the front of the
# vocabulary: row 0 ("<pad>") is all ones, row 1 ("<unk>") is the mean of all
# GloVe vectors. The width comes from the loaded matrix instead of a
# hard-coded 50, so other GloVe dimensionalities work unchanged.
embeddings = np.vstack([np.ones(embeddings.shape[1], dtype=np.float32),
                        np.mean(embeddings, axis=0),
                        embeddings])

In [None]:
save_file(embeddings_path, embeddings)
save_file(vocabulary_path, vocabulary)

# Process text

In [None]:
data = pd.read_csv(data_path)

In [None]:
# Keep only the rows that contain complaint text.
data = data.dropna(subset=[text_col_name])

In [None]:
# Collapse the raw product names into the coarser classes of product_map.
data = data.replace({label_col: product_map})

# Encode labels

In [None]:
# Fit the encoder on the label column and convert every label to its integer id.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data[label_col])

In [None]:
save_file(labels_path, labels)
save_file(label_encoder_path, label_encoder)

# Process text column


In [None]:
input_text = list(data[text_col_name])

In [None]:
len(input_text)

277814

# Convert to lowercase

In [None]:
input_text = [i.lower() for i in tqdm(input_text)]

100%|██████████| 277814/277814 [00:00<00:00, 737053.34it/s]


# Remove punctuation except apostrophes

In [None]:
input_text = [re.sub(r"[^\w\d'\s]+", " ", i)
              for i in tqdm(input_text)]

100%|██████████| 277814/277814 [00:13<00:00, 21047.88it/s]


# remove Numbers


In [None]:
# Raw string: "\d" inside a plain literal is an invalid escape sequence that
# newer Python versions warn about.
input_text = [re.sub(r"\d+", "", i) for i in tqdm(input_text)]

100%|██████████| 277814/277814 [00:08<00:00, 31598.33it/s]


# Remove runs of two or more consecutive 'x' characters

In [None]:
# Delete every run of two or more "x" characters. The text is already
# lowercase at this point, so sequences such as "XXXX" (presumably redaction
# masks) arrive here as "xxxx". Note that a double "x" inside an ordinary word
# is removed as well.
input_text = [re.sub(r'[x]{2,}', "", i) for i in tqdm(input_text)]

100%|██████████| 277814/277814 [00:05<00:00, 47203.01it/s]


# Replace multiple spaces with a single space

In [None]:
# Assign back to input_text (the original wrote to a misspelled name, so the
# collapsed text was silently discarded).
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

100%|██████████| 277814/277814 [00:22<00:00, 12217.23it/s]


# Tokenize the text

In [None]:
import nltk
nltk.download('punkt_tab')

# Tokenize every cleaned complaint. The padding/truncation cell that follows
# reads `tokens`, so this step must run on a fresh kernel.
tokens = [word_tokenize(t) for t in tqdm(input_text, mininterval=60)]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Keep the first 20 tokens of each complaint text; left-pad shorter texts with `<pad>`

In [None]:
# Truncate long texts to seq_len tokens and left-pad short ones with <pad>,
# using the configured sequence length instead of a hard-coded 20.
tokens = [i[:seq_len] if len(i) >= seq_len
          else ['<pad>'] * (seq_len - len(i)) + i
          for i in tqdm(tokens)]


100%|██████████| 277814/277814 [00:03<00:00, 73475.70it/s] 


# Convert tokens to integer indices from vocabulary

In [None]:
def token_index(tokens, vocabulary, missing='<unk>'):
    """
    Map every word token to its integer position in the vocabulary.

    :param tokens: List of texts, each given as a list of word tokens
    :param vocabulary: All words in the embeddings; a word's index is its
        position in this sequence (the first occurrence wins for duplicates)
    :param missing: Token for words not present in the vocabulary
    :return: List of lists of integers representing the word tokens
    :raises ValueError: If an out-of-vocabulary token is met and ``missing``
        is itself not part of the vocabulary
    """
    # Build the word -> index table once. A dictionary lookup takes constant
    # time, whereas `token in vocabulary` followed by `vocabulary.index(token)`
    # rescans the whole list for every single token.
    word_to_idx = {}
    for position, word in enumerate(vocabulary):
        word_to_idx.setdefault(word, position)
    missing_idx = word_to_idx.get(missing)

    idx_token = []
    for text in tqdm(tokens):
        idx_text = []
        for token in text:
            idx = word_to_idx.get(token, missing_idx)
            if idx is None:
                raise ValueError(f"{missing!r} is not in vocabulary")
            idx_text.append(idx)
        idx_token.append(idx_text)
    return idx_token


# save the tokens

In [None]:
# Replace the word tokens by their vocabulary indices and persist the result.
# `tokens` is rebound from lists of strings to lists of ints here, so this
# cell should not be re-run on its own output (the ints would not be found in
# the vocabulary and would all map to the <unk> index).
tokens = token_index(tokens, vocabulary)
save_file(tokens_path, tokens)

100%|██████████| 277814/277814 [1:01:17<00:00, 75.54it/s]


# Create attention model

In [None]:
class AttentionModel(nn.Module):
    """
    Attention pooling over a token sequence followed by a linear classifier.

    Every token vector carries one extra leading feature (TextDataset prepends
    a column of ones), so the model works on ``vec_len + 1`` features per
    token.
    """

    def __init__(self, vec_len, seq_len, n_classes):
        """
        :param vec_len: Size of a word embedding (without the extra feature)
        :param seq_len: Nominal number of tokens per text; stored for
            reference only, ``forward`` accepts any sequence length
        :param n_classes: Number of target classes
        """
        super().__init__()
        self.vec_len = vec_len
        self.seq_len = seq_len
        # Attention vector of shape (vec_len + 1, 1): the weight of the extra
        # leading feature starts at zero, the others are drawn from a standard
        # normal scaled by 1 / sqrt(vec_len).
        init_weights = torch.cat([torch.zeros(1, 1),
                                  torch.randn(vec_len, 1) / vec_len ** 0.5])
        # nn.Parameter registers the tensor with the module and enables
        # gradients, so no manual requires_grad flag is needed.
        self.attn_weights = nn.Parameter(init_weights)
        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.linear = nn.Linear(vec_len + 1, n_classes)

    def forward(self, input_data):
        """
        :param input_data: Float tensor of shape (batch, tokens, vec_len + 1)
        :return: Class scores of shape (batch, n_classes)
        """
        # One score per token: (batch, tokens, 1)
        scores = self.activation(torch.matmul(input_data, self.attn_weights))
        # Normalise the scores over the token dimension
        attn = self.softmax(scores)
        # Broadcasting applies each token's weight to all of its features, so
        # the number of tokens does not have to equal self.seq_len.
        attn_output = torch.sum(input_data * attn, dim=1)
        return self.linear(attn_output)

#Create PyTorch dataset

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset that turns token indices into embedded sequences with labels."""

    def __init__(self, tokens, embeddings, labels):
        """
        :param tokens: One list of vocabulary indices per text
        :param embeddings: Word embedding matrix (from glove), one row per
            vocabulary entry
        :param labels: One label per text
        """
        self.tokens = tokens
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """
        :param idx: Position of the text
        :return: Tuple (label tensor, input tensor); the input holds one row
            per token, a leading 1 followed by the token's embedding
        """
        token_ids = self.tokens[idx]
        vectors = torch.tensor(self.embeddings[token_ids, :])
        # Column of ones placed in front of the embedding features
        ones_column = torch.ones(vectors.shape[0], 1)
        features = torch.cat((ones_column, vectors), dim=1)
        label = torch.tensor(self.labels[idx])
        return label, features

#Function to train the model

In [None]:
def train(train_loader, valid_loader, model, criterion, optimizer,
          device, num_epochs, model_path):
    """
    Function to train the model.

    After every epoch the model is evaluated on the validation set and its
    state dict is written to ``model_path`` whenever the mean validation loss
    improves.

    :param train_loader: Data loader for train dataset, yielding
        (labels, data) batches
    :param valid_loader: Data loader for validation dataset, yielding
        (labels, data) batches
    :param model: Model object
    :param criterion: Loss function
    :param optimizer: Optimizer
    :param device: CUDA or CPU
    :param num_epochs: Number of epochs
    :param model_path: Path to save the model
    """
    best_loss = float("inf")
    for i in range(num_epochs):
        print(f"Epoch {i+1} of {num_epochs}")
        valid_loss, train_loss = [], []
        model.train()
        # Train loop
        for batch_labels, batch_data in tqdm(train_loader):
            # Move data to GPU if available
            batch_labels = batch_labels.to(device)
            batch_data = batch_data.to(device)
            # Forward pass
            batch_output = model(batch_data)
            # Flatten to (batch, n_outputs). Unlike torch.squeeze this keeps
            # the batch dimension when a batch holds a single sample.
            batch_output = batch_output.reshape(batch_labels.shape[0], -1)
            # Calculate loss
            loss = criterion(batch_output, batch_labels)
            train_loss.append(loss.item())
            optimizer.zero_grad()
            # Backward pass
            loss.backward()
            # Gradient update step
            optimizer.step()
        model.eval()
        # Validation loop: no gradients are needed, so skip graph building
        with torch.no_grad():
            for batch_labels, batch_data in tqdm(valid_loader):
                # Move data to GPU if available
                batch_labels = batch_labels.to(device)
                batch_data = batch_data.to(device)
                # Forward pass
                batch_output = model(batch_data)
                batch_output = batch_output.reshape(batch_labels.shape[0], -1)
                # Calculate loss
                loss = criterion(batch_output, batch_labels)
                valid_loss.append(loss.item())
        t_loss = np.mean(train_loss)
        v_loss = np.mean(valid_loss)
        print(f"Train Loss: {t_loss}, Validation Loss: {v_loss}")
        if v_loss < best_loss:
            best_loss = v_loss
            # Save model if validation loss improves
            torch.save(model.state_dict(), model_path)
        print(f"Best Validation Loss: {best_loss}")

#Function to test the model

In [None]:
def test(test_loader, model, criterion, device):
    """
    Function to test the model
    :param test_loader: Data loader for test dataset
    :param model: Model object
    :param criterion: Loss function
    :param device: CUDA or CPU
    """
    model.eval()
    test_loss = []
    test_accu = []
    for batch_labels, batch_data in tqdm(test_loader):
        # Move data to device
        batch_labels = batch_labels.to(device)
        batch_data = batch_data.to(device)
        # Forward pass
        batch_output = model(batch_data)
        batch_output = torch.squeeze(batch_output)
        # Calculate loss
        loss = criterion(batch_output, batch_labels)
        test_loss.append(loss.item())
        batch_preds = torch.argmax(batch_output, axis=1)
        # Move predictions to CPU
        if torch.cuda.is_available():
            batch_labels = batch_labels.cpu()
            batch_preds = batch_preds.cpu()
        # Compute accuracy
        test_accu.append(accuracy_score(batch_labels.detach().
                                        numpy(),
                                        batch_preds.detach().
                                        numpy()))
    test_loss = np.mean(test_loss)
    test_accu = np.mean(test_accu)
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accu}")

#Train attention model

In [None]:
# Reload the artifacts that the preprocessing cells wrote with save_file, so
# training can start from the pickled files rather than in-memory state.
tokens = load_file(tokens_path)
labels = load_file(labels_path)
embeddings = load_file(embeddings_path)
label_encoder = load_file(label_encoder_path)
# One output unit per class known to the fitted encoder
num_classes = len(label_encoder.classes_)
vocabulary = load_file(vocabulary_path)

Split data into train, validation and test sets

In [None]:
# 60/20/20 train/validation/test split (0.25 of the remaining 80% is 20%).
# A fixed random_state makes the split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                    test_size=0.2,
                                                    random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.25,
                                                      random_state=42)

Create PyTorch datasets

In [None]:
train_dataset = TextDataset(X_train, embeddings, y_train)
valid_dataset = TextDataset(X_valid, embeddings, y_valid)
test_dataset = TextDataset(X_test, embeddings, y_test)

Create data loaders

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=16,
                                           shuffle=True,
                                           drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=16)

Create model object

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                      else "cpu")
model = AttentionModel(vec_len, seq_len, num_classes)

Move the model to GPU if available


In [None]:
if torch.cuda.is_available():
    model = model.cuda()

Define loss function and optimizer

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Training loop

In [None]:
train(train_loader, valid_loader, model, criterion, optimizer,
      device, num_epochs, model_path)

Epoch 1 of 50


100%|██████████| 10418/10418 [00:25<00:00, 414.90it/s]
100%|██████████| 3473/3473 [00:04<00:00, 841.76it/s]


Train Loss: 1.255738702538856, Validation Loss: 1.1066387673379225
Best Validation Loss: 1.1066387673379225
Epoch 2 of 50


100%|██████████| 10418/10418 [00:24<00:00, 421.23it/s]
100%|██████████| 3473/3473 [00:04<00:00, 824.02it/s]


Train Loss: 1.0755504596385554, Validation Loss: 1.056090299244954
Best Validation Loss: 1.056090299244954
Epoch 3 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.14it/s]
100%|██████████| 3473/3473 [00:04<00:00, 834.15it/s]


Train Loss: 1.0446229579267219, Validation Loss: 1.038802680006777
Best Validation Loss: 1.038802680006777
Epoch 4 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.43it/s]
100%|██████████| 3473/3473 [00:04<00:00, 824.04it/s]


Train Loss: 1.0319891958444722, Validation Loss: 1.0285619565776745
Best Validation Loss: 1.0285619565776745
Epoch 5 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.73it/s]
100%|██████████| 3473/3473 [00:04<00:00, 716.41it/s]


Train Loss: 1.0246236741680128, Validation Loss: 1.023001933102988
Best Validation Loss: 1.023001933102988
Epoch 6 of 50


100%|██████████| 10418/10418 [00:24<00:00, 432.65it/s]
100%|██████████| 3473/3473 [00:05<00:00, 673.89it/s]


Train Loss: 1.0191699383520398, Validation Loss: 1.0177368418716608
Best Validation Loss: 1.0177368418716608
Epoch 7 of 50


100%|██████████| 10418/10418 [00:23<00:00, 434.77it/s]
100%|██████████| 3473/3473 [00:04<00:00, 712.29it/s]


Train Loss: 1.0095845026400867, Validation Loss: 1.005208261257422
Best Validation Loss: 1.005208261257422
Epoch 8 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.90it/s]
100%|██████████| 3473/3473 [00:04<00:00, 818.48it/s]


Train Loss: 1.0016892412489677, Validation Loss: 0.9995055579071666
Best Validation Loss: 0.9995055579071666
Epoch 9 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.66it/s]
100%|██████████| 3473/3473 [00:04<00:00, 830.23it/s]


Train Loss: 0.997764420590295, Validation Loss: 0.9967921788037561
Best Validation Loss: 0.9967921788037561
Epoch 10 of 50


100%|██████████| 10418/10418 [00:24<00:00, 418.29it/s]
100%|██████████| 3473/3473 [00:04<00:00, 832.91it/s]


Train Loss: 0.9950216556378073, Validation Loss: 0.9944318123279784
Best Validation Loss: 0.9944318123279784
Epoch 11 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.87it/s]
100%|██████████| 3473/3473 [00:04<00:00, 818.06it/s]


Train Loss: 0.993113871272614, Validation Loss: 0.9930578821284813
Best Validation Loss: 0.9930578821284813
Epoch 12 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.06it/s]
100%|██████████| 3473/3473 [00:04<00:00, 784.71it/s]


Train Loss: 0.991750494912939, Validation Loss: 0.9922084847822884
Best Validation Loss: 0.9922084847822884
Epoch 13 of 50


100%|██████████| 10418/10418 [00:24<00:00, 428.24it/s]
100%|██████████| 3473/3473 [00:05<00:00, 692.13it/s]


Train Loss: 0.9905621492318032, Validation Loss: 0.9909086845602798
Best Validation Loss: 0.9909086845602798
Epoch 14 of 50


100%|██████████| 10418/10418 [00:23<00:00, 438.33it/s]
100%|██████████| 3473/3473 [00:05<00:00, 670.49it/s]


Train Loss: 0.9896454509683089, Validation Loss: 0.9898014178098947
Best Validation Loss: 0.9898014178098947
Epoch 15 of 50


100%|██████████| 10418/10418 [00:24<00:00, 432.93it/s]
100%|██████████| 3473/3473 [00:04<00:00, 729.79it/s]


Train Loss: 0.9888484041575027, Validation Loss: 0.9888712290192831
Best Validation Loss: 0.9888712290192831
Epoch 16 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.59it/s]
100%|██████████| 3473/3473 [00:04<00:00, 830.94it/s]


Train Loss: 0.988181118942607, Validation Loss: 0.988128582544983
Best Validation Loss: 0.988128582544983
Epoch 17 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.44it/s]
100%|██████████| 3473/3473 [00:04<00:00, 838.03it/s]


Train Loss: 0.9875622561223266, Validation Loss: 0.9882779765928852
Best Validation Loss: 0.988128582544983
Epoch 18 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.16it/s]
100%|██████████| 3473/3473 [00:04<00:00, 830.73it/s]


Train Loss: 0.9869490313504794, Validation Loss: 0.987378104143928
Best Validation Loss: 0.987378104143928
Epoch 19 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.87it/s]
100%|██████████| 3473/3473 [00:04<00:00, 828.04it/s]


Train Loss: 0.986495810304317, Validation Loss: 0.9879264991838789
Best Validation Loss: 0.987378104143928
Epoch 20 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.65it/s]
100%|██████████| 3473/3473 [00:04<00:00, 781.50it/s]


Train Loss: 0.9860955294093625, Validation Loss: 0.9874426510915474
Best Validation Loss: 0.987378104143928
Epoch 21 of 50


100%|██████████| 10418/10418 [00:24<00:00, 429.84it/s]
100%|██████████| 3473/3473 [00:05<00:00, 687.14it/s]


Train Loss: 0.985507749786279, Validation Loss: 0.9865405066896165
Best Validation Loss: 0.9865405066896165
Epoch 22 of 50


100%|██████████| 10418/10418 [00:23<00:00, 441.15it/s]
100%|██████████| 3473/3473 [00:05<00:00, 661.62it/s]


Train Loss: 0.9853471016976535, Validation Loss: 0.9854999168904961
Best Validation Loss: 0.9854999168904961
Epoch 23 of 50


100%|██████████| 10418/10418 [00:24<00:00, 430.94it/s]
100%|██████████| 3473/3473 [00:04<00:00, 762.22it/s]


Train Loss: 0.9848421458794989, Validation Loss: 0.986618501821513
Best Validation Loss: 0.9854999168904961
Epoch 24 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.38it/s]
100%|██████████| 3473/3473 [00:04<00:00, 834.89it/s]


Train Loss: 0.9845013365773121, Validation Loss: 0.9849014830819269
Best Validation Loss: 0.9849014830819269
Epoch 25 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.81it/s]
100%|██████████| 3473/3473 [00:04<00:00, 829.41it/s]


Train Loss: 0.9843180471525972, Validation Loss: 0.9852400704845036
Best Validation Loss: 0.9849014830819269
Epoch 26 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.25it/s]
100%|██████████| 3473/3473 [00:04<00:00, 837.11it/s]


Train Loss: 0.9838481827494759, Validation Loss: 0.9859165395504111
Best Validation Loss: 0.9849014830819269
Epoch 27 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.41it/s]
100%|██████████| 3473/3473 [00:04<00:00, 822.09it/s]


Train Loss: 0.9835943459197206, Validation Loss: 0.9849867461447135
Best Validation Loss: 0.9849014830819269
Epoch 28 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.47it/s]
100%|██████████| 3473/3473 [00:04<00:00, 776.33it/s]


Train Loss: 0.9834078643839312, Validation Loss: 0.9845672782511592
Best Validation Loss: 0.9845672782511592
Epoch 29 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.98it/s]
100%|██████████| 3473/3473 [00:05<00:00, 677.14it/s]


Train Loss: 0.9831553827331148, Validation Loss: 0.9836875126302123
Best Validation Loss: 0.9836875126302123
Epoch 30 of 50


100%|██████████| 10418/10418 [00:23<00:00, 435.57it/s]
100%|██████████| 3473/3473 [00:05<00:00, 685.02it/s]


Train Loss: 0.9828436924525451, Validation Loss: 0.9833767315495643
Best Validation Loss: 0.9833767315495643
Epoch 31 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.36it/s]
100%|██████████| 3473/3473 [00:04<00:00, 804.91it/s]


Train Loss: 0.9827612449984835, Validation Loss: 0.9834930572134255
Best Validation Loss: 0.9833767315495643
Epoch 32 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.78it/s]
100%|██████████| 3473/3473 [00:04<00:00, 825.45it/s]


Train Loss: 0.98231158369069, Validation Loss: 0.9837551246175428
Best Validation Loss: 0.9833767315495643
Epoch 33 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.95it/s]
100%|██████████| 3473/3473 [00:04<00:00, 814.75it/s]


Train Loss: 0.9819662333046848, Validation Loss: 0.9834405564022723
Best Validation Loss: 0.9833767315495643
Epoch 34 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.76it/s]
100%|██████████| 3473/3473 [00:04<00:00, 832.74it/s]


Train Loss: 0.9818470791657361, Validation Loss: 0.9828674562925431
Best Validation Loss: 0.9828674562925431
Epoch 35 of 50


100%|██████████| 10418/10418 [00:24<00:00, 427.02it/s]
100%|██████████| 3473/3473 [00:04<00:00, 772.05it/s]


Train Loss: 0.9816579968938673, Validation Loss: 0.9825234037721806
Best Validation Loss: 0.9825234037721806
Epoch 36 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.48it/s]
100%|██████████| 3473/3473 [00:05<00:00, 688.10it/s]


Train Loss: 0.9814110343897152, Validation Loss: 0.9827808590244911
Best Validation Loss: 0.9825234037721806
Epoch 37 of 50


100%|██████████| 10418/10418 [00:23<00:00, 435.12it/s]
100%|██████████| 3473/3473 [00:05<00:00, 665.34it/s]


Train Loss: 0.981285727414272, Validation Loss: 0.981840180228294
Best Validation Loss: 0.981840180228294
Epoch 38 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.31it/s]
100%|██████████| 3473/3473 [00:04<00:00, 770.91it/s]


Train Loss: 0.9809175679432635, Validation Loss: 0.9828504261145357
Best Validation Loss: 0.981840180228294
Epoch 39 of 50


100%|██████████| 10418/10418 [00:24<00:00, 425.41it/s]
100%|██████████| 3473/3473 [00:04<00:00, 827.74it/s]


Train Loss: 0.9808033344586697, Validation Loss: 0.9825022798809235
Best Validation Loss: 0.981840180228294
Epoch 40 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.45it/s]
100%|██████████| 3473/3473 [00:04<00:00, 827.21it/s]


Train Loss: 0.9806233123173684, Validation Loss: 0.9822048930348837
Best Validation Loss: 0.981840180228294
Epoch 41 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.41it/s]
100%|██████████| 3473/3473 [00:04<00:00, 828.01it/s]


Train Loss: 0.9804581178941889, Validation Loss: 0.9818949001325574
Best Validation Loss: 0.981840180228294
Epoch 42 of 50


100%|██████████| 10418/10418 [00:24<00:00, 422.20it/s]
100%|██████████| 3473/3473 [00:04<00:00, 769.11it/s]


Train Loss: 0.9802612818221164, Validation Loss: 0.9808651503210895
Best Validation Loss: 0.9808651503210895
Epoch 43 of 50


100%|██████████| 10418/10418 [00:24<00:00, 429.51it/s]
100%|██████████| 3473/3473 [00:05<00:00, 678.41it/s]


Train Loss: 0.9799339538928208, Validation Loss: 0.9816217129267569
Best Validation Loss: 0.9808651503210895
Epoch 44 of 50


100%|██████████| 10418/10418 [00:23<00:00, 440.67it/s]
100%|██████████| 3473/3473 [00:05<00:00, 688.09it/s]


Train Loss: 0.9798378546350884, Validation Loss: 0.9818789867736268
Best Validation Loss: 0.9808651503210895
Epoch 45 of 50


100%|██████████| 10418/10418 [00:24<00:00, 429.41it/s]
100%|██████████| 3473/3473 [00:04<00:00, 796.08it/s]


Train Loss: 0.9796481972853861, Validation Loss: 0.9818676761186607
Best Validation Loss: 0.9808651503210895
Epoch 46 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.46it/s]
100%|██████████| 3473/3473 [00:04<00:00, 833.92it/s]


Train Loss: 0.9794733141452205, Validation Loss: 0.9801921488450647
Best Validation Loss: 0.9801921488450647
Epoch 47 of 50


100%|██████████| 10418/10418 [00:24<00:00, 426.98it/s]
100%|██████████| 3473/3473 [00:04<00:00, 826.37it/s]


Train Loss: 0.9792804466208265, Validation Loss: 0.9801879286628754
Best Validation Loss: 0.9801879286628754
Epoch 48 of 50


100%|██████████| 10418/10418 [00:24<00:00, 423.46it/s]
100%|██████████| 3473/3473 [00:04<00:00, 817.50it/s]


Train Loss: 0.9791590079999881, Validation Loss: 0.9805058835306533
Best Validation Loss: 0.9801879286628754
Epoch 49 of 50


100%|██████████| 10418/10418 [00:24<00:00, 424.94it/s]
100%|██████████| 3473/3473 [00:04<00:00, 790.36it/s]


Train Loss: 0.9791298950429634, Validation Loss: 0.9797732829790173
Best Validation Loss: 0.9797732829790173
Epoch 50 of 50


100%|██████████| 10418/10418 [00:24<00:00, 432.44it/s]
100%|██████████| 3473/3473 [00:05<00:00, 686.16it/s]


Train Loss: 0.9788074452300565, Validation Loss: 0.9800696365262808
Best Validation Loss: 0.9797732829790173


Test the model

In [None]:
test(test_loader, model, criterion, device)

100%|██████████| 3473/3473 [00:07<00:00, 495.37it/s]

Test Loss: 0.9828092007928096, Test Accuracy: 0.6694434992016335





#Predict on new text

In [None]:
input_text = '''I am a victim of Identity Theft & currently have an Experian account that
I can view my Experian Credit Report and getting notified when there is activity on
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9
hours on the phone with Experian. Every time I call I get transferred repeatedly and
then my last transfer and automated message states to press 1 and leave a message and
someone would call me. Every time I press 1 I get an automatic message stating than you
before I even leave a message and get disconnected. I call Experian again, explain what
is happening and the process begins again with the same end result. I was trying to have
this issue attended and resolved informally but I give up after 9 hours. There are hard
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall
and I respectfully request that Experian remove the hard hit inquiries immediately just
like they've done in the past when I was able to speak to a live Experian representative
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX
XX/XX/XXXX'''

Process input text

In [None]:
# Apply the same cleaning steps that were used on the training complaints:
# lowercase, punctuation (except apostrophes) to spaces, drop digits, drop
# runs of "x", collapse repeated spaces, then tokenize.
input_text = input_text.lower()
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
# Raw string avoids the invalid "\d" escape of a plain literal
input_text = re.sub(r"\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)
tokens = word_tokenize(input_text)

In [None]:
# Left-pad short texts with <pad> and cut long ones to the configured
# sequence length, so only seq_len tokens reach the index lookup.
tokens = (['<pad>'] * (seq_len - len(tokens)) + tokens)[:seq_len]

In [None]:
# Reuse the training-time helper so inference maps tokens to indices with
# exactly the same rule (unknown words fall back to '<unk>').
idx_token = token_index([tokens], vocabulary)[0]

In [None]:
token_emb = embeddings[idx_token,:]
token_emb = token_emb[:seq_len, :]
inp = torch.from_numpy(token_emb)

In [None]:
inp = torch.cat((torch.ones(inp.shape[0],1), inp), dim=1)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                      else "cpu")

In [None]:
inp = inp.to(device)
inp = torch.unsqueeze(inp, 0)

In [None]:
label_encoder = load_file(label_encoder_path)
num_classes = len(label_encoder.classes_)

In [None]:
# Create model object
model = AttentionModel(vec_len, seq_len, num_classes)

# Load trained weights. map_location lets a checkpoint written on a GPU be
# read on a CPU-only machine; weights_only restricts unpickling to tensors
# and plain containers.
model.load_state_dict(torch.load(model_path, map_location=device,
                                 weights_only=True))

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()

# Inference only: switch to eval mode and skip gradient tracking
model.eval()
with torch.no_grad():
    # Forward pass
    out = torch.squeeze(model(inp))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out).item()]
print(f"Predicted  Class: {prediction}")

Predicted  Class: credit_report


  model.load_state_dict(torch.load(model_path))


#save model

In [None]:
import torch

def save_model(model, model_path):
    """
    Write the model's state dict to a file and report the destination.

    Args:
        model: The model whose state dict is saved.
        model_path: The path of the file to write.
    """
    state = model.state_dict()
    torch.save(state, model_path)
    print(f"Model saved to {model_path}")

def load_model(model, model_path):
    """
    Loads a saved state dict from a file into the given model.

    Args:
        model: The model that receives the weights (modified in place).
        model_path: The path to load the state dict from.
    """
    # Read the checkpoint onto the CPU so a file written on a GPU also opens
    # on a CPU-only machine; load_state_dict then copies the values into the
    # model's own parameters. weights_only=True restricts unpickling to
    # tensors and plain containers.
    state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    print(f"Model loaded from {model_path}")

In [None]:
model_path = "/content/drive/MyDrive/TextClassificationAttention/models"
# Save the trained network held by `model`. A freshly constructed
# AttentionModel would only contain random, untrained weights, and every
# later load from this path would silently restore them.
model_instance = model
save_model(model_instance, model_path)  # Save the model instance

Model saved to /content/drive/MyDrive/TextClassificationAttention/models


In [None]:
model = AttentionModel(vec_len, seq_len, num_classes)  # Create the model instance
load_model(model, model_path)  # Load the saved weights

Model loaded from /content/drive/MyDrive/TextClassificationAttention/models


  model.load_state_dict(torch.load(model_path))
