In [1]:
#[embed layer : 64 , 128]
#linear classification with embedding
import pandas
# Load the cleaned corpus, discard the stray index column written by a
# previous to_csv call, and drop rows that contain missing values.
Data = (
    pandas.read_csv('Cleaned_Data.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)
Data

Unnamed: 0,file content,class number
0,As summer the usual are being made Me was thin...,0
1,Southern 41493 Please send me any to this ride...,0
2,1 Trying to figure out a way to put a halogen ...,0
3,In article Frank Ball wrote The 400 model is e...,0
4,Eric Nelson My 83 Nighthawk two related with t...,0
...,...,...
18743,12GB hard Drive Brand NEW with full factory wa...,19
18744,my 14 monitor id dead due to the failure if yo...,19
18745,selling 388 worth of for 100 or Ill split it i...,19
18746,Computer card good for doing graphics on your ...,19


In [2]:
# Print the first ten documents next to their class numbers.
preview = Data.iloc[:10]
for document, class_number in zip(preview['file content'], preview['class number']):
    print(document, class_number)
    

As summer the usual are being made Me was thinking of going for some in the local state For that I was to get a The next question is how shall I carry the thing on the bike given the metal frame and all I have a big 12 high and was that I would be able to bungee cord the to the Any one have any on such experimentation Taking the idea further what would happen if the was fully loaded with a full load 40lbs Is the load distribution going to be very affected How will the bike perform with such a load clinging to the back rest If I really secure it with no shifting do I still increase my of Collective wisdom I really cant afford leather pants Boots and jeans are all I can make do with What you think of the knee which use the one and like Is that a Bad Idea Are there any 81 CB650 DoD 1224 I would give my right arm to be ambidextrous 0
Southern 41493 Please send me any to this ride list Remember only street that are open to all are posted Please phone for further Also send me your address if

In [3]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Count every token in the corpus and build the vocabulary from those counts
# (min_freq=1 keeps every token that occurs at least once).
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for document in Data['file content']:
    counter.update(tokenizer(document))
vocab = Vocab(counter, min_freq=1)

In [4]:
# Spot-check: look up the vocabulary index of a few sample tokens.
[vocab[token] for token in ['here', 'is', 'an', 'example']]


[118, 9, 30, 288]

In [5]:
# Number of entries in the vocabulary.
len(vocab)

64964

In [6]:
# Materialise the frame as a list of (text, label) records so it can be
# indexed like a map-style dataset.
iter_data = list(Data.to_records(index=False))

In [7]:
# Show only the first few records; displaying the whole list would write the
# entire corpus into the cell output.
iter_data[:3]

[('As summer the usual are being made Me was thinking of going for some in the local state For that I was to get a The next question is how shall I carry the thing on the bike given the metal frame and all I have a big 12 high and was that I would be able to bungee cord the to the Any one have any on such experimentation Taking the idea further what would happen if the was fully loaded with a full load 40lbs Is the load distribution going to be very affected How will the bike perform with such a load clinging to the back rest If I really secure it with no shifting do I still increase my of Collective wisdom I really cant afford leather pants Boots and jeans are all I can make do with What you think of the knee which use the one and like Is that a Bad Idea Are there any 81 CB650 DoD 1224 I would give my right arm to be ambidextrous', 0),
 ('Southern 41493 Please send me any to this ride list Remember only street that are open to all are posted Please phone for further Also send me your 

In [8]:
# Number of distinct class labels present in the dataset.
num_class = len({label for _, label in iter_data})


In [9]:
# Display the number of distinct class labels.
num_class

20

In [10]:
def text_pipeline(x):
    """Tokenize ``x`` with ``tokenizer`` and return the ``vocab`` index of each token."""
    return [vocab[tok] for tok in tokenizer(x)]

In [11]:
# Sanity-check the pipeline on a sample sentence.
text_pipeline('here is the an example')


[118, 9, 2, 30, 288]

In [12]:
import torch
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate ``(text, label)`` samples into flat tensors for an offsets-based model.

    Each text is converted to token ids with ``text_pipeline``; all ids are
    concatenated into one 1-D tensor, and ``offsets[i]`` is the position in
    that tensor where sample ``i`` starts.

    Args:
        batch: iterable of ``(text, label)`` pairs.

    Returns:
        Tuple ``(labels, tokens, offsets)`` of int64 tensors moved to ``device``.
    """
    labels, token_chunks, lengths = [], [], []
    for sample_text, sample_label in batch:
        labels.append(sample_label)
        ids = torch.tensor(text_pipeline(sample_text), dtype=torch.int64)
        token_chunks.append(ids)
        lengths.append(ids.size(0))

    labels = torch.tensor(labels, dtype=torch.int64)
    # Start of each sample = running total of the lengths that precede it
    # (the last length is not needed).
    starts = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    tokens = torch.cat(token_chunks)
    return labels.to(device), tokens.to(device), starts.to(device)

# Loader over the full dataset in stored order (shuffle=False), 8 samples per
# batch, collated by collate_batch.
dataloader = DataLoader(iter_data, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [13]:
# Display the DataLoader object itself (no batches are drawn here).
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f12eaac0cd0>

In [14]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Text classifier: an ``nn.EmbeddingBag`` pooling layer followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill both weight matrices uniformly in [-0.5, 0.5] and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class logits for the bags of ``text`` delimited by ``offsets``."""
        pooled = self.embedding(text, offsets)
        logits = self.fc(pooled)
        return logits

In [15]:
# Size the model from the data: one embedding row per vocabulary entry and one
# output logit per distinct label.
vocab_size = len(vocab)
emsize = 128
num_class = len({label for _, label in iter_data})
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [16]:
import time

def train(dataloader, log_interval=500, max_grad_norm=0.1):
    """Run one optimisation pass over ``dataloader``.

    Uses names resolved at notebook scope when called: ``model``,
    ``optimizer``, ``criterion`` and, for the progress line only, ``epoch``.

    Args:
        dataloader: iterable of ``(label, text, offsets)`` batches that also
            supports ``len()``.
        log_interval: print the running accuracy every this many batches
            (never on batch 0); the running counters restart after each print.
        max_grad_norm: upper bound passed to ``clip_grad_norm_`` before every
            optimizer step.

    Returns:
        None. The model parameters are updated in place.
    """
    model.train()
    total_acc, total_count = 0, 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Rescale the gradients so their combined norm does not exceed the cap.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Switches the notebook-level ``model`` to eval mode and runs without
    gradient tracking. No loss is computed, so ``criterion`` does not need to
    exist when this is called.

    Args:
        dataloader: iterable of ``(label, text, offsets)`` batches.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when the loader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # An empty loader would otherwise raise ZeroDivisionError.
    if total_count == 0:
        return 0.0
    return total_acc / total_count

In [17]:
# Show only the first few records; displaying the whole list would write the
# entire corpus into the cell output.
iter_data[:3]

[('As summer the usual are being made Me was thinking of going for some in the local state For that I was to get a The next question is how shall I carry the thing on the bike given the metal frame and all I have a big 12 high and was that I would be able to bungee cord the to the Any one have any on such experimentation Taking the idea further what would happen if the was fully loaded with a full load 40lbs Is the load distribution going to be very affected How will the bike perform with such a load clinging to the back rest If I really secure it with no shifting do I still increase my of Collective wisdom I really cant afford leather pants Boots and jeans are all I can make do with What you think of the knee which use the one and like Is that a Bad Idea Are there any 81 CB650 DoD 1224 I would give my right arm to be ambidextrous', 0),
 ('Southern 41493 Please send me any to this ride list Remember only street that are open to all are posted Please phone for further Also send me your 

In [None]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 25  # number of passes over the training split
LR = 2  # initial SGD learning rate
BATCH_SIZE = 32  # batch size for training and evaluation
SPLIT_SEED = 0  # seed for the train/valid/test partition

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# With step_size=1 every scheduler.step() multiplies the learning rate by gamma.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
total_accu = None  # best validation accuracy seen so far

# `model` is created in a different cell, so re-running this cell keeps
# training the same weights. A seeded generator makes the partition identical
# on every run; an unseeded split could move samples the model has already
# been trained on into the validation/test sets.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# 80% train+valid / 20% test.
num_train = int(len(iter_data) * 0.80)
train_dataset, test_dataset = random_split(
    iter_data, [num_train, len(iter_data) - num_train], generator=split_generator)

# Of the training portion: 95% train / 5% validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train], generator=split_generator)

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
# Accuracy does not depend on batch order, so the evaluation loaders are not shuffled.
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Decay the learning rate when validation accuracy drops below the best
    # value so far; otherwise record this epoch's accuracy as the new best.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time: 10.37s | valid accuracy    0.135 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  9.74s | valid accuracy    0.260 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 11.81s | valid accuracy    0.304 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 11.16s | valid accuracy    0.338 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 11.04s | valid accuracy    0.372 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time: 11.26s |

In [24]:
# Final check on the test split: evaluate() returns the fraction of correct
# predictions, printed with three decimals.
# NOTE(review): `model` keeps its weights between cell runs, and re-running the
# training cell continues training that same object — confirm this figure
# after a clean Restart & Run All.
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.861
