In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('sms.tsv',sep='\t',names = ['label','sms'])
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5575, 2)


In [3]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,spam,"%^^×？×^×&#****,>,;//×&>>*(*^%=÷#~^&,****)"
4,ham,U dun say so early hor... U c already then say...


In [4]:
classes = sorted(set(df['label']))
class_to_idx = {}

for i, c in enumerate(classes):
    class_to_idx.update({c:i})

nclass = len(classes)

print('# of classes: %d' %nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


In [5]:
# new_df = pd.DataFrame({'label':df['label'],
#                       'sms':df['sms'].str.slice(
#                       start=0,stop=max_length)})

In [6]:
new_df = pd.DataFrame(df.drop_duplicates())

In [7]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,spam,URGENT! Last weekend's draw shows that you hav...
1,spam,Wanna have a laugh? Try CHIT-CHAT on your mobi...
2,ham,"No I'm good for the movie, is it ok if I leave..."
3,ham,Fyi I'm gonna call you sporadically starting a...
4,ham,Ü say until like dat i dun buy ericsson oso ca...


In [8]:
train_ratio = 0.9

s,e = 0,int(df_shuffled.shape[0]*train_ratio)
df_train = pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})
print('index for train: %d~%d' %(s,e))

s,e = e, e+int(df_shuffled.shape[0]*(1.0 - train_ratio))
print('inedx for test: %d~%d' %(s,e))
df_test = pd.DataFrame({'label':df_shuffled['label'][s:e],
                       'sms':df_shuffled['sms'][s:e]})

index for train: 0~4654
inedx for test: 4654~5171


In [9]:
print(df_train.shape)
print(df_test.shape)

(4654, 2)
(517, 2)


In [10]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',
               header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',
               header=False, index=False, sep='\t')

In [15]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

In [16]:
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

learning_rate = 0.001

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [34]:
from data_loader import DataLoader

In [35]:
loaders = DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.train.tsv',
    batch_size=batch_size,
    valid_ratio=.2,
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

In [37]:
test_loaders = DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.test.tsv',
    batch_size=batch_size,
    valid_ratio=.01,
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

In [38]:
print('|train| = ', len(loaders.train_loader.dataset),
     '|valid| =', len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print('|vocab| =',vocab_size, '|classes| =',num_classes)

|train| =  3723 |valid| = 931
|vocab| = 1543 |classes| = 2


In [39]:
n = 3
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i>n:
        break
    print('[%d]' %i)
    print('한 번에 로드되는 데이터 크기:',len(labels))
    
    for j in range(n):
        label = labels[j].numpy()
        text = texts[j].numpy()
        print('label: ', label)
        print('text: ',text.shape)

[0]
한 번에 로드되는 데이터 크기: 128
label:  0
text:  (22,)
label:  1
text:  (22,)
label:  0
text:  (22,)
[1]
한 번에 로드되는 데이터 크기: 128
label:  0
text:  (25,)
label:  0
text:  (25,)
label:  0
text:  (25,)
[2]
한 번에 로드되는 데이터 크기: 128
label:  0
text:  (17,)
label:  1
text:  (17,)
label:  0
text:  (17,)
[3]
한 번에 로드되는 데이터 크기: 128
label:  0
text:  (30,)
label:  1
text:  (30,)
label:  1
text:  (30,)


In [43]:
class RNN(nn.Module):
    def __init__(self,
                input_size,
                word_vec_size,
                hidden_size,
                n_classes,
                num_layers=4,
                dropout_p=0.3):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        self.emb = nn.Embedding(input_size, word_vec_size)
        
        self.lstm = nn.LSTM(input_size=word_vec_size,
                           hidden_size = hidden_size,
                           num_layers = num_layers,
                           dropout=dropout_p,
                           batch_first=True,
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.activation = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        x = self.emb(x)
        
        x, _ = self.lstm(x)
        
        out = self.activation(self.fc(x[:,-1]))
        
        return out
    

In [44]:
model = RNN(input_size = vocab_size,
           word_vec_size=word_vec_size,
           hidden_size=hidden_size,
           n_classes=num_classes,
           num_layers=num_layers,
           dropout_p=dropout_p)

In [45]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0
    
    model.eval()
    for i, data in enumerate(dloader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        output = model(texts)
        _, output_index = torch.max(output, 1)
        total+=labels.size(0)
        correct += (output_index==labels).sum().float()
        
    model.train()
    return (100*correct/total).numpy()

In [46]:
print('Accuracy of Test Data: %.2f' %ComputeAccr(loaders.valid_loader,model))

Accuracy of Test Data: 13.00


In [47]:
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [None]:
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, [imgs, labels] in enumerate(loaders.train_loader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        print('[%d]' %i)
        
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1)%10==0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f},Accr:{:.2f}'
                 .format(epoch+1,num_epochs, i+1,total_step,
                        loss.item(),
                        ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], Step [10/30], Loss: 0.0613,Accr:87.00
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], Step [20/30], Loss: 0.0072,Accr:86.04
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]


In [None]:
print('accuracy of Valid Data: %.2f' %ComputeAccr(loaders.valid_loader, model))

In [None]:
netname='./nets/rnn_weight.pkl'
torch.save(model,netname,)