In [138]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghckd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [139]:
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [144]:
train[train["keyword"] == np.nan]

Unnamed: 0,id,keyword,location,text,target


### Preprocessing

In [109]:
def preprocess(data):
    def text_cleaner(text:str):
        if text is not np.nan:
            text = text.lower()
            text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
            stop = stopwords.words("english")
            text = " ".join([word for word in text.split() if word not in (stop)])
        
        return text

    def keyword_splitter(x:str):
        if "%20" in x:
            x = tuple(x.split("%20"))
        else:
            x = (x)
        
        return x

    for col in ["location", "text"]:
        data[col] = data[col].apply(text_cleaner)    
        
    # data["keyword"] = data["keyword"].fillna("")
    # data["keyword_split"] = data["keyword"].apply(keyword_splitter)

    # data["location"] = data["location"].fillna("")
    data["text"] = data["text"].fillna(np.nan)
    
    data = data.drop(columns=["id", "keyword", "location"])
    
    return data

In [110]:
train = preprocess(train)
test = preprocess(test)

In [111]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [112]:
tokenizer = get_tokenizer('basic_english')

def yield_tokens(texts):
    for text in texts:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train["text"].tolist()), 
                                  specials=['<UNK>'],                    # 스페셜 토큰
                                  min_freq=2,                            # 최소 빈도 토큰
                                  max_tokens=5000
                                  )
vocab.set_default_index(vocab['<UNK>'])

### train, valid split

In [113]:
from sklearn.model_selection import train_test_split

train_set, valid_set = train_test_split(
    train,
    test_size=0.3,
    random_state=0
)

train_set = train_set.reset_index(drop=True)
valid_set = valid_set.reset_index(drop=True)

In [114]:
print('훈련 샘플의 개수 : {}'.format(len(train_set)))
print('검증 샘플의 개수 : {}'.format(len(valid_set)))
print('테스트 샘플의 개수 : {}'.format(len(test)))

훈련 샘플의 개수 : 5329
검증 샘플의 개수 : 2284
테스트 샘플의 개수 : 3263


In [115]:
print("훈련 샘플의 y의 비율 :", train_set["target"].sum()/len(train_set))
print("검증 샘플의 y의 비율 :", valid_set["target"].sum()/len(valid_set))

훈련 샘플의 y의 비율 : 0.4362919872396322
검증 샘플의 y의 비율 : 0.4141856392294221


### Custom Dataset

In [116]:
from torch.utils.data import Dataset, DataLoader

In [117]:
class CustomDataset(Dataset):
    def __init__(self, vocab, tokenizer, text, labels=None):
        super().__init__()
        self.text = text
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = self.text[index]
        label = self.labels[index]
        return self.vocab(self.tokenizer(text)), label

In [118]:
train_set = CustomDataset(vocab, tokenizer, train_set["text"], train_set["target"])
valid_set = CustomDataset(vocab, tokenizer, valid_set["text"], valid_set["target"])
test_set = CustomDataset(vocab, tokenizer, test)

In [119]:
import torch
from torch import nn

from torch.nn.utils.rnn import pad_sequence

In [120]:
# Set seed.
seed = 1234
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [121]:
def collate_batch(batch, max_sequence_length):
    label_list, text_list = [], []
    
    for text, label in batch:
        processed_text = torch.tensor(text[:max_sequence_length], dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(label)
    
    label_list = torch.tensor(label_list, dtype=torch.int64)
    
    text_list = pad_sequence([torch.tensor(text) for text in text_list], 
                             batch_first=True, 
                             )
    
    return text_list.to(device), label_list.to(device)

In [122]:
BATCH_SIZE = 64
MAX_SEQUENCE_LENGTH = 1000

train_loader = DataLoader(train_set, 
                          batch_size=BATCH_SIZE, 
                          shuffle=True, 
                          collate_fn=lambda x: collate_batch(x, MAX_SEQUENCE_LENGTH))

valid_loader = DataLoader(valid_set, 
                          batch_size=BATCH_SIZE, 
                          shuffle=False, 
                          collate_fn=lambda x: collate_batch(x, MAX_SEQUENCE_LENGTH))
test_loader = DataLoader(test_set, 
                          batch_size=BATCH_SIZE, 
                          collate_fn=lambda x: collate_batch(x, MAX_SEQUENCE_LENGTH))


AttributeError: 'function' object has no attribute '__dataframe__'

### Vocab

In [123]:
# from collections import Counter
# def build_vocab(data, max_vocab_size):
#     counter = Counter()
#     for text in data:
#         counter.update(text.split())
#     vocab = {token: index + 2 for index, (token, count) in enumerate(counter.most_common(max_vocab_size))}
#     vocab["<unk>"] = 0
#     vocab["<pad>"] = 1
    
#     return vocab

# vocab = build_vocab(train["text"], 5000)

In [124]:
# def tokenize(text, vocab):
#     return [vocab.get(token, vocab["<unk>"]) for token in text.split()]

# def collate_fn(samples):
#     if isinstance(samples[0], tuple):
#         texts, labels = zip(*samples)
#         tokenized_texts = [tokenize(text, vocab) for text in texts]
#         padded_texts = pad_sequence(
#             [torch.tensor(text) for text in tokenized_texts], 
#             batch_first=True
#             )
#         return padded_texts, torch.tensor(labels)
#     else:
#         tokenized_texts = [tokenize(text, vocab) for text in samples]
#         padded_texts = pad_sequence(
#             [torch.tensor(text) for text in tokenized_texts], 
#             batch_first=True
#             )
#         return padded_texts

In [125]:
# batch_size = 32

# train_loader = DataLoader(
#     , 
#     batch_size=batch_size, 
#     shuffle=True,
#     collate_fn=collate_fn
#     )
# valid_loader = DataLoader(
#     val_set, 
#     batch_size=batch_size, 
#     shuffle=True,
#     collate_fn=collate_fn
#     )
# test_loader = DataLoader(
#     test_set.data["text"], 
#     batch_size=batch_size,
#     collate_fn=collate_fn
#     )

### Model

In [126]:
import torch.optim as optim

In [127]:
def binary_accuracy(preds, y):
    # preds = torch.round(torch.sigmoid(preds))
    correct = (preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [128]:
class TextLSTM(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embed_dim, 
        hidden_size,
        output_size, 
        dropout):
        super(TextLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, bidirectional=False)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 64), 
            nn.ReLU(), 
            nn.Dropout(dropout), 
            nn.Linear(64, output_size), 
            nn.Sigmoid()      
        )
        self.hidden_size = hidden_size
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, _ = self.lstm(embedded)
        hidden = output[:, -1, :]
        return self.fc(hidden)

In [129]:
vocab_size = len(vocab)
embed_dim = 32
hidden_size = 64
output_size = 1
dropout = 0.5

model = TextLSTM(
    vocab_size=vocab_size,
    embed_dim=embed_dim, 
    hidden_size=hidden_size, 
    output_size=output_size, 
    dropout=dropout
    )

criterion = nn.BCELoss()  ##nn.BCEWithLogitsLoss() ##  
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [130]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        text, labels = batch
        predictions = model(text).squeeze(1)

        loss = criterion(predictions, labels.float())
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [131]:
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, labels.float())
            total_loss += loss.item() * text.size(0)
            
            preds = torch.round(torch.sigmoid(predictions))
            correct = (preds == labels).float()
            total_correct += correct.sum().item()
            
            total_samples += text.size(0)
    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples 
    
    return avg_loss, accuracy           

In [132]:
for epoch in range(50):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    # scheduler.step(valid_loss)
    
    # # Early stopping
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     early_stopping_counter = 0
    # else:
    #     early_stopping_counter += 1
    #     if early_stopping_counter >= patience:
    #         print("Early stopping triggered!")
    #         break

  text_list = pad_sequence([torch.tensor(text) for text in text_list],


Epoch: 01
	Train Loss: 0.689 | Train Acc: 0.00%
	 Val. Loss: 0.686 |  Val. Acc: 41.42%
Epoch: 02
	Train Loss: 0.687 | Train Acc: 0.00%
	 Val. Loss: 0.678 |  Val. Acc: 41.42%
Epoch: 03
	Train Loss: 0.684 | Train Acc: 0.00%
	 Val. Loss: 0.680 |  Val. Acc: 41.42%
Epoch: 04
	Train Loss: 0.686 | Train Acc: 0.00%
	 Val. Loss: 0.680 |  Val. Acc: 41.42%
Epoch: 05
	Train Loss: 0.684 | Train Acc: 0.00%
	 Val. Loss: 0.680 |  Val. Acc: 41.42%
Epoch: 06
	Train Loss: 0.682 | Train Acc: 0.00%
	 Val. Loss: 0.679 |  Val. Acc: 41.42%
Epoch: 07
	Train Loss: 0.678 | Train Acc: 0.02%
	 Val. Loss: 0.685 |  Val. Acc: 41.42%
Epoch: 08
	Train Loss: 0.681 | Train Acc: 0.00%
	 Val. Loss: 0.680 |  Val. Acc: 41.42%
Epoch: 09
	Train Loss: 0.684 | Train Acc: 0.00%
	 Val. Loss: 0.679 |  Val. Acc: 41.42%
Epoch: 10
	Train Loss: 0.680 | Train Acc: 0.00%
	 Val. Loss: 0.682 |  Val. Acc: 41.42%
Epoch: 11
	Train Loss: 0.680 | Train Acc: 0.00%
	 Val. Loss: 0.686 |  Val. Acc: 41.42%
Epoch: 12
	Train Loss: 0.679 | Train Acc: 0

In [133]:
# from sklearn.metrics import confusion_matrix

# def predict(model, test_loader):
#     model.eval()  # Set the model to evaluation mode
#     predictions = []
#     true_labels = []
#     with torch.no_grad():  # Disable gradient tracking during inference
#         for texts, labels in test_loader:
#             output = model(texts).squeeze(1)
#             print(output)
#             probabilities = torch.sigmoid(output)  # Apply sigmoid to obtain probabilities
#             predictions.extend(torch.round(probabilities).cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
    
#     return predictions, true_labels

# # Assuming you have a test_loader
# test_predictions, true_labels = predict(model, test_loader)

# # Calculate confusion matrix
# conf_matrix = confusion_matrix(true_labels, test_predictions)

# print("Confusion Matrix:")
# print(conf_matrix)