In [238]:
# import 
import torch
import pandas as pd
import numpy as np
import random
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import torch.optim as optim
import html
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split


# Lift pandas' display limits so every column, every row and the full cell text are shown.
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [239]:
# random seed
def set_seed(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    generator. When CUDA is available, the CUDA generators are seeded as well,
    cuDNN's ``deterministic`` flag is switched on and its ``benchmark`` flag is
    switched off.

    Args:
        seed_value: Seed handed to each generator.
    """
    # Python, NumPy and PyTorch generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    # Nothing else to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    # GPU-only settings.
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


SEED = 42
set_seed(SEED)

In [240]:
# Load the train / test splits from CSV (paths are relative to this notebook's directory).
train_df = pd.read_csv('../../data/train.csv')
test_df = pd.read_csv('../../data/test.csv')

In [241]:
# Show column dtypes and non-null counts of the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   Description  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [242]:
# Add a `text` column: headline and body joined by a single space.
# Nothing is displayed here on purpose: Jupyter only renders the last expression
# of a cell, so a preview in the middle of the cell would be a no-op.
train_df['text'] = train_df['Title'] + " " + train_df['Description']
test_df['text'] = test_df['Title'] + " " + test_df['Description']


In [243]:
# Character-length distribution of the combined text before preprocessing.
train_df['text'].str.len().describe()

count    120000.000000
mean        236.460025
std          66.529799
min          17.000000
25%         196.000000
50%         232.000000
75%         266.000000
max        1012.000000
Name: text, dtype: float64

## 전처리

In [244]:
# Look at a random sample of rows to decide which preprocessing steps are needed.
sample = train_df.sample(10)
sample

Unnamed: 0,Class Index,Title,Description,text
71787,3,"BBC set for major shake-up, claims newspaper","London - The British Broadcasting Corporation, the world #39;s biggest public broadcaster, is to cut almost a quarter of its 28 000-strong workforce, in the biggest shake-up in its 82-year history, The Times newspaper in London said on Monday.","BBC set for major shake-up, claims newspaper London - The British Broadcasting Corporation, the world #39;s biggest public broadcaster, is to cut almost a quarter of its 28 000-strong workforce, in the biggest shake-up in its 82-year history, The Times newspaper in London said on Monday."
67218,3,Marsh averts cash crunch,Embattled insurance broker #39;s banks agree to waive clause that may have prevented access to credit. NEW YORK (Reuters) - Marsh amp; McLennan Cos.,Marsh averts cash crunch Embattled insurance broker #39;s banks agree to waive clause that may have prevented access to credit. NEW YORK (Reuters) - Marsh amp; McLennan Cos.
54066,2,"Jeter, Yankees Look to Take Control (AP)",AP - Derek Jeter turned a season that started with a terrible slump into one of the best in his accomplished 10-year career.,"Jeter, Yankees Look to Take Control (AP) AP - Derek Jeter turned a season that started with a terrible slump into one of the best in his accomplished 10-year career."
7168,4,Flying the Sun to Safety,"When the Genesis capsule comes back to Earth with its samples of the sun, helicopter pilots will be waiting for it, ready to snag it out of the sky.","Flying the Sun to Safety When the Genesis capsule comes back to Earth with its samples of the sun, helicopter pilots will be waiting for it, ready to snag it out of the sky."
29618,3,Stocks Seen Flat as Nortel and Oil Weigh,"NEW YORK (Reuters) - U.S. stocks were set to open near unchanged on Thursday after a warning from technology bellwether Nortel Networks Corp. &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=NT.N target=/stocks/quickinfo/fullquote""&gt;NT.N&lt;/A&gt; dimmed hopes, while stubbornly high oil prices also weighed on sentiment.","Stocks Seen Flat as Nortel and Oil Weigh NEW YORK (Reuters) - U.S. stocks were set to open near unchanged on Thursday after a warning from technology bellwether Nortel Networks Corp. &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=NT.N target=/stocks/quickinfo/fullquote""&gt;NT.N&lt;/A&gt; dimmed hopes, while stubbornly high oil prices also weighed on sentiment."
101425,2,Inter Milan seeks redemption win against Juventus,"It is early in the season for a decisive match, yet Inter Milan-Juventus on Sunday is shaping up as exactly that. Serie A leader Juventus stands 15 points ahead of Inter, but both teams see the meeting as key to their season.","Inter Milan seeks redemption win against Juventus It is early in the season for a decisive match, yet Inter Milan-Juventus on Sunday is shaping up as exactly that. Serie A leader Juventus stands 15 points ahead of Inter, but both teams see the meeting as key to their season."
20441,3,Saudi Arabia cuts oil prices,Oil prices eased yesterday as top world exporter Saudi Arabia slashed prices for its westbound crude sales in an effort to shift the large volumes it is offering to cool world markets.,Saudi Arabia cuts oil prices Oil prices eased yesterday as top world exporter Saudi Arabia slashed prices for its westbound crude sales in an effort to shift the large volumes it is offering to cool world markets.
2662,1,Google Cuts Its IPO Price Range,"SAN JOSE, Calif. - In a sign that Google Inc.'s initial public offering will not be as hot or big as expected, the Internet search giant slashed its estimated per-share price range and reduced the number of shares to be sold by insiders...","Google Cuts Its IPO Price Range SAN JOSE, Calif. - In a sign that Google Inc.'s initial public offering will not be as hot or big as expected, the Internet search giant slashed its estimated per-share price range and reduced the number of shares to be sold by insiders..."
20371,3,FOCUS: Santander Says HBOS Counterbid To Face Problems,"LONDON (Dow Jones)--Banco Santander Central Hispano SA (STD), the Spanish bank planning to buy UK lender Abbey National PLC (ANBA), Monday attacked its potential domestic","FOCUS: Santander Says HBOS Counterbid To Face Problems LONDON (Dow Jones)--Banco Santander Central Hispano SA (STD), the Spanish bank planning to buy UK lender Abbey National PLC (ANBA), Monday attacked its potential domestic"
108151,4,HP Revises Cluster Plans,"HP (Quote, Chart) is dropping its efforts to port some Tru64 Unix products to HP-UX with the help of storage player Veritas . The two companies announced a multi-year agreement Thursday that finds HP #39;s sales","HP Revises Cluster Plans HP (Quote, Chart) is dropping its efforts to port some Tru64 Unix products to HP-UX with the help of storage player Veritas . The two companies announced a multi-year agreement Thursday that finds HP #39;s sales"


In [245]:
# 전처리

# Steps 0 and 1 of the preprocessing, written as one pipeline.
# NOTE: `train_df` is rebound from itself, so re-running this cell samples a
# quarter of the already reduced frame again.
train_df = (
    train_df
    # 0. Keep only a quarter of the training rows.
    .sample(frac=0.25, random_state=42)
    # 1. Turn empty / whitespace-only cells into NaN, then drop every row that has a NaN.
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna()
)

# 2. etc...
def clean_text(line):
    """Normalise one raw news string into plain text.

    Steps, in order:
      1. Decode HTML entities (``&lt;`` -> ``<``, ``&amp;`` -> ``&``, ...).
      2. Repair entities whose leading ``&`` is missing in the data
         (``#39;`` -> ``'``, ``#36;`` -> ``$``, ``amp;`` -> ``&``).
      3. Strip HTML tags and ``http...`` links.
      4. Remove ``(tm)`` / ``(r)`` trademark markers and turn backslashes into spaces.
      5. Collapse every run of whitespace into one space and trim both ends.

    Args:
        line: Raw text (a ``str``).

    Returns:
        The cleaned string.
    """
    line = html.unescape(line)          # restore escaped markup so the tag regex can see it
    line = line.replace("#39;", "'")    # single quote
    line = line.replace("#36;", "$")    # dollar sign
    # Treat "amp;" as a damaged "&amp;" only when it does not end an ordinary word;
    # a plain str.replace would turn "camp;" into "c&".
    line = re.sub(r'(?<!\w)amp;', '&', line)
    line = re.sub('<.*?>', '', line)                  # delete HTML tags
    line = re.sub(r'http\S+', '', line)               # delete links
    line = re.sub(r'\((tm|r|TM|R)\)', '', line)       # trademark markers
    line = re.sub(r'\\', ' ', line)                   # backslashes
    # Whitespace is normalised last because the removals above can leave
    # double or trailing spaces behind.
    return ' '.join(line.split())

In [246]:
# Sanity check: print one raw description next to its cleaned version.
test_sentence = train_df['Description'].iloc[10]
tokens = clean_text(test_sentence)  # NOTE: despite the name, this is the cleaned string, not a token list
print("before cleaning: ", test_sentence)
print("after cleaning: ", tokens)

before cleaning:  Manugistics Group Inc. disclosed Monday that it fired its president, as the maker of business software tries to reverse slipping sales, rising costs and a falling stock price. &lt;BR&gt;&lt;FONT face="verdana,MS Sans Serif,arial,helvetica" size="-2"\ color="#666666"&gt;&lt;B&gt;-The Washington Post&lt;/B&gt;&lt;/FONT&gt;
after cleaning:  Manugistics Group Inc. disclosed Monday that it fired its president, as the maker of business software tries to reverse slipping sales, rising costs and a falling stock price. -The Washington Post


In [247]:
# Apply the cleaning function to the combined text of both splits.
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

In [248]:
# Relative frequency of each class in the current (sub-sampled) training frame.
train_df['Class Index'].value_counts(normalize = True)

Class Index
2    0.252000
4    0.250933
1    0.249067
3    0.248000
Name: proportion, dtype: float64

In [249]:
# Character-length distribution of the text after cleaning.
train_df['text'].str.len().describe()

count    30000.000000
mean       231.333667
std         61.281713
min         48.000000
25%        194.000000
50%        229.000000
75%        261.000000
max        976.000000
Name: text, dtype: float64

## 토크나이저

In [250]:
# Load the pretrained tokenizer.
MODEL_NAME = "bert-base-uncased" # uncased: upper case is folded to lower case / base-size model
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=False)

In [251]:
# Demo: compare the tokenizer's intermediate and final outputs for one sentence.
test_sentence = train_df['text'].iloc[1]
tokens = tokenizer.tokenize(test_sentence) # plain splitting into (sub)word pieces
print("before tokenizer: ", test_sentence)
print("after tokenizer: ", tokens)

ids = tokenizer.convert_tokens_to_ids(tokens) # token -> vocab ids

# NOTE: `train_encodings` holds a single-sentence encoding here; the next cell
# rebinds the same name to the encodings of the whole training set.
train_encodings = tokenizer(
    test_sentence, 
    # return_tensors="pt" # pytorch tensor
)

print()
print("after encodings: ", train_encodings['input_ids']) # CLS SEP token added
print("convert to ids: ", ids)
print(tokenizer.decode(train_encodings['input_ids']))
print(tokenizer.decode(ids))

before tokenizer:  Marsh averts cash crunch Embattled insurance broker 's banks agree to waive clause that may have prevented access to credit. NEW YORK (Reuters) - Marsh & McLennan Cos.
after tokenizer:  ['marsh', 'ave', '##rts', 'cash', 'crunch', 'em', '##bat', '##tled', 'insurance', 'broker', "'", 's', 'banks', 'agree', 'to', 'wai', '##ve', 'clause', 'that', 'may', 'have', 'prevented', 'access', 'to', 'credit', '.', 'new', 'york', '(', 'reuters', ')', '-', 'marsh', '&', 'mc', '##len', '##nan', 'co', '##s', '.']

after encodings:  [101, 9409, 13642, 21217, 5356, 24514, 7861, 14479, 14782, 5427, 20138, 1005, 1055, 5085, 5993, 2000, 23701, 3726, 11075, 2008, 2089, 2031, 8729, 3229, 2000, 4923, 1012, 2047, 2259, 1006, 26665, 1007, 1011, 9409, 1004, 11338, 7770, 7229, 2522, 2015, 1012, 102]
convert to ids:  [9409, 13642, 21217, 5356, 24514, 7861, 14479, 14782, 5427, 20138, 1005, 1055, 5085, 5993, 2000, 23701, 3726, 11075, 2008, 2089, 2031, 8729, 3229, 2000, 4923, 1012, 2047, 2259, 1006, 

In [252]:
# Upper bound on tokens per example. The cleaned texts average ~230 characters
# (max 976), so padding every row to the model maximum of 512 tokens would make
# the LSTM step through mostly [PAD] positions. Longer texts are truncated.
MAX_LENGTH = 128


def encode_texts(texts, max_length=MAX_LENGTH):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of cleaned input strings.
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer output for ``texts``, returned as PyTorch tensors;
        ``input_ids`` is the entry consumed downstream.
    """
    return tokenizer(
        texts,
        padding="max_length",   # pad up to `max_length` (not the model maximum)
        truncation=True,        # cut anything longer than `max_length`
        max_length=max_length,
        return_tensors='pt'
    )


# train
train_labels, texts = [train_df[col].tolist() for col in ['Class Index', 'text']]
train_encodings = encode_texts(texts)

# test
test_labels, texts = [test_df[col].tolist() for col in ['Class Index', 'text']]
test_encodings = encode_texts(texts)

In [253]:
# data Loader
class CustomDataset(Dataset):
    """Dataset pairing pre-tokenized ``input_ids`` rows with class labels.

    Args:
        encodings: Mapping with an ``'input_ids'`` entry that is indexable by row.
        labels: Sequence of class indices, one per row; each is shifted down by
            one when an item is fetched.
        is_train: When True, items are ``(input_ids, label)`` pairs; otherwise
            only ``input_ids`` is returned.
    """

    def __init__(self, encodings, labels, is_train=True):
        self.encodings, self.labels, self.is_train = encodings, labels, is_train

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example."""
        token_ids = self.encodings['input_ids'][idx]
        if not self.is_train:
            return token_ids
        # Labels are passed along only in training mode, shifted to the 0~3 range.
        target = torch.tensor(self.labels[idx] - 1, dtype=torch.long)
        return token_ids, target

In [257]:
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear classifier over batches of token ids.

    Padding is excluded from the recurrence: each sequence is packed to its
    number of non-pad tokens, so the hidden state fed to the classifier is the
    one after the last real token rather than after trailing padding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of classes (size of the returned score vector).
        dropout: Dropout probability applied to the embeddings and to the final
            hidden state; also used between LSTM layers when ``num_layers > 1``.
        num_layers: Number of stacked LSTM layers.
        pad_idx: Token id used for padding. The default 0 is assumed to match
            the tokenizer's pad id (pass ``tokenizer.pad_token_id`` to be explicit).
            Padding is assumed to sit at the end of each sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout,
                 num_layers=1, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,  # inputs are (batch, sequence, feature)
            # nn.LSTM applies dropout only between stacked layers; with a single
            # layer a non-zero value does nothing except raise a UserWarning.
            dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim, output_dim)  # final classification layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Score each sequence in ``x`` (a ``(batch, sequence)`` tensor of token ids).

        Returns:
            A ``(batch, output_dim)`` tensor of unnormalised class scores.
        """
        embedded = self.dropout(self.embedding(x))
        # True length of every row; clamp so an all-padding row still has length 1.
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)
        last_hidden = hidden[-1]  # final state of the top LSTM layer
        return self.fc(self.dropout(last_hidden))

In [259]:
# --- Hyperparameters ---
MIN_FREQ = 2      # not referenced anywhere in the cells shown here
BATCH_SIZE = 64
EMBED_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 4    # number of classes
DROPOUT = 0.5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Data ---
# The test split is also wrapped with is_train=True so that it yields labels.
train_dataset = CustomDataset(train_encodings, train_labels, is_train=True)
test_dataset = CustomDataset(test_encodings, test_labels, is_train=True)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Model, optimizer, loss ---
model = LSTMClassifier(
    vocab_size=tokenizer.vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

def train(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Loss and accuracy are averaged per sample, so a smaller final batch does not
    count as much as a full one.

    Args:
        model: Module called as ``model(data)`` to produce class scores.
        train_loader: Iterable of ``(data, label)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(output, label)``; assumed to return
            the mean loss of the batch (the default of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen; ``(nan, nan)`` when the
        loader yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for data, label in train_loader:
        data = data.to(device)
        label = label.to(device)

        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()
        # Forward pass and loss.
        output = model(data)
        loss = criterion(output, label)
        # Backward pass (gradient computation) and weight update.
        loss.backward()
        optimizer.step()

        # Accumulate sums, not batch means, so the epoch average is exact.
        batch_size = label.size(0)
        total_loss += loss.item() * batch_size
        total_correct += (output.argmax(1) == label).sum().item()
        total_samples += batch_size

    if total_samples == 0:
        return float("nan"), float("nan")
    return total_loss / total_samples, total_correct / total_samples

# 검증? inference?
def evaluate(model, test_loader, criterion, device):
    """Compute loss and accuracy over ``test_loader`` without updating the model.

    The model is put in eval mode and gradients are disabled. Loss and accuracy
    are averaged per sample, so a smaller final batch does not count as much as
    a full one.

    Args:
        model: Module called as ``model(data)`` to produce class scores.
        test_loader: Iterable of ``(data, label)`` batches.
        criterion: Loss called as ``criterion(output, label)``; assumed to return
            the mean loss of the batch (the default of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen; ``(nan, nan)`` when the
        loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for data, label in test_loader:
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            loss = criterion(output, label)

            # Accumulate sums, not batch means, so the overall average is exact.
            batch_size = label.size(0)
            total_loss += loss.item() * batch_size
            total_correct += (output.argmax(1) == label).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        return float("nan"), float("nan")
    return total_loss / total_samples, total_correct / total_samples


# Number of full passes over the training data.
EPOCHS = 5
# Announce the start of training and the device in use.
print(f"학습 시작! (Device: {device})")

# One training pass followed by one evaluation pass per epoch.
# NOTE(review): the "Val" metrics below are computed on `test_loader`, i.e. the
# test split doubles as the validation set; no separate validation split is made here.
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion, device)
    
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')



학습 시작! (Device: cpu)


KeyboardInterrupt: 

## Train