In [1]:
# 필요한 라이브러리 import
import re
import nltk
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import clear_output
import os

nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s2060\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s2060\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# 전처리 함수
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W+|\d+', ' ', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = nltk.stem.PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(stemmed_tokens)

In [3]:
# 데이터 로드 및 전처리
def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath)
    df['text'] = df['text'].apply(preprocess_text)
    return df

In [4]:
# 데이터셋 준비
def prepare_dataset(dataframe, tokenizer):
    encodings = tokenizer(dataframe['text'].tolist(), truncation=True, padding=True, max_length=512)
    dataset = TensorDataset(torch.tensor(encodings['input_ids']), 
                            torch.tensor(encodings['attention_mask']), 
                            torch.tensor(dataframe['target'].values))
    return dataset

In [5]:
# 모델 평가 함수
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids)

    # F1 점수 계산
    f1 = f1_score(true_labels, predictions, average='binary')
    return f1

In [16]:
# 데이터 로드 및 전처리
train_df = load_and_preprocess_data('./train.csv')
train_df, validation_df = train_test_split(train_df, test_size=0.3, random_state=42, stratify=train_df['target'])

print(len(train_df))
print(len(validation_df))

5329
2284


In [17]:
# 데이터셋 준비
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
train_dataset = prepare_dataset(train_df, tokenizer)
validation_dataset = prepare_dataset(validation_df, tokenizer)

print('train_dataset[0]:', train_dataset[0])
print('validation_dataset[0]:', validation_dataset[0])

train_dataset[0]: (tensor([  101,  2474, 15942,  2327, 25022,  3775,  2417,  2422,  2448, 10611,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), tensor(1))
validation_dataset[0]: (tensor([  101,  2047,  4400, 10017,  3775,  4681,  1056, 28394,  2102,  8254,
         2278,  1057, 29653,  2094,  4961,  3422,  2585,  1057,  1035,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), tens

In [18]:
# 데이터 로더 설정
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
validation_dataloader = DataLoader(validation_dataset, sampler=SequentialSampler(validation_dataset), batch_size=32)

In [19]:
# 모델 설정 및 학습
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
model.to(device)

# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=0.000008, weight_decay=0.01)

# TensorDataset을 DataLoader로 변환
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
validation_dataloader = DataLoader(validation_dataset, sampler=SequentialSampler(validation_dataset), batch_size=32)

# F1 점수를 저장할 리스트
train_f1_scores = []
val_f1_scores = []

# 모델 저장 경로 설정
output_dir = './model_save/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 조기 종료를 위한 설정
best_val_f1 = 0
patience = 10
patience_counter = 0
best_epoch = 0

# 훈련 루프
epochs = 100
for epoch in tqdm(range(epochs), desc="Epochs"):
    model.train()
    total_loss = 0
    train_predictions, train_true_labels = [], []

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        train_predictions.extend(np.argmax(logits, axis=1).flatten())
        train_true_labels.extend(label_ids)

    # Calculate train F1 score
    train_f1 = f1_score(train_true_labels, train_predictions, average='binary')
    train_f1_scores.append(train_f1)

    # Validation phase
    val_f1 = evaluate_model(model, validation_dataloader, device)
    val_f1_scores.append(val_f1)

    # Print training/validation statistics
    clear_output(wait=True)
    print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_dataloader)}")
    print(f"Epoch {epoch+1}/{epochs} - Train F1 Score: {train_f1}")
    print(f"Epoch {epoch+1}/{epochs} - Validation F1 Score: {val_f1}")
    print(f"Best Validation F1 Score: {best_val_f1}")
    print(f"Best Epoch: {best_epoch}")
    print(f"Patience Counter: {patience_counter}")

    # 조기 종료 로직
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_epoch = epoch
        patience_counter = 0
        torch.save(model.state_dict(), os.path.join(output_dir, 'best_model_state.bin'))
        print(f"Saved best model with F1 score {best_val_f1} in {output_dir}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            plt.figure(figsize=(12, 6))
            plt.plot(train_f1_scores, label='Train F1 Score', marker='o')
            plt.plot(val_f1_scores, label='Validation F1 Score', marker='o')
            plt.title('F1 Score over Epochs')
            plt.xlabel('Epoch')
            plt.ylabel('F1 Score')
            plt.legend()
            plt.grid(True)
            plt.savefig(os.path.join(output_dir, 'f1_score.png'))
            plt.show()
            print(f"No improvement in F1 score for {patience} consecutive epochs, stopping training.")
            break


    # Plot F1 scores
    plt.figure(figsize=(12, 6))
    plt.plot(train_f1_scores, label='Train F1 Score', marker='o')
    plt.plot(val_f1_scores, label='Validation F1 Score', marker='o')
    plt.title('F1 Score over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('F1 Score')
    plt.legend()
    plt.grid(True)
    plt.show()


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

KeyboardInterrupt: 

In [12]:
# 훈련 완료 후 최종 모델 로드
model_path = os.path.join(output_dir, 'best_model_state.bin')
# model_path = './0.79.bin'
model.load_state_dict(torch.load(model_path))

# 모델을 평가 모드로 설정
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
# 데이터 로드 및 전처리
test_df = load_and_preprocess_data('./test.csv')

print(test_df)


         id keyword location   
0         0     NaN      NaN  \
1         2     NaN      NaN   
2         3     NaN      NaN   
3         9     NaN      NaN   
4        11     NaN      NaN   
...     ...     ...      ...   
3258  10861     NaN      NaN   
3259  10865     NaN      NaN   
3260  10868     NaN      NaN   
3261  10874     NaN      NaN   
3262  10875     NaN      NaN   

                                                   text  
0                              happen terribl car crash  
1         heard earthquak differ citi stay safe everyon  
2     forest fire spot pond gees flee across street ...  
3                        apocalyps light spokan wildfir  
4                    typhoon soudelor kill china taiwan  
...                                                 ...  
3258    earthquak safeti lo angel ûò safeti fasten xrwn  
3259  storm ri wors last hurrican citi amp other har...  
3260                          green line derail chicago  
3261                meg issu hazard

In [14]:
# 모델과 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# 토크나이징 및 인코딩
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=512)

# PyTorch의 TensorDataset을 사용하여 데이터셋 생성
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), 
                             torch.tensor(test_encodings['attention_mask']))

# DataLoader 설정
test_dataloader = DataLoader(test_dataset, batch_size=32)

# 예측 수행
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()

        test_predictions.extend(np.argmax(logits, axis=1).flatten())

# 예측 결과의 첫 10개를 확인
print(test_predictions)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 




In [15]:
# 예측 결과와 id를 결합하여 제출 데이터프레임 생성
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'target': test_predictions
})

# 제출 파일로 저장
submission_file_path = './submission.csv'
submission_df.to_csv(submission_file_path, index=False)

# 제출 파일 경로 확인
submission_file_path


'./submission.csv'