# [문장 유형 분류 AI 경진대회](https://dacon.io/competitions/official/236037/overview/description)

## Dataset Info.

+ train.csv [파일]
    - ID : 샘플 문장 별 고유 ID
    - 문장 : 샘플 별 한 개의 문장
    - 유형 : 문장의 유형 (사실형, 추론형, 대화형, 예측형)
    - 극성 : 문장의 극성 (긍정, 부정, 미정)
    - 시제 : 문장의 시제 (과거, 현재, 미래)
    - 확실성 : 문장의 확실성 (확실, 불확실)
    - label : 문장 별 유형, 극성, 시제, 확실성에 대한 Class (총 72개 종류의 Class 존재)  
    예시) 사실형-긍정-현재-확실


- test.csv [파일]
    + ID : 샘플 문장 별 고유 ID
    + 문장 : 샘플 별 한 개의 문장


+ sample_submission.csv [파일] - 제출 양식
    - ID : 샘플 문장 별 고유 ID
    - label : 예측한 문장 별 유형, 극성, 시제, 확실성에 대한 Class  
    예시) 사실형-긍정-현재-확실


## Code

### 01_Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import platform

# Pick the best available accelerator: CUDA first, then Apple's MPS backend,
# and fall back to the CPU when neither is usable.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps:0')
else:
    device = torch.device('cpu')

print (f"PyTorch version:{torch.__version__}") # the MPS backend needs 1.12.1 or newer
print(f"MPS 장치를 지원하도록 build 되었는지: {torch.backends.mps.is_built()}") # should be True on Apple-silicon macOS
print(f"MPS 장치가 사용 가능한지: {torch.backends.mps.is_available()}") # should be True on Apple-silicon macOS
# Report the platform of the running interpreter; unlike an IPython `!python`
# shell escape this is plain Python and cannot pick up a different interpreter.
print(platform.platform())

PyTorch version:1.13.0.dev20220913
MPS 장치를 지원하도록 build 되었는지: True
MPS 장치가 사용 가능한지: True
macOS-12.5-arm64-arm-64bit


### 02_Hyperparameter Setting

In [3]:
# Run-wide hyperparameters, looked up by key in later cells
# (e.g. CFG['SEED'] for seeding/splitting, CFG['BATCH_SIZE'] for the loader).
CFG = dict(
    EPOCHS=10,
    LEARNING_RATE=1e-4,
    BATCH_SIZE=256,
    SEED=190813,
)

### 03_Fixed RandomSeed

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Stored as a string because environment variables are text.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Both CUDA calls are silently ignored when CUDA is not available.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick kernels per run, which can
    # change results between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False
    
# Fix all RNG seeds from the shared config before any later cell runs.
seed_everything(seed=CFG['SEED'])

### 04_Data Load

In [5]:
# Read the labelled training sentences and the unlabelled test sentences.
df, test = pd.read_csv('./01_data/train.csv'), pd.read_csv('./01_data/test.csv')

### 05_Train / Validation Split

In [6]:
# Hold out 20% of the labelled rows for validation. Splitting the frame alone
# is enough: the label column travels with each row, so there is no need to
# split df['label'] separately and throw the result away.
train, val = train_test_split(df, test_size=0.2, random_state=CFG['SEED'])
# Later cells overwrite label columns on these frames; take independent copies
# so those assignments never target a view of `df` (warnings are silenced
# globally in this notebook, which would hide pandas' chained-assignment warning).
train, val = train.copy(), val.copy()

### 06_Data Pre-processing

#### A_문장(Text) 벡터화 via TfidfVectorizer

In [8]:
# Word-level TF-IDF over unigrams and bigrams, with a minimum document
# frequency of 4.
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=4)
# Fit on the training sentences only; validation and test are merely transformed.
vectorizer.fit(np.array(train['문장']))

# Project every split onto the vocabulary learned above.
train_vec, val_vec, test_vec = (
    vectorizer.transform(frame['문장']) for frame in (train, val, test)
)

print(train_vec.shape, val_vec.shape, test_vec.shape)

(13232, 9295) (3309, 9295) (7090, 9295)


#### B_Label Encoding (`유형`, `극성`, `시제`, `확실성`)

In [10]:
# One encoder per target column. Each must be fitted on its own column so it
# can later map predicted integer ids back to that column's original labels.
type_le = preprocessing.LabelEncoder()
polarity_le = preprocessing.LabelEncoder()
tense_le = preprocessing.LabelEncoder()
certainty_le = preprocessing.LabelEncoder()

# Pairing each column with its encoder in one place avoids the copy-paste slip
# of fitting every column with the same encoder.
for column, encoder in (
    ('유형', type_le),
    ('극성', polarity_le),
    ('시제', tense_le),
    ('확실성', certainty_le),
):
    # Fit on the training split only, then reuse the learned mapping for the
    # validation split. NOTE: transform() raises ValueError if the validation
    # split contains a label value that never occurs in the training split.
    train[column] = encoder.fit_transform(train[column].values)
    val[column] = encoder.transform(val[column].values)

In [14]:
# Pull the four encoded target columns out of the training frame as arrays.
train_type, train_polarity, train_tense, train_certainty = (
    train[column].values for column in ('유형', '극성', '시제', '확실성')
)

# Bundle them under English keys; the dataset class looks labels up by these keys.
train_labels = dict(
    type=train_type,
    polarity=train_polarity,
    tense=train_tense,
    certainty=train_certainty,
)

### 07_Custom Dataset

In [15]:
class CustomDataset(Dataset):
    """Dataset pairing sparse sentence vectors with four label arrays.

    Args:
        st_vec: Row-indexable sparse matrix with one row per sentence; each
            row must support ``.toarray()`` (e.g. a SciPy CSR matrix).
        st_labels: Mapping with the keys ``'type'``, ``'polarity'``,
            ``'tense'`` and ``'certainty'``, each an index-aligned sequence
            of labels, or ``None`` for unlabelled data.
    """

    def __init__(self, st_vec, st_labels):
        self.st_vec = st_vec
        self.st_labels = st_labels
        # Kept so code that read the attribute name used previously still works.
        self.labels = st_labels

    def __getitem__(self, index):
        """Return the dense vector of row ``index``, plus labels if present."""
        # Densify only the requested row; squeeze drops the leading row axis.
        st_vector = torch.FloatTensor(self.st_vec[index].toarray()).squeeze(0)
        if self.st_labels is not None:
            st_type = self.st_labels['type'][index]
            st_polarity = self.st_labels['polarity'][index]
            st_tense = self.st_labels['tense'][index]
            st_certainty = self.st_labels['certainty'][index]
            return st_vector, st_type, st_polarity, st_tense, st_certainty
        else:
            return st_vector

    def __len__(self):
        """Return the number of sentences (rows of the sparse matrix)."""
        # Read the row count from the shape instead of densifying the whole matrix.
        return self.st_vec.shape[0]


# Backward-compatible alias for the original (misspelled) class name.
CustomDatast = CustomDataset

In [None]:
# Wrap the TF-IDF training matrix together with its encoded label arrays.
train_dataset = CustomDataset(train_vec, train_labels)
# shuffle=True makes the loader draw training samples in a new random order
# each epoch; batches are sized from the shared config.
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0)