#Install

In [17]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import warnings
import random
from copy import deepcopy
warnings.filterwarnings("ignore")

In [18]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score

In [19]:
import torch
from torch import nn
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler

In [20]:
device='cuda' if torch.cuda.is_available() else 'cpu'

if device=='cuda':
  torch.cuda.manual_seed_all(0)

#Data Processing

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
base_path = '/content/drive/MyDrive/2022_Unithon_Data/data/dataset'

In [45]:
# 파일 불러오기
def dataframe_from_csv(target):
  return pd.read_csv(target).rename(columns=lambda x:x.strip())

def dataframe_from_csvs(targets):
  return pd.concat([dataframe_from_csv(x) for x in targets])

train_files = sorted([x for x in Path(base_path+'/train/').glob('*.csv')])
val_files = sorted([x for x in Path(base_path+'/val/').glob('*.csv')])

train = dataframe_from_csvs(train_files)
val = dataframe_from_csvs(val_files)
test = pd.read_csv(base_path+'/test.csv')
print(f'train: {len(train)}')
print(f'validation: {len(val)}')
print(f'test: {len(test)}')

train: 62564
validation: 7820
test: 7820


In [46]:
train.head()

Unnamed: 0,site,sid,leaktype,C01,C02,C03,C04,C05,C06,C07,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,S-4687025030,S-0359369085120315,out,19,30,27,26,24,14,10,...,24,20,24,38,60,198,185,46,17,4680
1,S-4677025028,S-0359369083968368,out,2,343,46,22,12,8,6,...,2,2,2,2,2,4,2,2,2,300
2,S-4673025027,S-0359369085133797,out,0,36,5,3,3,0,3,...,0,0,3,3,3,3,3,3,3,240
3,S-4772025022,S-0359369084117593,out,2,24,4,4,2,2,2,...,2,4,2,4,4,4,4,2,4,370
4,S-4729010102,S-0359369084010178,out,3,25,5,5,5,5,5,...,5,5,6,8,6,5,5,6,5,430


In [47]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train['site'] = le.fit_transform(train['site'])
test['site']=le.transform(test['site'])

train['leaktype'] = le.fit_transform(train['leaktype'])

train['leaktype'].nunique()

5

In [48]:
train = train.drop(['site', 'sid'], axis=1)
val = val.drop(['site', 'sid'], axis=1)
test = test.drop(['site', 'sid'], axis=1)
train['leaktype'].replace(['out','in','noise','other','normal'], [0,1,2,3,4], inplace=True)
val['leaktype'].replace(['out','in','noise','other','normal'], [0,1,2,3,4], inplace=True)
test['leaktype']=""

# train을 target과 feature로 나눠줍니다.
train_x=train.drop(['leaktype'], axis=1)
train_y=train['leaktype']
val_x=val.drop(['leaktype'], axis=1)
val_y=val['leaktype']
test_x=test.drop(['leaktype'], axis=1)
test_y=test['leaktype']

In [49]:
print(train_x.head())
print(train_y.head())

   C01  C02  C03  C04  C05  C06  C07  C08  C09  C10  ...  C17  C18  C19  C20  \
0   19   30   27   26   24   14   10    9    6   15  ...   24   20   24   38   
1    2  343   46   22   12    8    6    4    4    2  ...    2    2    2    2   
2    0   36    5    3    3    0    3    3    3    3  ...    0    0    3    3   
3    2   24    4    4    2    2    2    4    2    4  ...    2    4    2    4   
4    3   25    5    5    5    5    5    5    6    5  ...    5    5    6    8   

   C21  C22  C23  C24  C25   C26  
0   60  198  185   46   17  4680  
1    2    4    2    2    2   300  
2    3    3    3    3    3   240  
3    4    4    4    2    4   370  
4    6    5    5    6    5   430  

[5 rows x 26 columns]
0    4
1    4
2    4
3    4
4    4
Name: leaktype, dtype: int64


#Data Loader

In [50]:
transformer = RobustScaler().fit(train_x)
X_train=transformer.transform(train_x)
X_test=transformer.transform(test_x)
X_val=transformer.transform(val_x)

In [51]:
X_train = torch.FloatTensor(X_train).to(device)
y_train = torch.LongTensor(train_y.to_numpy()).to(device)
X_test = torch.FloatTensor(X_test).to(device)
y_val = torch.LongTensor(val_y.to_numpy()).to(device)
X_val = torch.FloatTensor(X_val).to(device)

In [52]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

print(y_val.shape)
print(y_train.shape)

torch.Size([62564, 26])
torch.Size([7820, 26])
torch.Size([7820, 26])
torch.Size([7820])
torch.Size([62564])


In [30]:
# 데이터 로더 지정
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train, y_train)
#62564
train_loader = DataLoader(train_dataset, batch_size=62564,shuffle=True,drop_last=True)

#Pytorch Linear Model

In [31]:
class SimpleLinearClassifier(nn.Module):
  def __init__(self, input_dim=26):
    super(SimpleLinearClassifier, self).__init__()

    linear1 = torch.nn.Linear(input_dim,128)
    ln1 = nn.LayerNorm(128)
    linear2 = torch.nn.Linear(128,256)
    ln2 = nn.LayerNorm(256)
    linear3 = torch.nn.Linear(256,512)
    ln3 = nn.LayerNorm(512)
    linear4 = torch.nn.Linear(512,512)
    ln4 = nn.LayerNorm(512)
    linear5 = torch.nn.Linear(512,512)
    ln5 = nn.LayerNorm(512)
    linear6 = torch.nn.Linear(512,256)
    ln6 = nn.LayerNorm(256)
    linear7 = torch.nn.Linear(256,128)
    ln7 = nn.LayerNorm(128)
    linear8 = torch.nn.Linear(128,5)

    

    relu = torch.nn.ReLU()
    dropout = torch.nn.Dropout(p = 0.05)

    torch.nn.init.xavier_uniform_(linear1.weight)
    torch.nn.init.xavier_uniform_(linear2.weight)
    torch.nn.init.xavier_uniform_(linear3.weight)
    torch.nn.init.xavier_uniform_(linear4.weight)
    torch.nn.init.xavier_uniform_(linear5.weight)
    torch.nn.init.xavier_uniform_(linear6.weight)
    torch.nn.init.xavier_uniform_(linear7.weight)
    torch.nn.init.xavier_uniform_(linear8.weight)


    self.layer = torch.nn.Sequential(linear1, relu, dropout, 
                            linear2, relu, dropout,
                            linear3, relu, dropout,
                            linear4, relu, dropout,
                            linear5, relu, dropout,
                            linear6, relu, dropout,
                            linear7, relu, dropout,
                            linear8)
  def forward(self, x):
    out = self.layer(x)
    return out


In [32]:
class SimpleLinearClassifier(nn.Module):
  def __init__(self, input_dim=26):
    super(SimpleLinearClassifier, self).__init__()

    linear1 = torch.nn.Linear(538,1024)
    linear2 = torch.nn.Linear(1024,1024)
    linear3 = torch.nn.Linear(1024,512)
    linear4 = torch.nn.Linear(512,512)
    linear5 = torch.nn.Linear(512,128)
    linear6 = torch.nn.Linear(128,5)


    relu = torch.nn.LeakyReLU(negative_slope=0.01, inplace=False)
    dropout = torch.nn.Dropout(p = 0.18)

    torch.nn.init.xavier_uniform_(linear1.weight)
    torch.nn.init.xavier_uniform_(linear2.weight)
    torch.nn.init.xavier_uniform_(linear3.weight)
    torch.nn.init.xavier_uniform_(linear4.weight)
    torch.nn.init.xavier_uniform_(linear5.weight)
    torch.nn.init.xavier_uniform_(linear6.weight)

    self.layer = torch.nn.Sequential(linear1, relu, dropout,
                            linear2, relu, dropout,
                            linear3, relu, dropout,
                            linear4, relu, dropout,
                            linear5, relu, dropout,
                            linear6)
  def forward(self, x):
    out = self.layer(x)
    return out

In [33]:
classifier = SimpleLinearClassifier(input_dim=26).to(device)

In [37]:
# loss 및 optimzer 지정, learning rate & epoch 값 지정
loss = torch.nn.CrossEntropyLoss().to(device)
#optimizer = torch.optim.Adam(classifier.parameters(), lr = 0.001)
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
#                                        lr_lambda=lambda epoch: 0.95 ** epoch,
#                                        last_epoch=-1,
#                                        verbose=False)
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.002)
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
#                                       lr_lambda=lambda epoch: 0.95 ** epoch)

epoch = 10000

#Train

In [38]:
# 학습
classifier.train()
total_batch = len(train_loader)
loss_list = []
acc_list = []
best_acc = 0
best_acc_classifier = None

for i in tqdm(range(epoch+1)):
  sum_cost = 0.0
  train_acc = 0.0

  for X, y in train_loader:
    X = X.to(device)
    y = y.to(device)
    
    # H(x) 계산
    output = classifier(X)
    #Accuracy
    max_vals, max_indices = torch.max(output, 1)
    train_acc += (max_indices == y).sum().data.cpu().numpy()
    # cost 계산
    cost = loss(output, y)

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    sum_cost += cost
  #scheduler.step()
  output_train = classifier(X_train)
  train_max_vals, train_max_indices = torch.max(output_train, 1)
  print(len(train_max_indices))
  train_acc = (train_max_indices == y_train).sum().data.cpu().numpy() / len(train_max_indices)


  output_val = classifier(X_val)
  val_max_vals, val_max_indices = torch.max(output_val, 1)
  val_acc = (val_max_indices == y_val).sum().data.cpu().numpy() / len(val_max_indices)

  avg_cost = sum_cost / total_batch
  if val_acc > best_acc:
    best_acc = val_acc
    best_acc_classifier = deepcopy(classifier.state_dict())
  print("Epoch :", i, "Cost :", format(avg_cost), "Accuracy_train", format(train_acc), "Accuracy_val", format(val_acc))

  0%|          | 0/10001 [00:00<?, ?it/s]


RuntimeError: ignored

#Validate

In [None]:
best_acc

In [None]:
best_classifier = SimpleLinearClassifier(input_dim=26).to(device)
best_classifier.load_state_dict(best_acc_classifier)
output_test = best_classifier(X_test)
output_valid = best_classifier(X_val)
test_max_vals,test_max_indices = torch.max(output_test, 1)
valid_max_vals,valid_max_indices = torch.max(output_valid, 1)
print(test_max_indices)

In [None]:
title = 'version2_01_069'

In [None]:
print(val_y)

In [None]:
print("Validation F1 score: ", f1_score(val_y, valid_max_indices.cpu(), average='macro'))

In [None]:
print("Validation Accuracy Score: ", accuracy_score(val_y, valid_max_indices.cpu()))

#Test

In [None]:
submission = pd.read_csv(base_path+'/sample_submission.csv')
submission['leaktype']=test_max_indices.cpu()
submission

In [None]:
submission.to_csv(base_path+'submission_'+title+'.csv', index=False)