In [1]:
import os
import sys
sys.path.append("../src/")
import argparse
import pandas as pd

from sklearn.metrics import mean_absolute_error

from utils import *
from preprocess import PreProcessor
# from model import Model, OptunaProcessor

# Load the training and validation splits (load_dataset presumably comes from
# the star import of `utils` — verify there).
X_train, y_train, X_valid, y_valid = load_dataset(mode='train')
categorical_feature = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG', 'BREADTH', 'DEPTH', 'DRAUGHT', 'year']
minmaxscale_feature = ['DIST', 'BUILT', 'DEADWEIGHT', 'GT', 'LENGTH', 'DUBAI', 'BRENT', 'WTI', 'BDI_ADJ', 'PORT_SIZE']

# Empty placeholder for predictions. The dtype is given explicitly because an
# empty Series without one triggers a pandas warning on 1.x and its default
# dtype differs between pandas versions.
y_pred = pd.Series(name="CI_HOUR", dtype="float64")

## preprocess data set
preprocessing = PreProcessor(categorical_feature=categorical_feature, minmaxscale_feature=minmaxscale_feature)

# Statistics are fitted on the training split only and then reused for the
# validation split, so nothing is learned from validation rows.
mean_values_train = preprocessing.nan_mean_fit(X_train)

X_train = preprocessing.preprocess(X_train, method='mean', mean_values=mean_values_train)
X_valid = preprocessing.preprocess(X_valid, method='mean', mean_values=mean_values_train)

# The encoder and scaler are likewise fitted on the (preprocessed) training split only.
encoder_dict = preprocessing.categorical_process_fit(X_train)
scaler = preprocessing.minmaxscale_process_fit(X_train)

X_train = preprocessing.transform(X_train, encoder=encoder_dict, scaler=scaler)
X_valid = preprocessing.transform(X_valid, encoder=encoder_dict, scaler=scaler)

# NOTE(review): reset_data presumably realigns/resets the row index of features
# and targets — later cells rely on that when slicing with .loc; confirm in utils.
X_train, y_train = reset_data(X_train, y_train)
X_valid, y_valid = reset_data(X_valid, y_valid)

### 0인 비율

In [36]:
# Report, for each split, the fraction of rows whose CI_HOUR target is exactly zero.
for split_label, target_frame in (("train set :", y_train), ("valid set :", y_valid)):
    zero_count = int((target_frame['CI_HOUR'] == 0).sum())
    print(split_label, zero_count / len(target_frame))

train set : 0.4020831389381357
valid set : 0.3986192882349206


In [37]:
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12,5))
# plt.subplot(1,2,1)
# sns.kdeplot(y_train)
# # sns.boxplot(y_train)
# plt.subplot(1,2,2)
# sns.kdeplot(y_valid)
# # sns.boxplot(y_valid)
# plt.show()

In [2]:
import numpy as np

# Sanity check: log1p(0) evaluates to 0.0 (see the cell output), so a zero
# input is left at zero by the log1p transform.
np.log1p(0)

0.0

In [38]:
# sns.kdeplot(np.log1p(y_train.loc[y_train.CI_HOUR != 0]))

### Binary Problem
* target값이 0인지 아닌지 분류하는 문제
* 이유 : target값이 0이 아닌 샘플만 보면, 로그변환(log1p) 후 분포가 정규분포에 근사하는 것을 확인할 수 있다.
* Method1 : logistic
* Method2 : Boosting Classification
* Method3 : svm

In [39]:
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Binarize the regression target: 0 stays 0, every non-zero CI_HOUR becomes 1.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# integer Series (values, index, name) is the same.
y_train_binary = (y_train["CI_HOUR"] != 0).astype('int')
y_valid_binary = (y_valid["CI_HOUR"] != 0).astype('int')

In [46]:
class ClassificationModel:
    """Binary classifier separating zero from non-zero ``CI_HOUR`` targets.

    The target column is binarized (0 -> 0, anything else -> 1) and either a
    logistic regression or a LightGBM classifier is fitted on it. The fitted
    classifier can then split a feature frame into the rows predicted as zero
    and the rows predicted as non-zero.
    """

    def __init__(
        self,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        X_valid: pd.DataFrame = None,
        y_valid: pd.DataFrame = None,
        classifier_name: str = 'lightgbm',
        # classifier_params:dict=None
        ):
        """Store the splits and derive the binary targets.

        Args:
            X_train: Training features.
            y_train: Training targets; must contain a ``CI_HOUR`` column.
            X_valid: Optional validation features.
            y_valid: Optional validation targets with a ``CI_HOUR`` column.
            classifier_name: ``"logistic"`` or ``"lightgbm"``.
        """
        self.X_train = X_train
        self.y_train_binary = self._binarize(y_train)
        self.X_valid = X_valid
        # y_valid defaults to None, so it must not be indexed unconditionally.
        self.y_valid_binary = self._binarize(y_valid)
        self.classifier_name = classifier_name
        # self.classifier_params = self.classifier_params

    @staticmethod
    def _binarize(y):
        """Return a 0/1 integer Series flagging non-zero ``CI_HOUR`` rows (None if ``y`` is None)."""
        if y is None:
            return None
        return (y["CI_HOUR"] != 0).astype('int')

    def fit(self):
        """Fit the configured classifier and print validation metrics.

        Metrics (accuracy, F1, precision, recall) are printed only when both
        validation features and validation targets were supplied.

        Returns:
            The fitted classifier.

        Raises:
            ValueError: If ``classifier_name`` is not a supported name.
        """
        if self.classifier_name == "logistic":
            classifier = LogisticRegression(
                random_state=0,
                class_weight='balanced',
                max_iter=100,
                multi_class='ovr',
                verbose=0
                )

        elif self.classifier_name == "lightgbm":
            # NOTE(review): class_weight='balanced' and is_unbalance=True both
            # re-weight the classes; LightGBM's documentation recommends
            # is_unbalance / scale_pos_weight for binary tasks. Confirm that
            # applying both is intended.
            classifier = LGBMClassifier(
                objective='binary',
                class_weight='balanced',
                is_unbalance=True,
                )

        else:
            # Previously an unknown name fell through and surfaced later as an
            # UnboundLocalError on `classifier`.
            raise ValueError(
                f"Unsupported classifier_name: {self.classifier_name!r} "
                "(expected 'logistic' or 'lightgbm')"
            )

        classifier.fit(self.X_train, self.y_train_binary)

        if self.X_valid is not None and self.y_valid_binary is not None:
            y_pred_binary = classifier.predict(self.X_valid)
            print(f"    ##{self.classifier_name}##")
            print("accuracy : ", accuracy_score(y_true=self.y_valid_binary, y_pred=y_pred_binary))
            print("f1 : ", f1_score(y_true=self.y_valid_binary, y_pred=y_pred_binary))
            print("precision : ", precision_score(y_true=self.y_valid_binary, y_pred=y_pred_binary))
            print("recall : ", recall_score(y_true=self.y_valid_binary, y_pred=y_pred_binary))

        return classifier

    def output_index(self, classifier: object, df: pd.DataFrame):
        """Split the rows of ``df`` by the classifier's zero / non-zero prediction.

        Args:
            classifier: Fitted object exposing ``predict(df)``.
            df: Features to classify.

        Returns:
            ``(zero_index, none_zero_index)``: index labels of the rows
            predicted as 0 and of the rows predicted as non-zero.
        """
        # Carry over df's own index so the returned labels are valid for
        # df.loc[...] even when df does not have a fresh RangeIndex. Inputs
        # without an index (e.g. arrays) fall back to positional labels.
        binary_target_pred = pd.Series(
            classifier.predict(df),
            index=getattr(df, "index", None),
        )
        print("Length of None Zero Target : ", binary_target_pred.sum())
        zero_index = binary_target_pred.loc[binary_target_pred == 0].index
        none_zero_index = binary_target_pred.loc[binary_target_pred != 0].index

        return zero_index, none_zero_index

    def after_split_by_classifier(self, classifier: object):
        """Split the stored train/validation features by predicted class.

        Args:
            classifier: Fitted object exposing ``predict(df)``.

        Returns:
            ``(X_train_zero, X_train_none_zero, X_valid_zero, X_valid_none_zero)``.
            The two validation entries are ``None`` when no validation
            features were supplied.
        """
        train_zero_index, train_none_zero_index = self.output_index(classifier=classifier, df=self.X_train)
        X_train_zero = self.X_train.loc[train_zero_index, :]
        X_train_none_zero = self.X_train.loc[train_none_zero_index, :]

        if self.X_valid is None:
            return X_train_zero, X_train_none_zero, None, None

        valid_zero_index, valid_none_zero_index = self.output_index(classifier=classifier, df=self.X_valid)
        X_valid_zero = self.X_valid.loc[valid_zero_index, :]
        X_valid_none_zero = self.X_valid.loc[valid_none_zero_index, :]

        # The original computed these frames but never returned them.
        return X_train_zero, X_train_none_zero, X_valid_zero, X_valid_none_zero

In [47]:
# Fit the zero / non-zero classifier on the training split; validation metrics
# are printed by fit().
extractor = ClassificationModel(
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    classifier_name='lightgbm',
    )
classifier = extractor.fit()

# Indices of the rows predicted as zero / non-zero, per split.
train_zero_index, train_none_zero_index = extractor.output_index(classifier=classifier, df=X_train)
valid_zero_index, valid_none_zero_index = extractor.output_index(classifier=classifier, df=X_valid)

X_train_zero = X_train.loc[train_zero_index, :]
X_train_none_zero = X_train.loc[train_none_zero_index, :]
# Validation subsets are sliced from X_valid with the validation indices;
# slicing X_train with the train indices here would silently duplicate the
# training subsets under validation names.
X_valid_zero = X_valid.loc[valid_zero_index, :]
X_valid_none_zero = X_valid.loc[valid_none_zero_index, :]

    ##lightgbm##
accuracy :  0.9999274264512442
f1 :  0.9999396581634963
precision :  0.9999849138581299
recall :  0.9998944065648947
Length of None Zero Target :  153789
Length of None Zero Target :  66286


In [53]:
# Scratch check of label-based assignment: zero out the entries whose index
# label is 1, 3, 4 or 5 and leave every other entry untouched.
test = pd.Series([0, 1, 3, 4, 5, 12, 0, 2, 3, 4, 0])
test.loc[test.index.isin([1, 3, 4, 5])] = 0

In [54]:
# Display the Series to inspect the result of the label-based assignment.
test

0     0
1     0
2     3
3     0
4     0
5     0
6     0
7     2
8     3
9     4
10    0
dtype: int64