In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
import imblearn

# 전처리

In [None]:
# Load the raw training data; the CSV is decoded with the cp949 codec.
df_org = pd.read_csv('train.csv', encoding='cp949')

In [None]:
# Preview the first five rows of the raw frame.
df_org.head(5)

In [None]:
# Remove the columns that are not carried forward into the feature set.
dropped_columns = ['검사결과코드', '핵심적발', '신고인부호', '신고번호', '원산지국가코드']
df_org = df_org.drop(columns=dropped_columns)

In [None]:
# Replace every missing value with the placeholder string '없음'.
# NOTE(review): no column subset is given, so a NaN in a numeric column would also be
# replaced by this string and the column would stop being numeric — confirm that the
# numeric columns scaled later contain no missing values.
df_org = df_org.fillna('없음')

In [None]:
# Columns that are treated as categorical codes throughout the notebook.
discrete_columns = [
    '신고일자', '통관지세관부호', '수입자부호', '해외거래처부호', '특송업체부호',
    '수입통관계획코드', '수입신고구분코드', '수입거래구분코드', '수입종류코드',
    '징수형태코드', '운송수단유형코드', '반입보세구역부호', 'HS10단위부호',
    '적출국가코드', '관세율구분코드',
]
# Cast all categorical columns to str in a single vectorised call.
df_org[discrete_columns] = df_org[discrete_columns].astype(str)

In [None]:
# Names of the numeric columns.
# NOTE(review): the scaled* columns are only created by the scaling cell that follows, and
# '우범여부' is later split off as the prediction target — confirm it belongs in this list.
numeric_columns = ['scaledKG', 'scaledAmmout', 'scaledRate', '우범여부']


In [None]:
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Raw column -> name of its robust-scaled replacement, listed in the order in which the
# new columns are appended to the frame.
scaled_name_by_raw = {
    '신고중량(KG)': 'scaledKG',
    '과세가격원화금액': 'scaledAmmout',
    '관세율': 'scaledRate',
}
for raw_name, scaled_name in scaled_name_by_raw.items():
    # The scaler expects a 2-D array, hence the single-column reshape.
    raw_values = df_org[raw_name].values.reshape(-1, 1)
    df_org[scaled_name] = rob_scaler.fit_transform(raw_values)
    # The raw column is replaced by its scaled counterpart.
    df_org = df_org.drop(columns=[raw_name])

In [None]:
# Preview the frame after scaling.
df_org.head(10)

In [None]:
# (rows, columns) of the frame after scaling.
df_org.shape

In [None]:
# Integer-encode every categorical column. Codes start at 1 and follow the order in which
# each distinct value first appears; the per-column lookup is kept in label_encoding_ref.
label_encoding_ref = {}
for column in discrete_columns:
    code_lookup = {}
    for position, code in enumerate(df_org[column].unique(), start=1):
        code_lookup[code] = position
    label_encoding_ref[column] = code_lookup
    print(code_lookup)
    df_org[column] = df_org[column].map(code_lookup)

In [None]:
# Display the label-encoded frame.
df_org

In [None]:
# For each label-encoded column, collapse the listed codes into one catch-all code.
# The original cell repeated the same loop per column (and the '수입종류코드' stanza twice);
# a single table keeps the rules in one place and applies each of them once.
# NOTE(review): the codes are the integers assigned by the label-encoding cell
# (first-appearance order), so these lists are tied to the row order of the input file.
# NOTE(review): a column with 99 or more distinct values already has a genuine code 99,
# which would silently join the catch-all group — confirm the cardinalities.
MERGED_CODE = 99
codes_to_merge = {
    '통관지세관부호': [4, 3, 2, 5, 11, 14, 8],
    '특송업체부호': [2, 12, 3, 18, 1, 14, 4, 7, 36, 46, 43],
    '수입통관계획코드': [1, 2, 3, 4, 5, 7],
    '수입신고구분코드': [1, 3],
    '수입거래구분코드': [2, 1, 3, 4, 5],
    '수입종류코드': [2, 4],
    '징수형태코드': [2, 1, 3, 4, 5, 6],
    '운송수단유형코드': [1, 2, 3, 4],
}
for column, codes in codes_to_merge.items():
    df_org.loc[df_org[column].isin(codes), column] = MERGED_CODE

In [None]:
# Partition the rows by the value of the target column (1 vs 0) and show both shapes.
target_is_one = df_org['우범여부'] == 1
target_is_zero = df_org['우범여부'] == 0
df_org_target1 = df_org.loc[target_is_one]
df_org_target0 = df_org.loc[target_is_zero]
df_org_target1.shape, df_org_target0.shape

In [None]:
# For every column, print one row per distinct value with:
#   unique - the value itself
#   rate   - (# target==1 rows with this value) / (# target==0 rows with this value)
#   counts - total number of rows with this value
#   result - whether the value satisfies the grouping rule selected below
#
# The three count series are aligned on the value itself before being combined. Each
# value_counts() result is sorted by its own frequencies, so combining them by position
# would pair counts that belong to different values.
for col in df_org.columns:
    print("column : ", col)
    total_counts = df_org[col].value_counts()
    target1_counts = df_org_target1[col].value_counts().reindex(total_counts.index, fill_value=0)
    target0_counts = df_org_target0[col].value_counts().reindex(total_counts.index, fill_value=0)

    div = pd.DataFrame({
        'unique': total_counts.index,
        # A value with no target==0 rows yields inf (or NaN when it has no target==1 rows either).
        'rate': (target1_counts / target0_counts).to_numpy(),
        'counts': total_counts.to_numpy(),
    })

    ## Grouping-rule parameters

    # Rule for low-volume values: at least as many target==1 rows as target==0 rows.
    small = div['rate'] >= 1

    # Rule for high-volume values: more than 1000 rows and a ratio of at least 0.4.
    # Kept for inspection only; the 'result' column below reports the `small` rule.
    big = (div['counts'] > 1000) & (div['rate'] >= 0.4)

    div['result'] = small

    print(div)
    print("###########################")
    print("##########END#############")

In [None]:
# Count the remaining missing values per column.
df_org.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold

In [None]:
# Positional 80:20 split: the first 80% of the rows form the training set, the rest the test set.
# NOTE(review): this is a split "by date" only if the rows are already in date order —
# no sort is performed in the visible cells.
split_position = int(len(df_org) * 0.8)
df_org_train = df_org.iloc[:split_position, :]
df_org_test = df_org.iloc[split_position:, :]

In [None]:
# First row of the training split.
df_org_train.head(1)

In [None]:
# First row of the test split.
df_org_test.head(1)

In [None]:
from sklearn.preprocessing import  OneHotEncoder
# NOTE(review): this encoder is not referenced again in the visible cells; the one-hot
# encoding that follows is done with pd.get_dummies.
xgbohe = OneHotEncoder(categories="auto", handle_unknown='ignore')

In [None]:
# One-hot encode train and test together so both end up with the same dummy columns,
# then separate them again using a temporary 'label' marker column.
# assign() builds new frames instead of writing into the iloc slices produced by the
# split cell, which avoids SettingWithCopyWarning and leaves those slices unmodified.
concat_df = pd.concat([
    df_org_train.assign(label='train'),
    df_org_test.assign(label='test'),
])

df_org_trains = pd.get_dummies(concat_df, columns=discrete_columns)

# Explicit copies: pop() below mutates these frames and must not act on a slice.
df_train = df_org_trains[df_org_trains['label'] == 'train'].copy()
df_test = df_org_trains[df_org_trains['label'] == 'test'].copy()

# Split off the prediction target ('우범여부') from the features.
org_train_y = df_train.pop('우범여부')
org_test_y = df_test.pop('우범여부')

In [None]:
# Preview the first ten test labels.
org_test_y.head(10)

In [None]:
# The temporary 'label' marker is no longer needed; remove it from both feature frames.
df_train, df_test = (frame.drop(columns=['label']) for frame in (df_train, df_test))

In [None]:
# Check the sizes of the feature frames and of the label series
print(df_train.shape, df_test.shape)
print(org_train_y.shape, org_test_y.shape)

In [None]:
from imblearn.over_sampling import SMOTE
# Resample the training split only (fixed random_state); the test split is not resampled.
# NOTE(review): df_train holds one-hot dummy columns at this point; SMOTE presumably
# interpolates between rows, which would yield fractional dummy values — confirm this is intended.
smote = SMOTE(random_state = 11)
X_train_over, y_train_over = smote.fit_resample(df_train, org_train_y)

In [None]:
# Compare the class counts of the resampled training labels and of the test labels
from collections import Counter
cnt_train = Counter(y_train_over)
cnt_test = Counter(org_test_y)

print(f"훈련  데이터 비우범건수: {cnt_train[0]}, 훈련  데이터 우범건수: {cnt_train[1]}")
print(f"테스트데이터 비우범건수: {cnt_test[0]}, 테스트데이터 우범건수: {cnt_test[1]}")

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score,roc_auc_score
import warnings
# Silences every warning raised from this point on.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression 

In [None]:
# NOTE: start again from the cells below. (This code cell originally held the bare text
# "밑에서 다시 시작", which is not valid Python and stopped a Restart & Run All.)

In [None]:
# Display the resampled training features.
X_train_over

In [None]:
def show_metrics(y_true, y_pred):
    """Print and return confusion counts, precision, recall and F1 for binary 0/1 labels.

    Added because get_model_train_eval calls show_metrics and no definition of it exists
    in the visible notebook. Uses numpy only.

    Args:
        y_true: array-like of actual labels (0 or 1).
        y_pred: array-like of predicted labels (0 or 1), same length as y_true.

    Returns:
        dict with keys 'tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f1'.
        A ratio whose denominator is zero is reported as 0.0.
    """
    actual = np.asarray(y_true).astype(int).ravel()
    predicted = np.asarray(y_pred).astype(int).ravel()
    if actual.shape != predicted.shape:
        raise ValueError("y_true and y_pred must have the same length")

    tp = int(np.sum((actual == 1) & (predicted == 1)))
    fp = int(np.sum((actual == 0) & (predicted == 1)))
    fn = int(np.sum((actual == 1) & (predicted == 0)))
    tn = int(np.sum((actual == 0) & (predicted == 0)))

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    print(f"confusion counts: tp={tp}, fp={fp}, fn={fn}, tn={tn}")
    print(f"precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}")
    return {'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn,
            'precision': precision, 'recall': recall, 'f1': f1}


def get_model_train_eval(model, ftr_train=None, ftr_test=None, tgt_train=None, tgt_test=None): 
    """Fit `model` on the training data, report metrics on the test data, return the model.

    Args:
        model: estimator exposing fit(X, y) and predict(X).
        ftr_train: training features.
        ftr_test: test features.
        tgt_train: training labels.
        tgt_test: test labels, compared against the predictions.

    Returns:
        The same `model` object, after fitting.
    """
    model.fit(ftr_train, tgt_train)
    y_pred = model.predict(ftr_test)
    show_metrics(tgt_test, y_pred)
    return model

In [None]:
from lightgbm import LGBMClassifier
# Train LightGBM on the resampled training data and evaluate it on the held-out test features.
# The test features are df_test; the original referenced df_org_tests, whose only
# assignment in the notebook is commented out, so the cell raised NameError.
lgbm = LGBMClassifier(n_estimators=1000, num_leaves=32, n_jobs=-1, boost_from_average=False)
lgbm = get_model_train_eval(lgbm, ftr_train=X_train_over, ftr_test=df_test, tgt_train=y_train_over, tgt_test=org_test_y)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a small random forest on the resampled training data to rank the features.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train_over, y_train_over)
# Feature importance, highest first. The row labels come from df_train, the frame the
# resampled features were built from; the original used an undefined name `dataset`.
feature_importances = pd.DataFrame(
    regressor.feature_importances_,
    index=df_train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

In [None]:
# NOTE: repeats the imports and the warning filter of an earlier cell.
from xgboost import XGBClassifier
from sklearn.metrics import f1_score,roc_auc_score
import warnings
# Silences every warning raised from this point on.
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression 

# 여기서부터 다시 실행

In [None]:
# Create and configure the model.
# eval_metric is set on the estimator: passing it to fit() is deprecated in recent
# xgboost releases and no longer accepted in xgboost 2.x.
xgb_clf = XGBClassifier(n_estimators=10, max_depth=4, n_jobs=-1, eval_metric=["logloss"])
# Evaluation data monitored during training. A separate validation set could be built,
# but here the test data is simply reused for monitoring.
eval_set = [(X_train_over, y_train_over), (df_test, org_test_y)]
# Train the model (training/fitting)
xgb_clf.fit(X_train_over, y_train_over, eval_set=eval_set, verbose=True)

In [None]:
# Visualise model performance
from matplotlib import pyplot
# Per-round log loss recorded for each eval_set entry:
# validation_0 is the training data, validation_1 the test data.
results = xgb_clf.evals_result()
train_logloss = results['validation_0']['logloss']
test_logloss = results['validation_1']['logloss']
epochs = len(train_logloss)
x_axis = range(epochs)
# One line per data set on a single Axes
fig, ax = pyplot.subplots()
ax.plot(x_axis, train_logloss, label='Train')
ax.plot(x_axis, test_logloss, label='Test')
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss')
#pyplot.show()
fig.savefig('XGBoost Log Loss.png')

In [None]:
# Evaluate the xgboost model
print("------Evaluating xgboost model------")
# Predicted probability of the positive class for each test row. The test features are
# df_test; the original referenced df_org_tests, which is never defined in the notebook.
test_pred = xgb_clf.predict_proba(df_test)[:,1]
# ROC AUC on the test labels
xgb_auc = roc_auc_score(org_test_y, test_pred)
print(xgb_auc)

In [None]:
# 검사율에 따른 우범예측 함수 생성

def inspection_performance(predicted_fraud, test_fraud):
    """Tabulate precision and recall at every inspection rate from 100% down to 0%.

    For each percentile i in 0..100 the i-th percentile of the scores is used as a
    threshold, and every case whose score is at or above it counts as inspected.

    Args:
        predicted_fraud: array-like of predicted scores, one per case.
        test_fraud: array-like of actual 0/1 labels in the same order as the scores.
            Both inputs are converted to plain arrays and matched by position, so a
            pandas index on either of them is ignored.

    Returns:
        DataFrame with 101 rows and the columns
        'Inspect_Rate' (100 - i), 'Precision' (mean label among inspected cases) and
        'Recall' (inspected positives / all positives; NaN when there are no positives).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    scores = np.asarray(predicted_fraud, dtype=float).ravel()
    labels = np.asarray(test_fraud, dtype=float).ravel()
    if scores.shape != labels.shape:
        raise ValueError("predicted_fraud and test_fraud must have the same length")
    if scores.size == 0:
        raise ValueError("predicted_fraud and test_fraud must not be empty")

    total_fraud = labels.sum()
    percentiles = np.arange(0, 101)
    # All thresholds in one call instead of one np.percentile call per loop step.
    thresholds = np.percentile(scores, percentiles)

    precision_values = []
    recall_values = []
    for threshold in thresholds:
        inspected_labels = labels[scores >= threshold]
        # Precision = number of frauds among inspected cases / number of inspected cases
        precision_values.append(inspected_labels.mean() if inspected_labels.size else np.nan)
        # Recall = number of inspected frauds / number of frauds
        recall_values.append(inspected_labels.sum() / total_fraud if total_fraud else np.nan)

    compiled_conf_matrix = pd.DataFrame({
        'Inspect_Rate': 100 - percentiles,
        'Precision': precision_values,
        'Recall': recall_values,
    })

    return compiled_conf_matrix

In [None]:
# Precision/recall of the predictions at each inspection rate
basic_performance = inspection_performance(test_pred, org_test_y.astype(float))

In [None]:
# Precision and recall for inspection rates of 1% to 10% (rows 99 down to 90)
# Precision (hit rate) = (positives among the cases selected for inspection) / (cases selected for inspection)
# Recall (detection rate) = (positives among the cases selected for inspection) / (all positives)
basic_performance.iloc[range(99,89,-1),:]

In [None]:
# Plot the features that contribute most to the model (top 30)
from xgboost import plot_importance
# NOTE(review): 'Malgun Gothic' is presumably chosen so that the Korean column names
# render in the plot; confirm the font is installed on the machine running the notebook.
plt.rcParams["font.family"] = 'Malgun Gothic'
plt.rcParams["figure.figsize"] = (15,10)
plot_importance(xgb_clf, max_num_features=30)
plt.show()