# 라이브러리 및 데이터

In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import missingno as msno
from sklearn.preprocessing import PowerTransformer
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import IsolationForest

# Suppress every warning category for the remainder of the notebook.
warnings.filterwarnings(action='ignore')
#os.chdir('C:/Users/kdm22/Documents/Python,R_2023/p-sat-31th-summer-vacation-seminar')
# Fix NumPy's global random seed so repeated runs draw the same numbers.
np.random.seed(seed=3031)

In [2]:
# Read the raw train/test files and drop the ID_code column right away.
train = pd.read_csv("training_set.csv").drop(columns='ID_code')
test = pd.read_csv("test_set.csv").drop(columns='ID_code')

# 파생변수 생성

## 1. 결측치 개수 파생변수 생성

In [3]:
# Derived feature: number of missing cells in each row, counted across
# every column present at this point.
train['num_nan'] = train.isna().sum(axis='columns')
test['num_nan'] = test.isna().sum(axis='columns')

In [4]:
# Remove column X58 from both frames.
train, test = (frame.drop(columns='X58') for frame in (train, test))

## 2. 결측치 비율 20% 이상 열 제거, 해당 열 대신 결측치 유무로 binary 변수 생성


In [5]:
def remove_columns_with_high_missing(train, test, missing_threshold=11000):
    """Replace heavily-missing columns with missing-value indicator columns.

    Every column of ``train`` whose number of missing values is strictly
    greater than ``missing_threshold`` is dropped from both frames. In its
    place a ``<column>_binary`` column is appended to both frames, holding 1
    where the original value was missing and 0 otherwise.

    Args:
        train: Training DataFrame. Its missing-value counts alone decide
            which columns are replaced.
        test: Test DataFrame. It must contain every column that is replaced.
        missing_threshold: Number of missing values in ``train`` above which
            a column is replaced. Defaults to 11000.

    Returns:
        A ``(train, test)`` tuple of new DataFrames. The frames passed in
        are left unmodified.

    Raises:
        KeyError: If a column selected from ``train`` is absent from ``test``.
    """
    missing_counts = train.isnull().sum()
    sparse_columns = missing_counts.index[missing_counts > missing_threshold]

    # Drop all selected columns in one pass. ``drop`` returns new frames, so
    # the indicator columns added below never touch the caller's objects.
    reduced_train = train.drop(columns=sparse_columns)
    reduced_test = test.drop(columns=sparse_columns)

    for column in sparse_columns:
        # Indicator: 1 if the original value was missing, 0 otherwise.
        reduced_train[f'{column}_binary'] = np.where(train[column].isnull(), 1, 0)
        reduced_test[f'{column}_binary'] = np.where(test[column].isnull(), 1, 0)

    return reduced_train, reduced_test

# Swap every heavily-missing column for its missingness indicator in both frames.
train, test = remove_columns_with_high_missing(train, test)

## 3. train data에서 결측치 개수가 34개(약 20%)를 초과하는 행 제거

In [6]:
def remove_rows_with_high_missing(data, missing_threshold=34):
    """Keep only the rows with at most ``missing_threshold`` missing values.

    Args:
        data: DataFrame to filter.
        missing_threshold: Largest number of missing cells a row may have
            and still be kept. Defaults to 34.

    Returns:
        The rows of ``data`` whose missing-value count does not exceed the
        threshold, with their original index labels.
    """
    missing_per_row = data.isna().sum(axis='columns')
    keep_mask = missing_per_row <= missing_threshold
    return data.loc[keep_mask]
# Only the training frame is filtered; no rows are removed from the test frame.
train = remove_rows_with_high_missing(train)

### 인덱스 초기화

In [7]:
# Renumber both frames from 0, discarding the old index labels.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

# Isolation forest로 파생변수 생성하기

In [8]:
# Separate the target column from the feature matrix.
y = train['class']
X = train.drop(columns='class')

In [9]:
# Work on an independent copy so imputation below leaves `test` untouched.
Xtest = test.copy(deep=True)

In [10]:
# Fill every missing training value with the median of its own column.
# The filled column is assigned back rather than written through chained
# indexing (`X[var][mask] = ...`): under pandas Copy-on-Write that form
# writes to a temporary object and leaves the NaNs in `X` in place.
for var in X.columns:
    if X[var].isna().any():
        X[var] = X[var].fillna(X[var].median())

In [11]:
# Fill missing test values with the median of the matching column in `X`
# (the training features), not the test column's own median.
# The filled column is assigned back rather than written through chained
# indexing (`Xtest[var][mask] = ...`): under pandas Copy-on-Write that form
# writes to a temporary object and leaves the NaNs in `Xtest` in place.
for var in Xtest.columns:
    if Xtest[var].isna().any():
        Xtest[var] = Xtest[var].fillna(X[var].median())

In [12]:
# Fit an Isolation Forest on the training features with an expected outlier
# share of 8% (contamination=0.08).
# NOTE(review): the original cell carried the bare annotation "4624",
# presumably the number of rows flagged as anomalies - confirm.
model = IsolationForest(random_state=42, contamination=0.08).fit(X)

# Both outputs are computed before any column is added, so the model only
# ever sees the columns it was fitted on.
anomaly = model.predict(X)
score = model.decision_function(X)

# Attach the two derived features, then put the target back as the last column.
X['anomaly'], X['score'], X['class'] = anomaly, score, y

In [13]:
# Score the imputed test features with the forest fitted on the training set.
score = model.decision_function(Xtest)
anomaly = model.predict(Xtest)

# NOTE(review): the derived features are attached to `test`, which has not
# been median-imputed, whereas the training features were attached to the
# imputed `X` - confirm this asymmetry is intended.
test['anomaly'], test['score'] = anomaly, score

In [16]:
# Write both frames to disk without the index column.
X.to_csv(path_or_buf='train.csv', index=False)
test.to_csv(path_or_buf='test.csv', index=False)

In [17]:
# Show the final training frame as the cell output.
X

Unnamed: 0,X1,X3,X5,X6,X7a,X7b,X7c,X7d,X7e,X7f,...,X72_binary,X73_binary,X74_binary,X75_binary,X76_binary,X77_binary,X78_binary,anomaly,score,class
0,3490,8.000000e+01,0.0,0.0,0.0,0.0,0.0,0.0,179466.0,52702.0,...,0,0,0,0,0,0,0,1,0.113947,0
1,92,1.400000e+01,0.0,0.0,0.0,0.0,0.0,4528.0,3486.0,3848.0,...,0,0,0,0,0,0,0,1,0.131765,0
2,10,1.800000e+01,4.0,6.0,0.0,0.0,0.0,0.0,224.0,1802.0,...,0,0,0,0,0,0,0,1,0.137487,0
3,156758,2.130706e+09,0.0,0.0,0.0,0.0,0.0,0.0,38538.0,2412800.0,...,0,0,0,0,0,0,0,-1,-0.003053,0
4,121048,3.352000e+03,326.0,924.0,0.0,0.0,46.0,490092.0,3101434.0,4041666.0,...,0,0,0,0,0,0,0,-1,-0.108592,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52158,69628,3.320000e+02,0.0,0.0,0.0,0.0,0.0,0.0,44998.0,1580944.0,...,0,0,0,0,0,0,0,1,0.101496,0
52159,23910,8.680000e+02,0.0,0.0,0.0,5932.0,217980.0,1155898.0,1349820.0,224616.0,...,0,0,0,0,0,0,0,1,0.001940,0
52160,14,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,728.0,4008.0,...,0,0,0,0,0,0,0,1,0.137923,0
52161,437486,1.540000e+02,0.0,0.0,0.0,96944.0,981638.0,2445458.0,4387124.0,4934430.0,...,0,0,0,0,0,0,0,-1,-0.173980,1
