In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler,Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [31]:
df = pd.read_csv('./datasets/1.EDA완료.csv', encoding="utf-8-sig", index_col=0)
df

Unnamed: 0,category,amt,gender,is_fraud,distance,age,recency,city_pop_category,trans_hour_category,trans_month_category,region
0,misc_net,4.97,F,0,48.838332,31,0.0,소도시,night,상반기,south
1,grocery_pos,107.23,F,0,18.773002,41,0.0,소도시,night,상반기,west
2,entertainment,220.11,M,0,67.236235,57,0.0,소도시,night,상반기,west
3,gas_transport,45.00,M,0,59.448672,52,0.0,소도시,night,상반기,west
4,misc_pos,41.96,M,0,48.191593,33,0.0,소도시,night,상반기,south
...,...,...,...,...,...,...,...,...,...,...,...
1296670,entertainment,15.56,M,0,74.410630,59,16781.0,소도시,day,상반기,west
1296671,food_dining,51.70,M,0,46.667580,41,7962.0,소도시,day,상반기,south
1296672,food_dining,105.93,M,0,61.545493,53,29074.0,소도시,day,상반기,west
1296673,food_dining,74.90,M,0,52.585258,40,91018.0,소도시,day,상반기,mid_west


In [32]:
col = ['region','trans_month_category','trans_hour_category','city_pop_category','gender','category']
df_dummy = pd.get_dummies(df[col])
df.drop(col, axis=1, inplace=True, errors='ignore')
df = pd.concat([df, df_dummy],axis=1)
mapping = {'trans_category_기타': '기타거래','trans_category_온라인거래':'온라인거래','trans_category_오프라인거래':'오프라인거래',
           'trans_month_category_상반기':'상반기','trans_month_category_하반기':'하반기','trans_hour_category_day':'주간',
           'trans_hour_category_night':'야간','city_pop_category_대도시':'대도시','city_pop_category_중소도시':'중소도시',
           'city_pop_category_소도시':'소도시','gender_F':'남성','gender_M':'여성','amt':'거래량','distance':'카드거래발생거리',
           'age':'실소유자나이','recency':'카드거래간시간','region_mid_west':'중서부','region_north_east':'북동부','region_south':'남부',
           'region_west':'서부','is_fraud':'이상거래'}
df.rename(columns = mapping, inplace=True)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296675 entries, 0 to 1296674
Data columns (total 32 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   거래량                      1296675 non-null  float64
 1   이상거래                     1296675 non-null  int64  
 2   카드거래발생거리                 1296675 non-null  float64
 3   실소유자나이                   1296675 non-null  int64  
 4   카드거래간시간                  1296675 non-null  float64
 5   중서부                      1296675 non-null  uint8  
 6   북동부                      1296675 non-null  uint8  
 7   남부                       1296675 non-null  uint8  
 8   서부                       1296675 non-null  uint8  
 9   상반기                      1296675 non-null  uint8  
 10  하반기                      1296675 non-null  uint8  
 11  주간                       1296675 non-null  uint8  
 12  야간                       1296675 non-null  uint8  
 13  대도시                      1296675 non-null 

In [34]:
df.columns

Index(['거래량', '이상거래', '카드거래발생거리', '실소유자나이', '카드거래간시간', '중서부', '북동부', '남부',
       '서부', '상반기', '하반기', '주간', '야간', '대도시', '소도시', '중소도시', '남성', '여성',
       'category_entertainment', 'category_food_dining',
       'category_gas_transport', 'category_grocery_net',
       'category_grocery_pos', 'category_health_fitness', 'category_home',
       'category_kids_pets', 'category_misc_net', 'category_misc_pos',
       'category_personal_care', 'category_shopping_net',
       'category_shopping_pos', 'category_travel'],
      dtype='object')

---
# Train Test Split

In [35]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['이상거래'], axis=1)
y = df[['이상거래']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [36]:
y_train.value_counts()

이상거래
0       902418
1         5254
dtype: int64

In [37]:
y_test.value_counts()

이상거래
0       386751
1         2252
dtype: int64

---
# Train Data

---
# UnderSampling ( RamdomUnderSampling )

In [38]:
# 언더샘플링 ( Train Data )
undersample = RandomUnderSampler(sampling_strategy=0.1,random_state=0)
x_under, y_under = undersample.fit_resample(X_train, y_train)
df_under = pd.concat([x_under,y_under],axis=1)
df_under['이상거래'].value_counts()

0    52540
1     5254
Name: 이상거래, dtype: int64

---
# OverSampling ( SMOTE )

In [39]:
# 오버샘플링 ( Train Data )
oversample = SMOTE(sampling_strategy = 1, random_state=0, n_jobs=-1)
x_over, y_over = oversample.fit_resample(x_under, y_under)
train_df = pd.concat([x_over,y_over],axis=1)
train_df['이상거래'].value_counts()

0    52540
1    52540
Name: 이상거래, dtype: int64

---
# Test Data

---
# UnderSampling ( RamdomUnderSampling )

In [40]:
# 언더샘플링 ( Test Data )
undersample = RandomUnderSampler(sampling_strategy=0.1,random_state=0)
x_under, y_under = undersample.fit_resample(X_test, y_test)
test_df = pd.concat([x_under,y_under],axis=1)
test_df['이상거래'].value_counts()

0    22520
1     2252
Name: 이상거래, dtype: int64

In [41]:
train_df

Unnamed: 0,거래량,카드거래발생거리,실소유자나이,카드거래간시간,중서부,북동부,남부,서부,상반기,하반기,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,이상거래
0,98.030000,35.110218,32,353251.000000,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,9.230000,71.272596,47,17002.000000,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,57.670000,20.339990,90,4033.000000,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,2.770000,42.889090,43,120903.000000,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,26.130000,48.883946,29,133656.000000,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105075,329.963305,43.388465,52,7199.880482,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
105076,922.658403,46.196296,19,1766.558505,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
105077,651.097475,54.684640,34,2194.320669,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
105078,883.172145,37.250699,32,2122.498433,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [42]:
test_df

Unnamed: 0,거래량,카드거래발생거리,실소유자나이,카드거래간시간,중서부,북동부,남부,서부,상반기,하반기,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,이상거래
0,129.95,50.075922,33,3772.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,83.44,21.083213,63,40019.0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1.79,78.422702,32,26329.0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,121.19,36.466010,61,53956.0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
4,102.44,70.117999,34,3054.0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24767,293.92,66.104965,71,4517.0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
24768,1161.58,63.375722,69,69551.0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
24769,283.19,14.978807,18,1405.0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
24770,790.45,39.574304,56,42186.0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1


In [43]:
train_df.to_csv('./datasets/train_data(Sampling완료).csv')
test_df.to_csv('./datasets/test_data(Sampling완료).csv')