### 언더 샘플링

In [1]:
#불균형 데이터(weights=[0.99] - 1:99, 0:1의 비율)
import pandas as pd
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, 
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy], axis=1)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,1.347439,1.412824,0
2,0.537238,0.372730,0
3,2.134462,1.404819,0
4,2.315827,1.356858,0
...,...,...,...
9995,2.440385,1.695643,0
9996,-0.790502,0.194243,0
9997,1.878130,0.829500,0
9998,2.585933,1.927995,0


In [2]:
X1 = df[['a', 'b']]
y1 = df['y']

In [3]:
df['y'].value_counts() # 불균형 데이터셋

0    9900
1     100
Name: y, dtype: int64

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=10)

In [5]:
#불균형 데이터셋으로 만든 모형
model1 = LogisticRegression(random_state=0)
model1.fit(X_train, y_train)
print('학습용:', model1.score(X_train, y_train))
print('검증용:', model1.score(X_test, y_test))

학습용: 0.994125
검증용: 0.995


In [6]:
pred1 = model1.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred1)
cm
# score는 높으나 recall의 경우 10/(10+10) = 0.5로 낮아지는 문제 발생

array([[1980,    0],
       [  10,   10]], dtype=int64)

In [7]:
#소수 클래스의 정확도와 precision, precision,recall,f1-score 확인
from sklearn.metrics import classification_report
print(classification_report(y_test, pred1))
# 모형의 전반적인 정확도(accuracy)는 높지만 소수 클래스의 재현율(recall)이 0.5로 낮은 문제점

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1980
           1       1.00      0.50      0.67        20

    accuracy                           0.99      2000
   macro avg       1.00      0.75      0.83      2000
weighted avg       1.00      0.99      0.99      2000



In [8]:
#균형데이터
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, flip_y=0, random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df2 = pd.concat([dfX, dfy], axis=1)
df2['y'].value_counts()

0    5000
1    5000
Name: y, dtype: int64

In [9]:
X2 = df2[['a', 'b']]
y2 = df2['y']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=10)

In [11]:
model2 = LogisticRegression(random_state=0)
model2.fit(X_train, y_train)
print('학습용:', model2.score(X_train, y_train))
print('검증용:', model2.score(X_test, y_test))

학습용: 0.896125
검증용: 0.891


In [12]:
#정확도와 precision, precision,recall,f1-score 확인
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))
# 정확도와 재현율이 비슷한 결과

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1000
           1       0.90      0.87      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [13]:
#불균형 데이터(weights=[0.99] - 1:99, 0:1의 비율)
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, 
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy], axis=1)

In [14]:
#무작위로 다수 클래스의 데이터를 없애는 단순 샘플링
from imblearn.under_sampling import RandomUnderSampler
X_sample, y_sample = RandomUnderSampler(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()

0    100
1    100
Name: y, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)

In [16]:
model3 = LogisticRegression(random_state=42)
model3.fit(X_train, y_train)
print('학습용:', model3.score(X_train, y_train))
print('검증용:', model3.score(X_test, y_test))

학습용: 0.8625
검증용: 0.925


  y = column_or_1d(y, warn=True)


In [17]:
pred3 = model3.predict(X_test)
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92        20
           1       0.90      0.95      0.93        20

    accuracy                           0.93        40
   macro avg       0.93      0.93      0.92        40
weighted avg       0.93      0.93      0.92        40



In [18]:
#토멕링크(Tomek's link): 다수 클래스에 속한 샘플을 제거해서 데이터 균형을 맞춤
# 비슷한 차이일때는 효과가 좋으나 한쪽으로 심하게 쏠린경우 효과가 적음
from imblearn.under_sampling import TomekLinks
X_sample, y_sample = TomekLinks(sampling_strategy='majority').fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()
# 'majority': 다수 클래스의 샘플을 제거
# 'not minority': 소수 클래스를 제외하고 샘플링
# 'not majority': 다수 클래스를 제외하고 샘플링
# 'all': 모든 클래스를 샘플링
# 'auto': not minority와 같음(기본 옵션)

0    9874
1     100
Name: y, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)

In [20]:
model4 = LogisticRegression(random_state=42)
model4.fit(X_train, y_train)
print('학습용:', model4.score(X_train, y_train))
print('검증용:', model4.score(X_test, y_test))

학습용: 0.9942348665246271
검증용: 0.9959899749373433


  y = column_or_1d(y, warn=True)


In [21]:
pred4 = model4.predict(X_test)
print(classification_report(y_test, pred4))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1975
           1       1.00      0.60      0.75        20

    accuracy                           1.00      1995
   macro avg       1.00      0.80      0.87      1995
weighted avg       1.00      1.00      1.00      1995



### 오버샘플링

In [22]:
import pandas as pd
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, 
                           n_clusters_per_class=1, weights=[0.99], flip_y=0,
                           random_state=1)
dfX = pd.DataFrame(X, columns=['a', 'b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy], axis=1)
df.y.value_counts()

0    9900
1     100
Name: y, dtype: int64

In [23]:
from imblearn.over_sampling import RandomOverSampler
X_sample, y_sample = RandomOverSampler(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()

0    9900
1    9900
Name: y, dtype: int64

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)

In [25]:
model1 = LogisticRegression(random_state=42)
model1.fit(X_train, y_train)
print('학습용:', model1.score(X_train, y_train))
print('검증용:', model1.score(X_test, y_test))

학습용: 0.8955176767676768
검증용: 0.8987373737373737


  y = column_or_1d(y, warn=True)


In [26]:
from sklearn.metrics import classification_report
pred1 = model1.predict(X_test)
print(classification_report(y_test, pred1))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1980
           1       0.91      0.88      0.90      1980

    accuracy                           0.90      3960
   macro avg       0.90      0.90      0.90      3960
weighted avg       0.90      0.90      0.90      3960



In [27]:
#SMOTE: 소수 클래스의 샘플을 주변의 이웃을 고려해 약간씩 이동시킨 포인트들을 추가하는 방식
from imblearn.over_sampling import SMOTE
X_sample, y_sample = SMOTE(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a', 'b'])
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()

0    9900
1    9900
Name: y, dtype: int64

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)

In [29]:
model2 = LogisticRegression(random_state=42)
model2.fit(X_train, y_train)
print('학습용:', model2.score(X_train, y_train))
print('검증용:', model2.score(X_test, y_test))

학습용: 0.9096590909090909
검증용: 0.9085858585858586


  y = column_or_1d(y, warn=True)


In [30]:
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1980
           1       0.92      0.89      0.91      1980

    accuracy                           0.91      3960
   macro avg       0.91      0.91      0.91      3960
weighted avg       0.91      0.91      0.91      3960

