# 패키지들은 연습 문제를 풀며 연습하기 위해 별도 로드 X

# Question 1.
- 서비스 이탈예측 데이터.
- 2 유형 환경과 일치시키기 위해 y_train과 x_train을 하나의 train 데이터 셋으로 병합하여 진행.

## 데이터 병합 및 확인 단계

In [None]:
import pandas as pd

x_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv')

train = pd.concat([x_train, y_train['Exited']], axis = 1)
display(train.info())
display(x_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       6499 non-null   int64  
 1   Surname          6499 non-null   object 
 2   CreditScore      6499 non-null   int64  
 3   Geography        6499 non-null   object 
 4   Gender           6499 non-null   object 
 5   Age              6499 non-null   int64  
 6   Tenure           6499 non-null   int64  
 7   Balance          6499 non-null   float64
 8   NumOfProducts    6499 non-null   int64  
 9   HasCrCard        6499 non-null   int64  
 10  IsActiveMember   6499 non-null   int64  
 11  EstimatedSalary  6499 non-null   float64
 12  Exited           6499 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 660.2+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3501 entries, 0 to 3500
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       3501 non-null   int64  
 1   Surname          3501 non-null   object 
 2   CreditScore      3501 non-null   int64  
 3   Geography        3501 non-null   object 
 4   Gender           3501 non-null   object 
 5   Age              3501 non-null   int64  
 6   Tenure           3501 non-null   int64  
 7   Balance          3501 non-null   float64
 8   NumOfProducts    3501 non-null   int64  
 9   HasCrCard        3501 non-null   int64  
 10  IsActiveMember   3501 non-null   int64  
 11  EstimatedSalary  3501 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 328.3+ KB


None

In [None]:
# 1. 데이터의 통계량 확인
display(train.describe(include = ['object', 'category']))
display(train.describe())

# 결과 변수
#   - count: 결측치를 제외한 전체 데이터의 수 (행의 수와 일치).
#   - unique: 서로 다른 범주의 수 (범주 종류의 수).
#   - top: 최빈 범주
#   - freq: 최빈 범주의 개수.

Unnamed: 0,Surname,Geography,Gender
count,6499,6499,6499
unique,2289,3,4
top,Brown,France,Male
freq,21,3227,3485


Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,6499.0,6499.0,6499.0,6499.0,6499.0,6499.0,6499.0,6499.0,6499.0,6499.0
mean,15691570.0,650.39683,38.95707,5.041545,76836.581068,1.519772,0.708878,0.514387,100346.564524,0.203724
std,71875.84,96.618957,10.502803,2.891779,62407.570894,0.578975,0.454314,0.499831,57944.655305,0.402797
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15629490.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,50907.565,0.0
50%,15691810.0,651.0,37.0,5.0,97560.16,1.0,1.0,1.0,100496.84,0.0
75%,15753580.0,718.0,44.0,8.0,127844.69,2.0,1.0,1.0,150480.155,0.0
max,15815660.0,850.0,92.0,10.0,238387.56,4.0,1.0,1.0,199970.74,1.0


## 변수 선택 단계

In [49]:
# Surname과 CustomerId는 제외
#   - Surname: 범주의 종류가 2289개로 의미없는 정보임.
#   - CustomerId: Id 컬럼이기에 제외
train_df = train.drop(columns = ['Surname', 'CustomerId'])
x_test_df = x_test.drop(columns = ['Surname', 'CustomerId'])

## Encoding 단계

In [58]:
# 범주형 변수만 따로 추출
target = train.drop(columns = 'Exited')
cat_lst = target.select_dtypes(exclude = 'number').columns.tolist()
cat_lst

# 원-핫 인코딩 적용
#   - drop_first = False
#       - False: 기준 범주 없이 모든 범주를 0과 1로 encoding 할 때 사용한다. 이는 기준 범주 없이 개별 범주로 처리하기 때문.
#       - True: 통계모델에서 기준 범주를 설정할 때 사용한다. 통계에서는 기준 범주를 기준으로하여 변화량을 측정하기 때문. (이 경우 기준범주가 0이 된다).
train_encoded = pd.get_dummies(train, columns = cat_lst, drop_first = False)
x_test_encoded = pd.get_dummies(x_test, columns = cat_lst, drop_first = False)

## 데이터를 분리하고 학습결과 확인.

In [70]:
from sklearn.model_selection import train_test_split

# train_encoded를 설명변수와 결과변수로 분류
X = train_encoded.drop(columns = 'Exited')
y = train_encoded['Exited']

# train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

## 학습

In [71]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 23)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 결과 지표들 확인

In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# 1. 예측
val_pred_label = rf.predict(X_val)               # 클래스 예측
val_pred_proba = rf.predict_proba(X_val)[:, 1]  # 양성 클래스 확률 (AUC에 사용)

# 2. 각 지표 출력
print("📊 모델 성능 평가 결과:")
print(f"Accuracy : {accuracy_score(y_val, val_pred_label):.4f}")
print(f"Precision: {precision_score(y_val, val_pred_label):.4f}")
print(f"Recall   : {recall_score(y_val, val_pred_label):.4f}")
print(f"F1-score : {f1_score(y_val, val_pred_label):.4f}")
print(f"AUC-ROC  : {roc_auc_score(y_val, val_pred_proba):.4f}")

# 3. 추가 보고서
print("\n📋 Classification Report:")
print(classification_report(y_val, val_pred_label))

# 4. 혼동행렬
print("🧮 Confusion Matrix:")
print(confusion_matrix(y_val, val_pred_label))


📊 모델 성능 평가 결과:
Accuracy : 0.8554
Precision: 0.9021
Recall   : 0.3249
F1-score : 0.4778
AUC-ROC  : 0.8302

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      1553
           1       0.90      0.32      0.48       397

    accuracy                           0.86      1950
   macro avg       0.88      0.66      0.70      1950
weighted avg       0.86      0.86      0.83      1950

🧮 Confusion Matrix:
[[1539   14]
 [ 268  129]]


## test 데이터에 적용하여 csv파일 생성.

In [None]:
test_pred_label = rf.predict(x_test_encoded)
test_pred_proba = rf.predict_proba

# ID 생성
pd.DataFrame({'pred' : test_pred_label}).to_csv('수험번호.csv', index=True)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Gender
- Geography
- Surname
Feature names seen at fit time, yet now missing:
- Gender_ male
- Gender_Female
- Gender_Male
- Gender_female
- Geography_France
- ...


In [14]:
# 1. 결과변수가 무엇인지 확인
temp = train.columns[~train.columns.isin(test.columns)]
print(temp)

Index(['Exited'], dtype='object')
