# 빅데이터 분석기사 실기 기출 4회 작업형 2 
[캐글 경진대회 링크](https://www.kaggle.com/competitions/big-data-analytics-certification/overview)  
[캐글 노트북 공유 링크](https://www.kaggle.com/code/minjunim/3-lgbm-roc-0-82) 

## [마케팅] 자동차 시장 세분화
- 자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화했습니다.
- 기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해주세요!
- 예측할 값(y): "Segmentation" (1,2,3,4)
- 평가: Macro f1-score
- data: train.csv, test.csv

제출 형식
```
ID,Segmentation
458989,1
458994,2
459000,3
459003,4
```

## 시험 환경처럼 진행

In [27]:
import pandas as pd
import numpy as np 

# Load the competition data (paths are relative to the working directory).
train = pd.read_csv("./Data/train.csv")
test = pd.read_csv("./Data/test.csv")

# Quick look at each dataset: its shape first, then the first five rows.
for frame in (train, test):
    print(frame.shape, "\n")
    print(frame.head(), "\n")

# 1. EDA
# NOTE: DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".

# 1-1) train EDA
train.info()
print("\n")
print(train.isnull().sum(), "\n")     # original author's note: no missing values in train or test
print(train.describe(), "\n")

# 1-2) test EDA
test.info()
print("\n")
print(test.isnull().sum(), "\n")
print(test.describe(), "\n")

# 1-3) summary (count / unique / top / freq) of the object-dtype columns
print(train.describe(include="O"), "\n")     # original author's note: train and test look the same
print(test.describe(include="O"), "\n")



# 2. Preprocessing
# 2-1) Missing values, unneeded columns such as the ID, and the X / y split
# No imputation is performed (the original author noted no missing values).

# Drop the ID from train; it is treated as an unnecessary column.
train = train.drop(columns="ID")

# Take the ID out of test but keep it: the submission file is built from it.
test_ID = test.pop("ID")

# Separate the target (y) from the features; train now holds features only.
y = train.pop("Segmentation")

# Preview the first three rows of the features, the test set and the target.
for preview in (train, test, y):
    print(preview.head(3), "\n")



# 2-2) Scaling
# Every column whose dtype is not object is treated as continuous.
con_cols = train.select_dtypes(exclude="object").columns
print("연속형 칼럼 : ", con_cols)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on train only, then apply that same fitted transform to
# both train and test.
scaler = StandardScaler()
scaler.fit(train[con_cols])

train[con_cols] = scaler.transform(train[con_cols])
test[con_cols] = scaler.transform(test[con_cols])

# 2-3) Encoding
from sklearn.preprocessing import LabelEncoder

# Select the categorical (object-dtype) columns.
cat_cols = train.select_dtypes(include="object").columns
print("범주형 칼럼 : ", cat_cols)

for col in cat_cols:
    le = LabelEncoder()
    # Fit on the values of train and test together. LabelEncoder.transform
    # raises ValueError for a label it did not see during fit, so fitting on
    # train alone breaks as soon as test contains an extra category. When both
    # sets hold the same categories the resulting codes are unchanged, because
    # the encoder orders its classes by sorted unique value.
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

print(train.head(3), "\n")
print(test.head(3), "\n")



# 3. Hold out a validation set
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
split_parts = train_test_split(
    train,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)
X_train, X_val, y_train, y_val = split_parts
print(X_train.shape, X_val.shape, "\n")



# 4. Modelling: multi-class classification
# Metric: macro F1-score, measured on the validation split.
from sklearn.metrics import classification_report, f1_score

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=2023).fit(X_train, y_train)
rf_pred = rf.predict(X_val)
rf_f1 = f1_score(y_val, rf_pred, average="macro")
print("랜덤포레스트 macro f1 : ", rf_f1)

# LightGBM classifier
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(random_state=2023).fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_val)
lgbm_f1 = f1_score(y_val, lgbm_pred, average="macro")
# Per-class precision / recall / F1 is printed for LightGBM only.
print(classification_report(y_val, lgbm_pred))
print("LGBM macro f1 : ", lgbm_f1)


# Final model: LightGBM. Predict the segment for every test row.
pred = lgbm.predict(test)

# 5. Submission: build the DataFrame and write it as CSV
submission_file = "4_submission.csv"
submit = pd.DataFrame({"ID": test_ID, "Segmentation": pred})
submit.to_csv(submission_file, index=False)

# Read the file back and show its first rows as a check of what was written.
check = pd.read_csv(submission_file)
print(check.head(3))




# # 하이퍼 파라미터 튜닝
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer, f1_score

# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(random_state=2023)

# from lightgbm import LGBMClassifier
# lgbm = LGBMClassifier(random_state=2023)

# models = [rf, lgbm]

# params = {"max_depth" : [1, 2, 3], "n_estimators" : [100, 200, 300]}

# scorer = make_scorer(f1_score, average='macro')

# for model in models :
# 	gs = GridSearchCV(model, param_grid = params, cv=5, scoring=scorer, n_jobs=4)
# 	gs.fit(X_train, y_train)
	
# 	print(f"="*20)
# 	print(f"model : {model}")
# 	print(f"params : {gs.best_params_}")
# 	print(f"score : {gs.best_score_}")
	

(6665, 11) 

       ID  Gender Ever_Married  Age Graduated  Profession  Work_Experience  \
0  462809    Male           No   22        No  Healthcare              1.0   
1  466315  Female          Yes   67       Yes    Engineer              1.0   
2  461735    Male          Yes   67       Yes      Lawyer              0.0   
3  461319    Male          Yes   56        No      Artist              0.0   
4  460156    Male           No   32       Yes  Healthcare              1.0   

  Spending_Score  Family_Size  Var_1  Segmentation  
0            Low          4.0  Cat_4             4  
1            Low          1.0  Cat_6             2  
2           High          2.0  Cat_6             2  
3        Average          2.0  Cat_6             3  
4            Low          3.0  Cat_6             3   

(2154, 10) 

       ID  Gender Ever_Married  Age Graduated  Profession  Work_Experience  \
0  458989  Female          Yes   36       Yes    Engineer              0.0   
1  458994    Male          Ye

## 성능 비교 (검증 데이터 20% 기준, macro f1-score)
```
rf = RandomForestClassifier(random_state=2023)
랜덤포레스트 f1 :  0.48229512478013314

lgbm = LGBMClassifier(random_state=2023)
LGBM f1 :  0.5143631627306765
```

---

### 그리드 서치 (X_train 5-fold 교차검증, macro f1-score)
```
====================
model : RandomForestClassifier(random_state=2023)
params : {'max_depth': 3, 'n_estimators': 300}
score : 0.47926290272717564
====================
model : LGBMClassifier(random_state=2023)
params : {'max_depth': 2, 'n_estimators': 200}
score : 0.5288824546333013
```