## **작업형 유형2_데이터 전처리**

- seaborn의 'penguins' 데이터 사용
- 종속변수는 'species' 열
- 결측치 처리, 인코딩, 데이터 타입 확인 후 더미 변환, 파생변수 생성, 스케일링 작업 순으로 진행

In [1]:
## 1. Load the data
import seaborn as sns

# Load seaborn's bundled 'penguins' sample dataset into a DataFrame
# and preview the first rows.
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [2]:
## 2. Handle missing values
# Count missing values per column before imputation
df.isna().sum()

# Continuous variables: fill the gaps with each column's own median
miss_col = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
df[miss_col] = df[miss_col].fillna(df[miss_col].median())

# Categorical variable: fill the gaps with the most frequent value.
# The mode is computed from the data instead of being hard-coded, so the
# imputation stays correct if the class balance of the dataset changes.
print(df.sex.value_counts())
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

# Confirm that no missing values remain
df.isna().sum()

Male      168
Female    165
Name: sex, dtype: int64


species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [3]:
# Preview the data again: the row that was entirely empty should now be filled
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
## 3. Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the string labels of each categorical column with integer codes.
# The same encoder object is refit on every column in turn.
label_col = ['species', 'island', 'sex']
for col in label_col:
    df[col] = le.fit_transform(df[col])

In [5]:
## 4. Data types and dummy conversion
import pandas as pd
df.dtypes
# One-hot encode the categorical predictors (the target 'species' is excluded).
# They are cast to 'category' first because get_dummies, called without
# `columns`, only expands object/category columns and leaves numeric ones alone.
dummy_col = ['island', 'sex']
df = pd.get_dummies(df.astype({col: 'category' for col in dummy_col}))

In [16]:
## 5. Derived variable
# Split 'body_mass_g' into 5 quantile-based bins; labels=False yields the
# integer bin index instead of an interval label.
mass_bins = pd.qcut(df['body_mass_g'], q=5, labels=False)
df['body_mass_g_qcut'] = mass_bins

# Check how many rows fall into each bin
df['body_mass_g_qcut'].value_counts()

0    71
1    70
2    68
4    68
3    67
Name: body_mass_g_qcut, dtype: int64

In [19]:
## 6. Scaling
scale_col = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split in step 7, so test-set min/max values influence the scaling. Fit on the
# training split only when an unbiased evaluation matters.
scaler.fit(df[scale_col])
# Write the scaled values back into `df` (the original assigned to an
# undefined name `bdf`, which raised NameError and left `df` unscaled).
df[scale_col] = scaler.transform(df[scale_col])

In [33]:
## 7. train, test split
from sklearn.model_selection import train_test_split

# Features are every column except the target. Dropping the target by name
# (instead of slicing by position) keeps it out of X even if the column
# order changes.
X = df.drop(columns='species')
y = df['species']

# Stratify on the target so both splits keep the same class proportions;
# random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

X_train (275, 10)
X_test (69, 10)
y_train (275,)
y_test (69,)


## **작업형 유형2_모형학습 및 평가**

In [47]:
## 8. Model training
# Random forest classifier: fit on the training split, predict the test split
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier()
pred1 = model1.fit(X_train, y_train).predict(X_test)

# XGBoost classifier (gradient boosting, not AdaBoost): same fit/predict flow
import xgboost as xgb
model2 = xgb.XGBClassifier()
pred2 = model2.fit(X_train, y_train).predict(X_test)

In [51]:
## 9. Ensemble
from sklearn.ensemble import VotingClassifier

# Combine the two models from step 8.
# voting='hard' takes a majority vote over predicted labels;
# voting='soft' averages the predicted class probabilities.
estimators = [('rf', model1), ('xgb', model2)]
clf = VotingClassifier(estimators=estimators, voting='hard')
pred3 = clf.fit(X_train, y_train).predict(X_test)

In [60]:
## 10. Model evaluation
from sklearn.metrics import accuracy_score

# Report test-set accuracy for each model, in the order they were trained
results = (
    ('randomforest 정확도', pred1),
    ('xgboost 정확도', pred2),
    ('voting 정확도', pred3),
)
for label, pred in results:
    print(label, accuracy_score(y_test, pred))

# For per-class precision / recall / F1 instead of a single accuracy figure,
# print sklearn.metrics.classification_report(y_test, pred) for each prediction.

randomforest 정확도 1.0
xgboost 정확도 1.0
voting 정확도 1.0


In [64]:
## Save and submit the answer
# Pair each test row's original index with the random forest prediction
answer = pd.DataFrame({'id': y_test.index, 'pred': pred1})
answer.to_csv('answer.csv', index=False)

# Read the file back to check what was written
pd.read_csv('answer.csv').head()

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
