## 와인종류 분류

In [1]:
# Analysis workflow
# 1. Load libraries and inspect the data
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
wine = load_wine()
x = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.DataFrame(wine.target)

# Hold out 20% as the test set; stratify so each class keeps its share in both splits.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, stratify=y, random_state=2023)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(x_test)

# x_test intentionally keeps its original row labels. (A bare
# `x_test.reset_index()` returns a new frame and leaves x_test untouched,
# so such a call would have no effect.)
y_train.columns = ['target']

In [2]:
# 와인의 종류를 분류해보자
# 데이터의 결측치, 이상치에 대해 처리하고
# 분류모델을 사용하여 정확도, F1 score, AUC 값을 산출하시오.
# 제출은 result변수에 담아 양식에 맞게 제출하시오

In [3]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

In [4]:
# 2. Exploratory data analysis (EDA)
# Print the (rows, columns) shape of each split.
for frame in (x_train, x_test, y_train):
    print(frame.shape)

(142, 13)
(36, 13)
(142, 1)


In [5]:
# First look at the data: top three rows of each split.
for frame in (x_train, x_test, y_train):
    print(frame.head(3))

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
52     13.82        1.75  2.42               14.0      111.0           3.88   
146    13.88        5.04  2.23               20.0       80.0           0.98   
44     13.05        1.77  2.10               17.0      107.0           3.00   

     flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
52         3.74                  0.32             1.87             7.05  1.01   
146        0.34                  0.40             0.68             4.90  0.58   
44         3.00                  0.28             2.03             5.04  0.88   

     od280/od315_of_diluted_wines  proline  
52                           3.26   1190.0  
146                          1.33    415.0  
44                           3.35    885.0  
     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
168    13.58        2.58  2.69               24.5      105.0           1.55   
144    12.25        

In [6]:
# Check that column names match their dtypes and whether any values are missing.
# info() prints its report and returns None, so each call is followed by "None".
for frame in (x_train, x_test, y_train):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
Index: 142 entries, 52 to 115
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       142 non-null    float64
 1   malic_acid                    142 non-null    float64
 2   ash                           142 non-null    float64
 3   alcalinity_of_ash             142 non-null    float64
 4   magnesium                     142 non-null    float64
 5   total_phenols                 142 non-null    float64
 6   flavanoids                    142 non-null    float64
 7   nonflavanoid_phenols          142 non-null    float64
 8   proanthocyanins               142 non-null    float64
 9   color_intensity               142 non-null    float64
 10  hue                           142 non-null    float64
 11  od280/od315_of_diluted_wines  142 non-null    float64
 12  proline                       142 non-null    float64
dtypes: float6

In [7]:
# Compare the summary statistics of x_train and x_test carefully.
for frame in (x_train, x_test, y_train):
    print(frame.describe().T)

                              count        mean         std     min       25%  \
alcohol                       142.0   13.025915    0.812423   11.03   12.3700   
malic_acid                    142.0    2.354296    1.142722    0.74    1.6100   
ash                           142.0    2.340211    0.279910    1.36    2.1900   
alcalinity_of_ash             142.0   19.354225    3.476825   10.60   16.8000   
magnesium                     142.0   98.732394   13.581859   70.00   88.0000   
total_phenols                 142.0    2.303592    0.633955    0.98    1.7575   
flavanoids                    142.0    2.043592    1.033597    0.34    1.2275   
nonflavanoid_phenols          142.0    0.361479    0.124627    0.14    0.2700   
proanthocyanins               142.0    1.575070    0.576798    0.41    1.2425   
color_intensity               142.0    5.005070    2.150186    1.28    3.3000   
hue                           142.0    0.950394    0.220736    0.54    0.7825   
od280/od315_of_diluted_wines

In [8]:
# Inspect the target data in detail as well.
print(y_train.head())

     target
52        0
146       2
44        0
67        1
43        0


In [9]:
print(y_train.value_counts())

target
1         57
0         47
2         38
Name: count, dtype: int64


In [10]:
# 3. 데이터 전처리 및 분리
# 1)결측치, 2)이상치, 3)변수 처리하기

In [11]:
# Missing-value check: count nulls per column in each split.
for frame in (x_train, x_test, y_train):
    print(frame.isnull().sum())

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64
alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64
target    0
dtype: int64


In [12]:
# 결측치 제거
# df = df.dropna()
# 참고사항 # 행 기준으로 삭제됨

# 주의사항
# x_train의 행을 제거해야 하는 경우, 그에 해당하는 y_train 행도 제거해야 합니다.
# 해결방법 : train = pd.concat([x_train, y_train], axis=1)
# 위와 같이 데이터를 결합한 후에 행을 제거하고 다시 데이터 분리를 수행하면 됨
# (만약 원데이터가 x_train/y_train이 결합된 형태로 주어진다면 전처리를 모두 수행한 후에 분리해도 됨)

In [13]:
# 결측치 대체(평균값, 중앙값, 최빈값)
# 주의사항 : train 데이터의 중앙값/평균값/최빈값 등으로 test 데이터의 결측치도 변경해줘야 함
# 연속형 변수 : 중앙값, 평균값
# 범주형 변수 : 최빈값
# df['변수명'] = df['변수명'].fillna(대체할 값)

In [14]:
# 이상치 대체(예시)
# df['변수명'] = np.where(df['변수명']>=5, 대체할 값, df['변수명'])

In [15]:
# 변수처리
# 불필요한 변수 제거
# df = df.drop(columns=['변수1', '변수2'])
# df = df.drop(['변수1', '변수2'], axis=1)
# 필요시 변수 추가(파생변수 생성)
# df['파생변수명'] = df['A']*df['B']
# 원핫인코딩(가변수 처리)
# x_train = pd.get_dummies(x_train)
# x_test = pd.get_dummies(x_test)

In [16]:
# Train/validation split: 80% for training, 20% for validation, stratified on the target.
# NOTE: this cell rebinds x_train / y_train (y_train becomes a Series), so it
# must not be re-run without first re-running the cells above.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['target'],
    test_size=0.2,
    stratify=y_train['target'],
    random_state=2023,
)
for split in (x_train, x_val, y_train, y_val):
    print(split.shape)

(113, 13)
(29, 13)
(113,)
(29,)


In [17]:
# 4. Modeling and performance evaluation
# Random-forest classifier (for regression tasks use RandomForestRegressor).
# random_state is fixed so the fitted forest, and therefore every metric and
# prediction derived from it, is reproducible across runs.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=2023)
model.fit(x_train, y_train)

In [18]:
# Use the fitted model to predict the validation split (x_val), not the test set.
y_pred = model.predict(x_val)

In [19]:
# Evaluate on the validation split: accuracy, macro F1 and AUC.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
acc = accuracy_score(y_val, y_pred) # (y_true, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')
# AUC is not limited to binary problems: for multiclass targets pass the
# per-class probabilities (not hard labels) and choose a multi_class strategy.
# 'ovr' computes one-vs-rest AUC per class and macro-averages them.
auc = roc_auc_score(y_val, model.predict_proba(x_val), multi_class='ovr')

In [20]:
# 정확도 (Accuracy)
print(acc)

1.0


In [21]:
# macro f1 score
print(f1)

1.0


In [24]:
# 5. Submit predictions
# (Caution) the submission must hold the predictions obtained by feeding the test set to the model.
# 1. Hard class labels (predict)
y_result = model.predict(x_test)
print(y_result[:5])
# 2. Per-class membership probabilities (predict_proba)
y_result_prob = model.predict_proba(x_test)
print(y_result_prob[:5])

# One probability column per class, with the predicted label placed first.
result_prob = pd.DataFrame(y_result_prob, columns=['prob_0', 'prob_1', 'prob_2'])
result_prob.insert(0, 'result', y_result)
print(result_prob.head(5))

[2 2 2 0 1]
[[0.01 0.   0.99]
 [0.09 0.11 0.8 ]
 [0.   0.07 0.93]
 [1.   0.   0.  ]
 [0.04 0.87 0.09]]
   result  prob_0  prob_1  prob_2
0       2    0.01    0.00    0.99
1       2    0.09    0.11    0.80
2       2    0.00    0.07    0.93
3       0    1.00    0.00    0.00
4       1    0.04    0.87    0.09


In [25]:
# Keep the submission frame itself in `result`: DataFrame.to_csv returns None
# when given a path, so assigning its return value would leave `result` empty.
result = pd.DataFrame({'result':y_result})
result.to_csv('wine_classify_excercise.csv', index=False)

In [26]:
# Read the written file back to check the submission's contents.
df = pd.read_csv('wine_classify_excercise.csv')
print(df.head(5))

   result
0       2
1       2
2       2
3       0
4       1


## 당뇨병 진행 정도 회귀

In [27]:
# 1. Load libraries and inspect the data
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

# Expose each row's original label as an explicit `cust_id` column so the
# frames look like an exam-style dataset that ships with an id column.
x_train = x_train.reset_index().rename(columns={'index': 'cust_id'})
x_test = x_test.reset_index().rename(columns={'index': 'cust_id'})
y_train = y_train.reset_index()
y_train.columns = ['cust_id', 'target']

In [28]:
# 참고사항
# y_test는 실기 문제상에 주어지지 않음
# train = pd.read_csv("data/customer_train.csv")
# test = pd.read_csv("data/customer_test.csv")
# x_train과 y_train, x_test를 별도로 할당해줘야 함

In [29]:
# 당뇨병 환자의 질병 진행정도를 예측해보자
# 데이터의 결측치, 이상치, 변수들에 대해 전처리하고
# 회귀모델을 이용하여 Rsq, MSE 값을 산출하시오.
# 제출은 cust_id, target 변수를 가진 dataframe 형태로 제출하시오.

In [30]:
# 데이터 설명
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [31]:
# 2. Exploratory data analysis (EDA)
# Print the (rows, columns) shape of each split.
for frame in (x_train, x_test, y_train):
    print(frame.shape)

(353, 11)
(89, 11)
(353, 2)


In [32]:
# First look at the data: top three rows of each split.
for frame in (x_train, x_test, y_train):
    print(frame.head(3))

   cust_id       age       sex       bmi        bp        s1        s2  \
0      424  0.001751  0.050680  0.011039 -0.019442 -0.016704 -0.003819   
1      121  0.063504 -0.044642  0.017506  0.021872  0.008063  0.021546   
2       15 -0.052738  0.050680 -0.018062  0.080401  0.089244  0.107662   

         s3        s4        s5        s6  
0 -0.047082  0.034309  0.024055  0.023775  
1 -0.036038  0.034309  0.019907  0.011349  
2 -0.039719  0.108111  0.036060 -0.042499  
   cust_id       age      sex       bmi        bp        s1        s2  \
0      334 -0.060003  0.05068 -0.047163 -0.022885 -0.071743 -0.057681   
1      313  0.059871  0.05068  0.053074  0.052858  0.032830  0.019667   
2      133 -0.041840  0.05068 -0.053630 -0.040099 -0.084126 -0.071772   

         s3        s4        s5        s6  
0 -0.006584 -0.039493 -0.062917 -0.054925  
1 -0.010266  0.034309  0.055203 -0.001078  
2 -0.002903 -0.039493 -0.072133 -0.030072  
   cust_id  target
0      424   111.0
1      121   173.0
2

In [33]:
# Column dtypes and non-null counts per split.
# info() prints its report and returns None, so each call is followed by "None".
for frame in (x_train, x_test, y_train):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  353 non-null    int64  
 1   age      353 non-null    float64
 2   sex      353 non-null    float64
 3   bmi      353 non-null    float64
 4   bp       353 non-null    float64
 5   s1       353 non-null    float64
 6   s2       353 non-null    float64
 7   s3       353 non-null    float64
 8   s4       353 non-null    float64
 9   s5       353 non-null    float64
 10  s6       353 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 30.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  89 non-null     int64  
 1   age      89 non-null     float64
 2   sex      89 non-null     float64
 3   bmi      89 non-null     float64
 4   bp      

In [34]:
# Summary statistics per split (transposed: one row per column).
for frame in (x_train, x_test, y_train):
    print(frame.describe().T)

         count        mean         std       min         25%         50%  \
cust_id  353.0  219.444759  130.610642  0.000000  105.000000  220.000000   
age      353.0   -0.000729    0.047086 -0.107226   -0.038207    0.009016   
sex      353.0   -0.001166    0.047544 -0.044642   -0.044642   -0.044642   
bmi      353.0   -0.001373    0.048839 -0.089197   -0.036385   -0.008362   
bp       353.0   -0.000485    0.048083 -0.112399   -0.036656   -0.005670   
s1       353.0   -0.001284    0.048113 -0.126781   -0.035968   -0.004321   
s2       353.0   -0.001635    0.048175 -0.112795   -0.032629   -0.005072   
s3       353.0    0.002004    0.048396 -0.102307   -0.032356   -0.002903   
s4       353.0   -0.002704    0.047260 -0.076395   -0.039493   -0.002592   
s5       353.0   -0.001932    0.047925 -0.126097   -0.034522   -0.006081   
s6       353.0   -0.000550    0.047128 -0.137767   -0.030072   -0.001078   

                75%         max  
cust_id  337.000000  441.000000  
age        0.034443

In [37]:
# Inspect the target frame: first rows, then its summary statistics.
print(y_train.head(3))
print(y_train.describe().T)

   cust_id  target
0      424   111.0
1      121   173.0
2       15   171.0
         count        mean         std   min    25%    50%    75%    max
cust_id  353.0  219.444759  130.610642   0.0  105.0  220.0  337.0  441.0
target   353.0  151.838527   76.603981  25.0   88.0  139.0  210.0  346.0


In [38]:
# 3. Preprocessing and splitting
# Handle 1) missing values, 2) outliers, 3) features.
# Start by counting nulls per column in each split.
for frame in (x_train, x_test, y_train):
    print(frame.isnull().sum())

cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
cust_id    0
target     0
dtype: int64


In [39]:
# 결측치 제거
# 주의사항 : x_train의 행을 제거하는 경우, 그에 해당하는 y_train 행도 제거해야 한다
# train = pd.concat([x_train, y_train], axis=1)
# 데이터를 결합한 후에 행을 제거하고 다시 데이터 분리를 수행하면 됨
# (만약 원데이터가 x_train/y_train이 결합된 형태로 주어진다면 전처리를 모두 수행한 후에 분리해도 됨)

In [40]:
# 변수처리
# 불필요한 변수 제거
# 원핫인코딩(가변수처리)

In [41]:
# Feature handling
# Keep the test-set ids for the submission, then remove the id column from
# both feature frames. (Re-running this cell fails: cust_id is already gone.)
cust_id = x_test['cust_id'].copy()

x_train, x_test = (frame.drop(columns=['cust_id']) for frame in (x_train, x_test))

In [42]:
# Train/validation split (80% / 20%).
# NOTE: this cell rebinds x_train / y_train (y_train becomes a Series), so it
# must not be re-run without first re-running the cells above.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['target'],
    test_size=0.2,
    random_state=2024,
)
for split in (x_train, x_val, y_train, y_val):
    print(split.shape)

(282, 10)
(71, 10)
(282,)
(71,)


In [44]:
# 4. Modeling and performance evaluation
# Fit a random-forest regressor on the training split; the fixed random_state
# makes the fitted forest reproducible.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=2024)
model.fit(x_train, y_train)

In [45]:
# Predict the validation split so the model can be scored against y_val.
y_pred = model.predict(x_val)

In [46]:
# Validation metrics: mean squared error and R^2 (coefficient of determination).
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

In [47]:
print(mse)

2881.9930478873243


In [48]:
print(r2)

0.5797859925453159


In [49]:
# RMSE: the square root of the MSE, expressed in the same units as the target.
rmse = mse**0.5
print(rmse)

53.68419737583235


In [52]:
# 5. Submit predictions
# Predict the test set and pair each prediction with its saved cust_id.
y_result = model.predict(x_test)
result = cust_id.to_frame().assign(target=y_result)
print(result.head(5))

   cust_id  target
0      334   75.49
1      313  212.64
2      133   81.76
3      152  184.70
4      250  234.03


In [53]:
# Write the (cust_id, target) submission frame built in the previous cell.
result.to_csv('diabetes_regression_excercise.csv', index=False)

In [54]:
# Read the written file back to check the submission's contents.
df = pd.read_csv('diabetes_regression_excercise.csv')
print(df.head())

   cust_id  target
0      334   75.49
1      313  212.64
2      133   81.76
3      152  184.70
4      250  234.03


## 팁 예측하기 회귀

In [19]:
# 1. Load libraries and inspect the data
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

df = sns.load_dataset('tips')
# `tip` is the regression target; every other column is a feature.
x = df.drop(columns=['tip'])
y = df['tip']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2024)

# Expose each row's original label as an explicit `cust_id` column.
x_train = x_train.reset_index().rename(columns={'index': 'cust_id'})
x_test = x_test.reset_index().rename(columns={'index': 'cust_id'})
y_train = y_train.reset_index()
y_train.columns = ['cust_id', 'target']

In [2]:
# 레스토랑의 tip 예측 문제
# 데이터의 결측치,이상치,변수에 대해 처리하고
# 회귀모델을 사용하여 Rsq, MSE 값을 산출하시오.

In [3]:
# - total_bill(총	청구액):	손님의	식사	총	청구액
# - tip(팁):	팁의	양
# - sex(성별):	손님의	성별
# - smoker(흡연자):	손님의	흡연	여부("Yes"	또는	"No")
# - day(요일):	식사가	이루어진	요일
# - time(시간):	점심	또는	저녁	중	언제	식사가	이루어졌는지
# - size(인원	수):	식사에	참석한	인원	수

In [20]:
# 2. Exploratory data analysis (EDA)
# Print the (rows, columns) shape of each split.
for frame in (x_train, x_test, y_train):
    print(frame.shape)

(195, 7)
(49, 7)
(195, 2)


In [21]:
# Top three rows of each split.
for frame in (x_train, x_test, y_train):
    print(frame.head(3))

   cust_id  total_bill     sex smoker   day    time  size
0      209       12.76  Female    Yes   Sat  Dinner     2
1       88       24.71    Male     No  Thur   Lunch     2
2        2       21.01    Male     No   Sun  Dinner     3
   cust_id  total_bill     sex smoker  day    time  size
0      239       29.03    Male     No  Sat  Dinner     3
1      238       35.83  Female     No  Sat  Dinner     3
2      170       50.81    Male    Yes  Sat  Dinner     3
   cust_id  target
0      209    2.23
1       88    5.85
2        2    3.50


In [22]:
# Column dtypes and non-null counts per split.
# info() prints its report and returns None, so each call is followed by "None".
for frame in (x_train, x_test, y_train):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   cust_id     195 non-null    int64   
 1   total_bill  195 non-null    float64 
 2   sex         195 non-null    category
 3   smoker      195 non-null    category
 4   day         195 non-null    category
 5   time        195 non-null    category
 6   size        195 non-null    int64   
dtypes: category(4), float64(1), int64(2)
memory usage: 6.0 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   cust_id     49 non-null     int64   
 1   total_bill  49 non-null     float64 
 2   sex         49 non-null     category
 3   smoker      49 non-null     category
 4   day         49 non-null     category
 5   time        49 non-null     category
 6   size   

In [23]:
# Summary statistics of the numeric columns in each split.
for frame in (x_train, x_test, y_train):
    print(frame.describe().T)

            count        mean        std   min    25%     50%      75%     max
cust_id     195.0  122.394872  70.734606  2.00  61.50  123.00  183.500  243.00
total_bill  195.0   19.754564   8.699479  3.07  13.38   17.59   24.535   48.33
size        195.0    2.589744   0.977002  1.00   2.00    2.00    3.000    6.00
            count        mean        std   min    25%     50%     75%     max
cust_id      49.0  117.938776  70.579567  0.00  56.00  115.00  172.00  239.00
total_bill   49.0   19.910816   9.763248  7.25  13.28   18.24   22.23   50.81
size         49.0    2.489796   0.844651  1.00   2.00    2.00    3.00    6.00
         count        mean        std  min   25%     50%      75%    max
cust_id  195.0  122.394872  70.734606  2.0  61.5  123.00  183.500  243.0
target   195.0    2.946615   1.330591  1.0   2.0    2.71    3.525    9.0


In [24]:
# Also summarise the non-numeric columns (here: category dtype).
for frame in (x_train, x_test):
    print(frame.describe(include='category').T)

       count unique     top freq
sex      195      2    Male  127
smoker   195      2      No  118
day      195      4     Sat   71
time     195      2  Dinner  140
       count unique     top freq
sex       49      2    Male   30
smoker    49      2      No   33
day       49      4     Sun   19
time      49      2  Dinner   36


In [25]:
# Inspect the target frame: first rows, then its summary statistics.
print(y_train.head())
print(y_train.describe().T)

   cust_id  target
0      209    2.23
1       88    5.85
2        2    3.50
3      142    5.00
4      125    4.20
         count        mean        std  min   25%     50%      75%    max
cust_id  195.0  122.394872  70.734606  2.0  61.5  123.00  183.500  243.0
target   195.0    2.946615   1.330591  1.0   2.0    2.71    3.525    9.0


In [26]:
# 3. Preprocessing and splitting
# Handle 1) missing values, 2) outliers, 3) features.
# Start by counting nulls per column in each split.
for frame in (x_train, x_test, y_train):
    print(frame.isnull().sum())

cust_id       0
total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64
cust_id       0
total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64
cust_id    0
target     0
dtype: int64


In [12]:
# 결측치 제거
# df = df.dropna()
# 주의사항
# x_train의 행을 제거해야 하는 경우, 그에 해당하는 y_train 행도 제거해야 함
# 해결방법 : train = pd.concat([x_train, y_train], axis=1)
# 위와 같이 데이터를 결합한 후에 행을 제거하고 다시 데이터 분리를 수행하면 됨.

In [13]:
# 결측치 대체(평균값, 중앙값, 최빈값)
# 주의사항 : train 데이터의 중앙값/평균값/최빈값 등으로 test 데이터의 결측치도 변경해줘야 함

In [14]:
# 이상치 대체
# df['변수명'] = np.where(df['변수명']>=5, 대체할 값, df['변수명'])

In [27]:
# Feature handling
# Remove unneeded columns: keep the test-set ids for the submission, then drop
# the id column from both feature frames. (Re-running this cell fails:
# cust_id is already gone.)
cust_id = x_test['cust_id'].copy()
x_train, x_test = (frame.drop(columns=['cust_id']) for frame in (x_train, x_test))

In [28]:
# 변수처리(원핫인코딩)
print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  195 non-null    float64 
 1   sex         195 non-null    category
 2   smoker      195 non-null    category
 3   day         195 non-null    category
 4   time        195 non-null    category
 5   size        195 non-null    int64   
dtypes: category(4), float64(1), int64(1)
memory usage: 4.5 KB
None


In [29]:
# One-hot encode the categorical columns.
x_train = pd.get_dummies(x_train)
# Encoding the two splits independently can produce different column sets or
# orders (e.g. a level that never occurs in one split when the column is plain
# object dtype). Align the test columns to the training columns so the model
# always sees the layout it was fitted on; dummy columns missing from the test
# split are filled with 0.
x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)

def dtypechange(dummies):
    """Return a copy of ``dummies`` with every boolean column cast to ``np.uint8``.

    Newer pandas versions emit boolean columns from ``get_dummies``; this turns
    them into 0/1 integers. Columns of any other dtype are left as they are.

    Parameters
    ----------
    dummies : pandas.DataFrame
        Frame whose boolean columns should be converted. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order.
    """
    bool_columns = dummies.select_dtypes(include='bool').columns
    # astype with a per-column mapping converts only the listed columns and
    # returns a new frame, so the caller's object is never mutated.
    return dummies.astype({col: np.uint8 for col in bool_columns})

# Cast the boolean dummy columns in both splits, then confirm the resulting dtypes.
x_train, x_test = dtypechange(x_train), dtypechange(x_test)

for frame in (x_train, x_test):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   195 non-null    float64
 1   size         195 non-null    int64  
 2   sex_Male     195 non-null    uint8  
 3   sex_Female   195 non-null    uint8  
 4   smoker_Yes   195 non-null    uint8  
 5   smoker_No    195 non-null    uint8  
 6   day_Thur     195 non-null    uint8  
 7   day_Fri      195 non-null    uint8  
 8   day_Sat      195 non-null    uint8  
 9   day_Sun      195 non-null    uint8  
 10  time_Lunch   195 non-null    uint8  
 11  time_Dinner  195 non-null    uint8  
dtypes: float64(1), int64(1), uint8(10)
memory usage: 5.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   49 non-null     float64
 1   size   

In [30]:
# Train/validation split (80% / 20%).
# NOTE: this cell rebinds x_train / y_train (y_train becomes a Series), so it
# must not be re-run without first re-running the cells above.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['target'],
    test_size=0.2,
    random_state=2024,
)
for split in (x_train, x_val, y_train, y_val):
    print(split.shape)

(156, 12)
(39, 12)
(156,)
(39,)


In [31]:
# 4. Modeling and performance evaluation
# Fit a random-forest regressor on the training split.
# NOTE(review): random_state is 24 here while the splits above use 2024 —
# confirm whether the different seed is intentional.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=24)
model.fit(x_train, y_train)

In [32]:
# Predict the validation split so the model can be scored against y_val.
y_pred = model.predict(x_val)

In [33]:
# Validation metrics: R^2 (coefficient of determination) and mean squared error.
from sklearn.metrics import mean_squared_error, r2_score
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

In [35]:
print(mse)
print(r2)

1.1216057448717942
0.5178401015220538


In [36]:
# RMSE: the square root of the MSE, expressed in the same units as the target.
rmse = mse**0.5
print(rmse)

1.059058895846588


In [37]:
# 5. Submit predictions
# Predict the test set and pair each prediction with its saved cust_id.
y_result = model.predict(x_test)

result = cust_id.to_frame().assign(target=y_result)
print(result.head(5))

   cust_id  target
0      239  2.8514
1      238  4.0042
2      170  6.0369
3      156  7.5887
4      134  3.5092


In [38]:
# Write the predictions to disk.
# NOTE(review): only the predictions are written, under the column name
# 'result'; the `result` frame built in the previous cell also carries cust_id
# and names the column 'target'. Confirm which layout the submission requires.
pd.DataFrame({'result':y_result}).to_csv('tips_regression_excercise.csv', index=False)

In [39]:
# Read the written file back to check the submission's contents.
df = pd.read_csv('tips_regression_excercise.csv')
print(df.head())

   result
0  2.8514
1  4.0042
2  6.0369
3  7.5887
4  3.5092


## iris 종 분류

In [55]:
# 1. Load libraries and inspect the data
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris()
x = pd.DataFrame(iris.data, columns=['sepal_length','sepal_width','petal_length','petal_width'])
y = iris.target
# Collapse the three species into a binary target: class 0 stays 0, classes 1 and 2 become 1.
y = np.where(y>0, 1, 0)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=2023)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
# y is a NumPy array here, so y_train gets a fresh 0..n-1 index rather than
# x_train's original row labels; the two frames line up by position only.
y_train = pd.DataFrame(y_train)
y_train.columns = ['species']
# Deliberately corrupt the data so the preprocessing steps below have something
# to fix: a missing sepal_length and an implausible sepal_width of 150.
# With this split, row label 0 belongs to x_train, so the x_test assignment
# appends a new row that is NaN in every column (the recorded output shows
# x_test with 31 rows and one null per column).
x_test.loc[0, 'sepal_length'] = None
x_train.loc[0, 'sepal_length'] = None
x_train.loc[0, 'sepal_width'] = 150

In [43]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [44]:
# 붓꽃의 종(Species)을 분류해보자
# 데이터의 결측치, 이상치에 대해 처리하고
# 분류모델을 사용하여 정확도, F1 score, AUC 값을 산출하시오.
# 제출은 result 변수에 담아 양식에 맞게 제출하시오

In [56]:
# 2. Exploratory data analysis (EDA)
# Print the (rows, columns) shape of each split.
for frame in (x_train, x_test, y_train):
    print(frame.shape)

(120, 4)
(31, 4)
(120, 1)


In [57]:
# Top three rows of each split.
for frame in (x_train, x_test, y_train):
    print(frame.head(3))

    sepal_length  sepal_width  petal_length  petal_width
2            4.7          3.2           1.3          0.2
49           5.0          3.3           1.4          0.2
66           5.6          3.0           4.5          1.5
     sepal_length  sepal_width  petal_length  petal_width
93            5.0          2.3           3.3          1.0
69            5.6          2.5           3.9          1.1
137           6.4          3.1           5.5          1.8
   species
0        0
1        0
2        1


In [58]:
# Column dtypes and non-null counts per split.
# info() prints its report and returns None, so each call is followed by "None".
for frame in (x_train, x_test, y_train):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 2 to 44
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  119 non-null    float64
 1   sepal_width   120 non-null    float64
 2   petal_length  120 non-null    float64
 3   petal_width   120 non-null    float64
dtypes: float64(4)
memory usage: 8.8 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 31 entries, 93 to 0
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  30 non-null     float64
 1   sepal_width   30 non-null     float64
 2   petal_length  30 non-null     float64
 3   petal_width   30 non-null     float64
dtypes: float64(4)
memory usage: 2.3 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   species  120 non-n

In [59]:
# Summary statistics per split; extreme min/max values hint at outliers.
for frame in (x_train, x_test, y_train):
    print(frame.describe().T)

              count      mean        std  min    25%   50%    75%    max
sepal_length  119.0  5.916807   0.845759  4.3  5.150  6.00  6.500    7.9
sepal_width   120.0  4.292500  13.419277  2.2  2.800  3.00  3.325  150.0
petal_length  120.0  3.816667   1.798848  1.1  1.575  4.40  5.225    6.9
petal_width   120.0  1.226667   0.780512  0.1  0.300  1.35  1.800    2.5
              count      mean       std  min    25%   50%    75%  max
sepal_length   30.0  5.576667  0.705488  4.6  5.000  5.50  5.900  7.6
sepal_width    30.0  3.000000  0.522593  2.0  2.625  3.00  3.300  4.2
petal_length   30.0  3.523333  1.631518  1.0  1.600  4.05  4.925  6.6
petal_width    30.0  1.090000  0.685490  0.1  0.350  1.15  1.575  2.3
         count      mean       std  min  25%  50%  75%  max
species  120.0  0.666667  0.473381  0.0  0.0  1.0  1.0  1.0


In [60]:
print(y_train.head())

   species
0        0
1        0
2        1
3        1
4        1


In [61]:
print(y_train.value_counts())

species
1          80
0          40
Name: count, dtype: int64


In [62]:
# 3. Preprocessing and splitting
# Handle 1) missing values, 2) outliers, 3) features.
# Start by counting nulls per column in each split.
for frame in (x_train, x_test, y_train):
    print(frame.isnull().sum())

sepal_length    1
sepal_width     0
petal_length    0
petal_width     0
dtype: int64
sepal_length    1
sepal_width     1
petal_length    1
petal_width     1
dtype: int64
species    0
dtype: int64


In [63]:
# Impute missing values with the median.
# Caution: the test set must be filled with the TRAINING median as well, so no
# statistic is ever computed from test data.
median = x_train['sepal_length'].median()
x_train = x_train.fillna({'sepal_length': median})
x_test = x_test.fillna({'sepal_length': median})

In [64]:
print(x_test.isnull().sum())

sepal_length    0
sepal_width     1
petal_length    1
petal_width     1
dtype: int64


In [65]:
# Fill the remaining missing test values with the training medians.
# (x_train still contains the sepal_width outlier at this point; the median is
# robust to that single extreme value.)
median_sepal_width = x_train['sepal_width'].median()
median_petal_length = x_train['petal_length'].median()
median_petal_width = x_train['petal_width'].median()
x_test = x_test.fillna({
    'sepal_width': median_sepal_width,
    'petal_length': median_petal_length,
    'petal_width': median_petal_width,
})

In [66]:
# Outlier check: count the training rows whose sepal_width is 10 or more.
cond1 = x_train['sepal_width'].ge(10)
print(cond1.sum())

1


In [68]:
# Replace outliers: cap implausible sepal widths (>= 10) at the largest
# plausible value observed in the training data.
# A single mask and its negation are used so that no value can fall on both
# sides of the threshold, and the result is stored under a descriptive name
# instead of `max`, which would shadow the builtin for the rest of the session.
is_outlier = x_train['sepal_width'] >= 10
max_valid_width = x_train.loc[~is_outlier, 'sepal_width'].max()
print(max_valid_width)
x_train['sepal_width'] = np.where(is_outlier, max_valid_width, x_train['sepal_width'])
print(x_train.describe().T)

4.4
              count      mean       std  min    25%   50%    75%  max
sepal_length  120.0  5.917500  0.842232  4.3  5.175  6.00  6.500  7.9
sepal_width   120.0  3.079167  0.428383  2.2  2.800  3.00  3.325  4.4
petal_length  120.0  3.816667  1.798848  1.1  1.575  4.40  5.225  6.9
petal_width   120.0  1.226667  0.780512  0.1  0.300  1.35  1.800  2.5


In [69]:
# 변수처리
# 불필요한 변수 제거
# 필요시 변수 추가(파생변수 생성)
# 원핫인코딩(가변수 처리)

In [70]:
# Train/validation split (80% / 20%), stratified on the binary target.
# NOTE: this cell rebinds x_train / y_train (y_train becomes a Series), so it
# must not be re-run without first re-running the cells above.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['species'],
    test_size=0.2,
    stratify=y_train['species'],
    random_state=2023,
)
for split in (x_train, x_val, y_train, y_val):
    print(split.shape)

(96, 4)
(24, 4)
(96,)
(24,)


In [71]:
# 4. Modeling and evaluation
# Fit a random forest with default hyperparameters; random_state is fixed so
# repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=2024)
model.fit(x_train, y_train)

In [72]:
y_pred = model.predict(x_val)

In [73]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# Accuracy and F1 compare the predicted labels with the validation labels.
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
# AUC measures how well the scores rank positives above negatives, so it is
# computed from the predicted probability of the positive class (column 1 of
# predict_proba) rather than from hard 0/1 labels.
auc = roc_auc_score(y_val, model.predict_proba(x_val)[:, 1])

In [74]:
print(acc)
print(f1)
print(auc)

1.0
1.0
1.0


In [75]:
# For reference: confusion matrix of the validation labels against the predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, y_pred)
print(cm)

[[ 8  0]
 [ 0 16]]


In [76]:
#      예측
#      0  1
#실제0 TN FP
#    1 FN TP

In [77]:
# 5. Submit predictions
# Predict labels for the test features and preview the first five.
y_result = model.predict(x_test)
print(y_result[:5])

[1 1 1 0 1]


In [78]:
y_result_prob = model.predict_proba(x_test)
print(y_result_prob[:5])

[[0.03 0.97]
 [0.   1.  ]
 [0.   1.  ]
 [1.   0.  ]
 [0.03 0.97]]


In [81]:
result = pd.DataFrame({'result':y_result,
                       'prob_0':y_result_prob[:,0],
                       'prob_1':y_result_prob[:,1]})
print(result[:5])

   result  prob_0  prob_1
0       1    0.03    0.97
1       1    0.00    1.00
2       1    0.00    1.00
3       0    1.00    0.00
4       1    0.03    0.97


In [82]:
pd.DataFrame({'result':y_result}).to_csv('iris_classify_excercise.csv', index=False)

In [83]:
df = pd.read_csv('iris_classify_excercise.csv')
print(df.head())

   result
0       1
1       1
2       1
3       0
4       1


## 타이타닉 생존자 분류

In [1]:
# 1. 라이브러리 및 데이터 확인
import pandas as pd
import numpy as np
import seaborn as sns
df = sns.load_dataset('titanic')
x = df.drop(['survived'], axis=1)
y = df['survived']
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=2024)
x_test = pd.DataFrame(x_test)
x_train = pd.DataFrame(x_train)
# The labels are wrapped as one-column frames and the column is renamed to 'target'.
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
# x_test keeps the row labels it received from the split. A bare
# `x_test.reset_index()` returns a new frame and changes nothing, so that dead
# call is not made here.
y_train.columns = ['target']
y_test.columns = ['target']

In [2]:
# 타이타닉 생존자 예측 문제
# 데이터의 결측치, 중복 변수값에 대해 처리하고
# 분류모델을 사용하여 Accuracy, F1 score, AUC 값을 산출하시오.

In [3]:
# ### 데이터 설명
# - survival	:	0	=	No,	1	=	Yes
# - pclass	:	객실	등급(1,2,3)
# - sex	:	성별
# - age	:	나이
# - sibsp	:	타이타닉호에	탑승한	형제/배우자의	수
# - parch	:	타이타닉호에	탑승한	부모/자녀의	수
# - fare	:	요금
# - embarked	:	탑승지	이름(C,	Q,	S)	Cherbourg	/	Queenstown	/	Southampton
# - (중복)class	:	객실	등급(First,	Second,	Third)
# - who	:	man,	women,	child
# - adult_male	:	성인남자인지	여부(True=성인남자,	False	그외)
# - deck	:	선실번호	첫	알파벳(A,B,C,D,E,F,G)
# - (중복)	embark_town	:	탑승지	이름(Cherbourg,	Queenstown,	Southampton)
# - (중복)	alive	:	생존여부(no:사망,	yes:생존)
# - alone	:	혼자	탑승했는지	여부(True=혼자,	False=가족과	함께)

In [4]:
# 2. 데이터 탐색(EDA)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(712, 14)
(179, 14)
(712, 1)


In [5]:
print(x_train.head(3))
print(x_test.head(3))
print(y_train.head(3))

     pclass     sex   age  sibsp  parch     fare embarked  class    who  \
313       3    male  28.0      0      0   7.8958        S  Third    man   
736       3  female  48.0      1      3  34.3750        S  Third  woman   
165       3    male   9.0      0      2  20.5250        S  Third  child   

     adult_male deck  embark_town alive  alone  
313        True  NaN  Southampton    no   True  
736       False  NaN  Southampton    no  False  
165       False  NaN  Southampton   yes  False  
     pclass     sex   age  sibsp  parch     fare embarked  class    who  \
517       3    male   NaN      0      0  24.1500        Q  Third    man   
858       3  female  24.0      0      3  19.2583        C  Third  woman   
10        3  female   4.0      1      1  16.7000        S  Third  child   

     adult_male deck  embark_town alive  alone  
517        True  NaN   Queenstown    no   True  
858       False  NaN    Cherbourg   yes  False  
10        False    G  Southampton   yes  False  
     t

In [6]:
print(x_train.info())
print(x_test.info())
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 313 to 205
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   pclass       712 non-null    int64   
 1   sex          712 non-null    object  
 2   age          566 non-null    float64 
 3   sibsp        712 non-null    int64   
 4   parch        712 non-null    int64   
 5   fare         712 non-null    float64 
 6   embarked     710 non-null    object  
 7   class        712 non-null    category
 8   who          712 non-null    object  
 9   adult_male   712 non-null    bool    
 10  deck         165 non-null    category
 11  embark_town  710 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(3), object(5)
memory usage: 64.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 517 to 41
Data columns (total 14 columns):
 #   Column       N

In [7]:
print(x_train.describe().T)
print(x_test.describe().T)

        count       mean        std   min     25%    50%       75%       max
pclass  712.0   2.289326   0.842272  1.00   1.000   3.00   3.00000    3.0000
age     566.0  29.824205  14.491500  0.67  21.000  28.75  39.00000   80.0000
sibsp   712.0   0.536517   1.086324  0.00   0.000   0.00   1.00000    8.0000
parch   712.0   0.375000   0.803364  0.00   0.000   0.00   0.00000    6.0000
fare    712.0  31.699613  47.084052  0.00   7.925  14.50  31.06875  512.3292
        count       mean        std   min      25%   50%   75%       max
pclass  179.0   2.385475   0.808669  1.00   2.0000   3.0   3.0    3.0000
age     148.0  29.220743  14.699227  0.42  19.7500  27.5  37.0   74.0000
sibsp   179.0   0.469274   1.167397  0.00   0.0000   0.0   1.0    8.0000
parch   179.0   0.407821   0.818429  0.00   0.0000   0.0   0.0    5.0000
fare    179.0  34.211312  59.050296  0.00   7.8667  13.0  30.5  512.3292


In [8]:
print(x_train.describe(include='object').T)
print(x_test.describe(include='object').T)
print(x_train.describe(include='category').T)
print(x_test.describe(include='category').T)

            count unique          top freq
sex           712      2         male  464
embarked      710      3            S  513
who           712      3          man  435
embark_town   710      3  Southampton  513
alive         712      2           no  439
            count unique          top freq
sex           179      2         male  113
embarked      179      3            S  131
who           179      3          man  102
embark_town   179      3  Southampton  131
alive         179      2           no  110
      count unique    top freq
class   712      3  Third  385
deck    165      7      C   47
      count unique    top freq
class   179      3  Third  106
deck     38      7      C   12


In [9]:
print(y_train.value_counts())

target
0         439
1         273
Name: count, dtype: int64


In [10]:
# 3. Preprocessing and splitting
# Steps: 1) missing values, 2) outliers, 3) feature processing.
# Start by counting missing values per column in every frame.
print(x_train.isnull().sum())
print(x_test.isnull().sum())
print(y_train.isnull().sum())

pclass           0
sex              0
age            146
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           547
embark_town      2
alive            0
alone            0
dtype: int64
pclass           0
sex              0
age             31
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           141
embark_town      0
alive            0
alone            0
dtype: int64
target    0
dtype: int64


In [11]:
# Remove redundant columns.
# 'class', 'embark_town' and 'alive' are marked as duplicates in the data
# description above ('alive' is the survival outcome itself); 'deck' is
# dropped together with them.
dropped_columns = ['class', 'embark_town', 'alive', 'deck']
x_train = x_train.drop(dropped_columns, axis=1)
x_test = x_test.drop(columns=dropped_columns)

In [12]:
# 변수제거 확인
print(x_train.info())
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 313 to 205
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      712 non-null    int64  
 1   sex         712 non-null    object 
 2   age         566 non-null    float64
 3   sibsp       712 non-null    int64  
 4   parch       712 non-null    int64  
 5   fare        712 non-null    float64
 6   embarked    710 non-null    object 
 7   who         712 non-null    object 
 8   adult_male  712 non-null    bool   
 9   alone       712 non-null    bool   
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 51.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 517 to 41
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      179 non-null    int64  
 1   sex         179 non-null    object 
 2   age         148 non-null    float64
 3   sibsp       179 no

In [13]:
# Fill missing values using statistics computed on the training data only.
med_age = x_train['age'].median()
x_train['age'] = x_train['age'].fillna(med_age)
x_test['age'] = x_test['age'].fillna(med_age)

# mode() returns a Series (several values can tie), so take its first entry.
mode_et = x_train['embarked'].mode()
x_train['embarked'] = x_train['embarked'].fillna(mode_et[0])
# Apply the same fill to the test set so a missing 'embarked' there cannot
# reach the encoding step unhandled.
x_test['embarked'] = x_test['embarked'].fillna(mode_et[0])

In [14]:
# 결측치 대체 여부 확인
print(x_train.isnull().sum())
print(x_test.isnull().sum())

pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
who           0
adult_male    0
alone         0
dtype: int64
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
who           0
adult_male    0
alone         0
dtype: int64


In [15]:
# Feature processing (one-hot encoding)
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
# The two frames are encoded independently, so a category present in only one
# of them would give different column sets. Align the test frame to the train
# layout: columns missing from test are added as 0, extra test columns are
# dropped, and the column order follows x_train.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
print(x_train.info())
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 313 to 205
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      712 non-null    int64  
 1   age         712 non-null    float64
 2   sibsp       712 non-null    int64  
 3   parch       712 non-null    int64  
 4   fare        712 non-null    float64
 5   adult_male  712 non-null    bool   
 6   alone       712 non-null    bool   
 7   sex_female  712 non-null    bool   
 8   sex_male    712 non-null    bool   
 9   embarked_C  712 non-null    bool   
 10  embarked_Q  712 non-null    bool   
 11  embarked_S  712 non-null    bool   
 12  who_child   712 non-null    bool   
 13  who_man     712 non-null    bool   
 14  who_woman   712 non-null    bool   
dtypes: bool(10), float64(2), int64(3)
memory usage: 40.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 517 to 41
Data columns (total 15 columns):
 #   Column      Non-Null Count  D

In [18]:
def dtypechange(dummies):
    """Convert every bool column of ``dummies`` to ``np.uint8``.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or as an expression.
    Columns of any other dtype are left untouched.
    """
    for name in list(dummies.columns):
        column = dummies[name]
        if column.dtype != bool:
            continue
        dummies[name] = column.astype(np.uint8)
    return dummies

In [19]:
x_train = dtypechange(x_train)
x_test = dtypechange(x_test)
print(x_train.info())
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 313 to 205
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      712 non-null    int64  
 1   age         712 non-null    float64
 2   sibsp       712 non-null    int64  
 3   parch       712 non-null    int64  
 4   fare        712 non-null    float64
 5   adult_male  712 non-null    uint8  
 6   alone       712 non-null    uint8  
 7   sex_female  712 non-null    uint8  
 8   sex_male    712 non-null    uint8  
 9   embarked_C  712 non-null    uint8  
 10  embarked_Q  712 non-null    uint8  
 11  embarked_S  712 non-null    uint8  
 12  who_child   712 non-null    uint8  
 13  who_man     712 non-null    uint8  
 14  who_woman   712 non-null    uint8  
dtypes: float64(2), int64(3), uint8(10)
memory usage: 40.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 517 to 41
Data columns (total 15 columns):
 #   Column      Non-Null Count  

In [20]:
# Advanced version
# Snapshot the preprocessed frames before x_train / y_train are overwritten by
# the train/validation split; the grid-search section fits on these copies.
x_train_ad = x_train.copy()
x_test_ad = x_test.copy()
y_train_ad = y_train.copy()

In [21]:
# (참고사항) 원핫인코딩 후 변수의 수가 다른 경우
# => x_test의 변수의 수가 x_train보다 많은 경우 (혹은 그 반대의 경우)
# 원핫인코딩 후 Feature 수가 다를 경우
# 해결방법(x_test의 변수의 수가 더 많은 경우의 코드)
# x_train = x_train.reindex(columns=x_test.columns, fill_value=0)

In [22]:
# Split the data
# Carve a 20% validation set out of the training data, stratified on the
# 'target' label. x_train / y_train are overwritten with the smaller
# training part, and y_train becomes the 1-D 'target' column.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train['target'],
                                                  test_size=0.2,
                                                  stratify=y_train['target'],
                                                  random_state=2024)
# Print the shapes of the resulting parts.
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(569, 15)
(143, 15)
(569,)
(143,)


In [23]:
# 4. Modeling and evaluation
# Baseline: a random forest with default hyperparameters; random_state is
# fixed so repeated runs build the same forest.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=2024)
model.fit(x_train, y_train)

In [24]:
y_pred = model.predict(x_val)

In [25]:
# recall_score and precision_score are imported but not used in this cell.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
# Score the validation predictions.
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
# NOTE(review): y_pred holds hard 0/1 labels, so this AUC reflects a single
# operating point. Passing model.predict_proba(x_val)[:, 1] would score the
# full probability ranking instead; the recorded output is the label-based value.
auc = roc_auc_score(y_val, y_pred)

In [26]:
print(acc)
print(f1)
print(auc)

0.8461538461538461
0.7884615384615384
0.8272727272727273


In [27]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, y_pred)
print(cm)
# Layout of the printed matrix (rows = actual, columns = predicted):
#            predicted
#            0   1
# actual 0   TN  FP
#        1   FN  TP

[[80  8]
 [14 41]]


In [29]:
# What if we evaluate on the actual held-out test set?
y_pred_f = model.predict(x_test)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
acc_f = accuracy_score(y_test, y_pred_f)
f1_f = f1_score(y_test, y_pred_f)
# NOTE(review): as with the validation metrics, this AUC is computed from hard
# labels rather than from predict_proba scores.
auc_f = roc_auc_score(y_test, y_pred_f)

In [30]:
print(acc_f)
print(f1_f)
print(auc_f)

0.7932960893854749
0.7299270072992701
0.7805006587615284


In [31]:
# Advanced version
# (Caution) Total run time is limited to 1 minute, so aim for under 30 seconds where possible.
# Hyperparameter optimization with GridSearchCV
# GridSearch : exhaustive search over a grid of parameter values
# CV = cross-validation
# (Note) No separate train/validation split is needed; cross-validation does the splitting.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 3 x 3 x 3 = 27 parameter combinations.
rf_params = {'n_estimators':[30,70,100],
             'max_depth':[6,8,10],
             'min_samples_leaf':[1,2,3]
            }
# n_estimators : number of trees
# max_depth : maximum depth of each tree
# min_samples_leaf : minimum number of samples required at a leaf node
# (a split is only considered if it leaves at least this many samples in each branch)

# Create the RandomForestClassifier and run GridSearchCV with 10-fold CV
# (27 combinations x 10 folds = 270 fits).
rf = RandomForestClassifier(random_state=2024)
grid_rf = GridSearchCV(rf, param_grid=rf_params, cv=10)
grid_rf.fit(x_train_ad, y_train_ad['target'])
# Caution: y must be passed as a 1-D object, hence the column rather than the frame.

# Report the best parameter combination and its score.
print('최적 하이퍼파라미터 : ', grid_rf.best_params_)
print('Best 예측정확도 : ', grid_rf.best_score_)

최적 하이퍼파라미터 :  {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 30}
Best 예측정확도 :  0.8400039123630674


In [34]:
# (참고) help 함수를 통한 함수 사용법 확인
#help(GridSearchCV)
#help(RandomForestClassifier)

In [35]:
# Build the final random forest from the hyperparameters GridSearchCV selected.
# They are read from grid_rf.best_params_ instead of being retyped from the
# printed output, so this cell stays correct if the grid or the data changes.
from sklearn.ensemble import RandomForestClassifier
model_h = RandomForestClassifier(random_state=2024, **grid_rf.best_params_)
# y must be 1-D, so pass the 'target' column rather than the whole frame.
model_h.fit(x_train_ad, y_train_ad['target'])

In [36]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
y_pred_h = model_h.predict(x_test)

In [37]:
# Test-set metrics for the tuned model.
acc_h = accuracy_score(y_test, y_pred_h)
f1_h = f1_score(y_test, y_pred_h)
# NOTE(review): AUC is computed from hard labels here as well; use
# model_h.predict_proba(x_test)[:, 1] for a probability-based AUC.
auc_h = roc_auc_score(y_test, y_pred_h)

# Tuned model first, then the baseline model's test metrics for comparison.
print(acc_h)
print(f1_h)
print(auc_h)
print('-------------')
print(acc_f)
print(f1_f)
print(auc_f)

0.8044692737430168
0.7244094488188977
0.7787878787878787
-------------
0.7932960893854749
0.7299270072992701
0.7805006587615284


In [38]:
# 5. Submit predictions
# (Caution) The submission must be the model's predictions for x_test.
y_result = model_h.predict(x_test)
print(y_result[:5])

# Per-class probabilities: column 0 and column 1 of predict_proba.
y_result_prob = model_h.predict_proba(x_test)
print(y_result_prob[:5])

# Collect the predicted label and both probabilities in one frame.
result = pd.DataFrame({'result':y_result,
                       'prob_0':y_result_prob[:,0],
                       'prob_1':y_result_prob[:,1]})
print(result[:5])

[0 1 1 1 1]
[[0.85922147 0.14077853]
 [0.37895882 0.62104118]
 [0.18422222 0.81577778]
 [0.         1.        ]
 [0.07349776 0.92650224]]
   result    prob_0    prob_1
0       0  0.859221  0.140779
1       1  0.378959  0.621041
2       1  0.184222  0.815778
3       1  0.000000  1.000000
4       1  0.073498  0.926502


In [39]:
pd.DataFrame({'result':y_result}).to_csv('titanic_classify_exercise.csv', index=False)

In [40]:
df = pd.read_csv('titanic_classify_exercise.csv')
print(df.head())

   result
0       0
1       1
2       1
3       1
4       1
