### random_state
random_state는 train_test_split 함수에서 데이터셋을 훈련용과 테스트용으로 나눌 때 사용하는 난수 시드(seed) 값을 의미합니다. 이 값을 지정하면 데이터 분할 과정에서 무작위성이 고정되어 실행할 때마다 같은 방식으로 데이터를 나누게 되어 결과의 재현성(reproducibility)을 보장합니다.
즉, random_state는 난수 생성기의 초기값(seed)으로 사용되어 데이터 분할을 재현 가능하게 만듭니다.
* 예를 들어, random_state=11로 지정하면 데이터 셔플과 분할이 동일하게 이루어져, 여러 번 실행해도 일관된 X_train, X_test, y_train, y_test를 얻을 수 있습니다.
* 값을 지정하지 않거나 None일 경우에는 실행할 때마다 다르게 데이터를 분할합니다.
* random_state를 고정해 두면 모델 평가의 일관성을 확보할 수 있으며, 이는 디버깅을 하거나 다른 사람과 결과를 공유할 때 중요합니다.

# 랜덤포레스트(RandomForest)

타이타닉 데이터셋 설명
* PassengerId: 승객 고유 ID
* Survived: 생존 여부 (0 = 사망, 1 = 생존)
* Pclass: 티켓 클래스 (1 = 1등석, 2 = 2등석, 3 = 3등석)
* Name: 승객 이름
* Sex: 성별
* Age: 나이(몇몇 결측치 존재)
* SibSp: 배에 탑승한 형제자매 및 배우자 수
* Parch: 배에 탑승한 부모 및 자녀 수
* Ticket: 티켓 번호
* Fare: 티켓 요금
* Cabin: 객실 번호 (많은 결측치 존재)
* Embarked: 탑승한 항구 (C = Cherbourg, Q = Queenstown, S = Southampton)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# 랜덤포레스트 알고리즘 모듈
from sklearn.ensemble import RandomForestClassifier

# Load the Titanic passenger data directly from the public CSV on GitHub.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)
# Print dtypes and non-null counts, then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Count the missing values in every column.
missing_counts = df.isna().sum()
missing_counts

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Handle missing values:
#   Age      -> filled with the column mean
#   Embarked -> filled with the most frequent value (first mode)
age_fill = df['Age'].mean()
embarked_fill = df['Embarked'].mode()[0]
df['Age'] = df['Age'].fillna(age_fill)
df['Embarked'] = df['Embarked'].fillna(embarked_fill)
# Re-check the per-column missing counts after imputation.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [4]:
# Derived column: Family = siblings/spouses aboard + parents/children aboard.
df['Family'] = df['SibSp'].add(df['Parch'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Family       891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [5]:
# # One-hot encoding (alternative approach, kept for reference)
# onehot_sex = pd.get_dummies(df['Sex'])
# onehot_embarked = pd.get_dummies(df['Embarked'])
# df = pd.concat([df, onehot_sex, onehot_embarked], axis=1)

# Encode the categorical text columns as integer labels.
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted per column, so each column gets its own mapping.
for column in ('Sex', 'Embarked'):
    df[column] = LabelEncoder().fit_transform(df[column])

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2,0


In [6]:
# Split the frame into features (independent variables) and target
# (dependent variable).
# One-hot variant: X = df[['Pclass','Fare','female','male','C','S']]
feature_columns = ['Pclass', 'Fare', 'Sex', 'Age', 'Embarked', 'Family']
X = df[feature_columns]
y = df['Survived']
for part in (X, y):
    print(part)

     Pclass     Fare  Sex        Age  Embarked  Family
0         3   7.2500    1  22.000000         2       1
1         1  71.2833    0  38.000000         0       1
2         3   7.9250    0  26.000000         2       0
3         1  53.1000    0  35.000000         2       1
4         3   8.0500    1  35.000000         2       0
..      ...      ...  ...        ...       ...     ...
886       2  13.0000    1  27.000000         2       0
887       1  30.0000    0  19.000000         2       0
888       3  23.4500    0  29.699118         2       3
889       1  30.0000    1  26.000000         0       0
890       3   7.7500    1  32.000000         1       0

[891 rows x 6 columns]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [7]:
# Correlation analysis helper
def correlation_analysis(X, y):
    """Rank the columns of X by the strength of their correlation with y.

    Args:
        X: DataFrame of feature columns. It is not modified; the work is
            done on a copy.
        y: Target values, attached to the copy as a ``target`` column.

    Returns:
        Series named ``target`` and indexed by feature name, holding the
        absolute value of each feature's correlation with ``y`` (as computed
        by ``DataFrame.corr()`` with its defaults), sorted in descending
        order.
    """
    with_target = X.assign(target=y)
    target_corr = with_target.corr()['target']
    # Drop the target's self-correlation and ignore the sign.
    strength = target_corr.drop(labels='target').abs()
    return strength.sort_values(ascending=False)
# Rank the candidate features by absolute correlation with the target.
correlation_results = correlation_analysis(X=X, y=y)
correlation_results

Sex         0.543351
Pclass      0.338481
Fare        0.257307
Embarked    0.167675
Age         0.069809
Family      0.016639
Name: target, dtype: float64

In [8]:
# Keep only the three features that ranked highest in the correlation analysis.
selected_features = ['Pclass', 'Fare', 'Sex']
X = df[selected_features]

In [9]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Choose the model and train it: a random forest of 50 trees with
# max_depth=3. fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(
    n_estimators=50, max_depth=3, random_state=11
).fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Analyse the result: confusion matrix, then per-class metrics.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

(712, 3)
(179, 3)
(712,)
(179,)
[[112   6]
 [ 18  43]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       118
           1       0.88      0.70      0.78        61

    accuracy                           0.87       179
   macro avg       0.87      0.83      0.84       179
weighted avg       0.87      0.87      0.86       179



## AI 모델 저장 및 로드

In [10]:
# pickle
import pickle

# One path shared by save and load. The original saved to a relative path
# but loaded from a hard-coded absolute Windows path, which only worked when
# the working directory happened to be that exact folder.
model_file = 'model.pkl'

# Save the trained model.
with open(model_file, 'wb') as f:
    pickle.dump(model, f)

# Load it back.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open(model_file, 'rb') as f:
    my_model = pickle.load(f)

In [11]:
# Use the model restored from pickle and score it on the same test split.
y_pred2 = my_model.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred2))

[[112   6]
 [ 18  43]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       118
           1       0.88      0.70      0.78        61

    accuracy                           0.87       179
   macro avg       0.87      0.83      0.84       179
weighted avg       0.87      0.87      0.86       179



In [12]:
# joblib
# (alternative import style: from joblib import dump, load)
import joblib

joblib_path = 'model.joblib'

# Save the trained model.
joblib.dump(model, joblib_path)

# Load it back from the same file.
my_model2 = joblib.load(joblib_path)

In [13]:
# Use the model restored from joblib and score it on the same test split.
y_pred3 = my_model2.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred3))

[[112   6]
 [ 18  43]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       118
           1       0.88      0.70      0.78        61

    accuracy                           0.87       179
   macro avg       0.87      0.83      0.84       179
weighted avg       0.87      0.87      0.86       179

