In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

file_path = '/content/drive/MyDrive/00_KITA_2404/M3_분석라이브러리/pandas/dataset/titanic3.csv'
df = pd.read_csv(file_path)

In [4]:
print(df.head())

   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

     age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.00      0      0   24160  211.3375       B5        S    2    NaN   
1   0.92      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.00      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St Louis, MO  


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB
None


In [6]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [7]:
df.drop(columns=['name', 'ticket','body', 'home.dest', 'boat','cabin', 'pclass'], inplace=True)

In [8]:
print(df.isnull().sum())

survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64


In [9]:
df.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.92,1,2,151.55,S
2,0,female,2.0,1,2,151.55,S
3,0,male,30.0,1,2,151.55,S
4,0,female,25.0,1,2,151.55,S


In [10]:
# 결측치 처리
df['age'].fillna(df['age'].mean(), inplace=True)
df['fare'].fillna(df['fare'].mean(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

In [11]:
## age를 6개의 범주로
bins=[0,5,12,18,35,60,100]
labels=['Infant', 'Child', 'Teenager','Young Adult', 'Adult', 'Senior']
df['age_group']=pd.cut(df['age'], bins=bins, labels=labels)
df.drop(columns=['age'], inplace=True)

In [12]:
## fare를 3개 범주로
bins=[0,30,100,600]
labels=['Low', 'Medium', 'High']
df['fare_group']=pd.cut(df['fare'], bins=bins, labels=labels)
df.drop(columns=['fare'], inplace=True)

In [13]:
## sibsp와 parch 합쳐서 가족 규모 변수 생성
df['family_size']=df['sibsp'] + df['parch'] + 1       # +1: 자기 자신 포함
df.drop(columns=['sibsp', 'parch'], inplace=True)

In [14]:
df.head()

Unnamed: 0,survived,sex,embarked,age_group,fare_group,family_size
0,1,female,S,Young Adult,High,1
1,1,male,S,Infant,High,4
2,0,female,S,Infant,High,4
3,0,male,S,Young Adult,High,4
4,0,female,S,Young Adult,High,4


In [15]:
# 범주형 변수를 더미 변수로 변환할 열 목록
categorical_columns=['age_group', 'fare_group', 'sex', 'embarked']

# 반복문을 사용하여 범주형 변수를 더미 변수로 변환
for column in categorical_columns:
  # 첫 번째 카테고리는 drop_first=True로 제거
  df=pd.get_dummies(df, columns=[column], drop_first=True)
df.head()

Unnamed: 0,survived,family_size,age_group_Child,age_group_Teenager,age_group_Young Adult,age_group_Adult,age_group_Senior,fare_group_Medium,fare_group_High,sex_male,embarked_Q,embarked_S
0,1,1,False,False,True,False,False,False,True,False,False,True
1,1,4,False,False,False,False,False,False,True,True,False,True
2,0,4,False,False,False,False,False,False,True,False,False,True
3,0,4,False,False,True,False,False,False,True,True,False,True
4,0,4,False,False,True,False,False,False,True,False,False,True


In [19]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# 타겟 변수와 피처 변수 설정
X = df.drop('survived', axis=1)
y = df['survived']

# 학습 데이터와 테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 크기 확인
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# 랜덤 포레스트 모델 학습
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 성능 평가
print("Accuracy:", accuracy_score(y_test, y_pred))
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
#print("Classification Report:\n", classification_report(y_test, y_pred))


(1047, 11) (262, 11) (1047,) (262,)
Accuracy: 0.767175572519084


In [None]:

import pandas as pd
import numpy as np

# DataFrame을 순회하면서 숫자가 아닌 값을 출력
for index, row in df_new.iterrows():
    for column in df_new.columns:
        try:
            pd.to_numeric(row[column])
        except ValueError:
            print(f"인덱스 {index}의 {column} 컬럼에는 숫자가 아닌 값이 있습니다: {row[column]}")

## 결측값 처리