---

## 필수 라이브러리 불러오기 및 한글 폰트 설정

In [198]:
import pandas as pd
import matplotlib.pyplot as plt
import platform
from IPython.display import display

In [199]:
# Select the Korean font family for the current OS via a lookup table;
# any system other than Windows/macOS falls back to the Linux choice.
plt.rc(
    'font',
    family={
        'Windows': 'Malgun Gothic',   # Windows
        'Darwin': 'AppleGothic',      # macOS
    }.get(platform.system(), 'NanumGothic'),  # Linux
)

# Prevent the minus sign from rendering as a broken glyph
plt.rcParams['axes.unicode_minus'] = False

---

## 1-1. 데이터 불러오기

In [200]:
# URL of the CSV file
url = (
    "https://raw.githubusercontent.com/JyoARa/Machine-Learning/"
    "refs/heads/main/Project/Titanic/train.csv"
)

# Load it into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Check the first 5 rows
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [201]:
# Copy the original so later edits do not touch df
df_kor = df.copy(deep=True)

In [202]:
# Translate the column names into Korean

# Column-name mapping (English -> Korean)
columns_map = {
    "PassengerId": "승객ID",
    "Survived": "생존여부",
    "Pclass": "객실등급",
    "Name": "이름",
    "Sex": "성별",
    "Age": "나이",
    "SibSp": "형제/배우자수",
    "Parch": "부모/자녀수",
    "Ticket": "티켓번호",
    "Fare": "요금",
    "Cabin": "객실번호",
    "Embarked": "승선항구",
}

# Apply the mapping and rebind df_kor to the renamed frame
df_kor = df_kor.rename(columns=columns_map)

# Check the result
df_kor.head()

Unnamed: 0,승객ID,생존여부,객실등급,이름,성별,나이,형제/배우자수,부모/자녀수,티켓번호,요금,객실번호,승선항구
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


---

## 1-2. 데이터 분석

데이터 분석 과정은 [🏷️ Titanic Data를 활용한 데이터분석 Project](https://jyoara.tistory.com/16)를 참고해 주세요.

---

## 1-3. 데이터 전처리

In [203]:
# Separate the "생존여부" column (target) from the remaining columns (features)
df_Features = df_kor.drop("생존여부", axis="columns")
df_Target = df_kor["생존여부"]

# Preview the first rows of each; the target Series is shown as a one-column frame
display(df_Target.head().to_frame())
display(df_Features.head())

Unnamed: 0,생존여부
0,0
1,1
2,1
3,1
4,0


Unnamed: 0,승객ID,객실등급,이름,성별,나이,형제/배우자수,부모/자녀수,티켓번호,요금,객실번호,승선항구
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [204]:
# Working copy for preprocessing; df_Features itself is left unchanged
df_Features_Prep = df_Features.copy(deep=True)

In [None]:
# ================================
# 1. Drop unneeded columns
# ================================

# Shape and preview before dropping
print(f"[불필요 컬럼 제거 전] shape : {df_Features.shape}")
display(df_Features.head())

# Drop the columns starting from df_Features instead of df_Features_Prep.
# df_Features_Prep is a fresh copy of df_Features at this point, so the
# result is the same on the first run, but re-running this cell no longer
# raises KeyError for columns that an earlier run already removed.
# A misspelled column name still raises, as before.
df_Features_Prep = df_Features.drop(columns=["승객ID", "이름"])

# Shape and preview after dropping
print(f"[불필요 컬럼 제거 후] shape : {df_Features_Prep.shape}")
display(df_Features_Prep.head())

[불필요 컬럼 제거 전] shape : (891, 11)


Unnamed: 0,승객ID,객실등급,이름,성별,나이,형제/배우자수,부모/자녀수,티켓번호,요금,객실번호,승선항구
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


[불필요 컬럼 제거 후] shape : (891, 9)


Unnamed: 0,객실등급,성별,나이,형제/배우자수,부모/자녀수,티켓번호,요금,객실번호,승선항구
0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,female,35.0,1,0,113803,53.1,C123,S
4,3,male,35.0,0,0,373450,8.05,,S


In [208]:
# ================================
# 2-1. Inspect missing values
# ================================

# Number of missing entries in each column
missing_count = df_Features_Prep.isna().sum()

# Missing entries as a percentage of the total row count
missing_ratio = missing_count.div(len(df_Features_Prep)).mul(100)

# Gather both measures into one summary DataFrame
missing_df = missing_count.to_frame(name="결측치 개수")
missing_df["결측치 비율(%)"] = missing_ratio.round(2)

# Show it as a table
display(missing_df)

Unnamed: 0,결측치 개수,결측치 비율(%)
객실등급,0,0.0
성별,0,0.0
나이,177,19.87
형제/배우자수,0,0.0
부모/자녀수,0,0.0
티켓번호,0,0.0
요금,0,0.0
객실번호,687,77.1
승선항구,2,0.22
