In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import train_test_split # 데이터 스플릿용
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀분석모델 사용
from sklearn.metrics import accuracy_score # Accuracy 측정하려고
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [None]:
# Load the Titanic training data (the author noted 891 rows x 12 columns).
# Notes recorded from train.info() / train.isnull().sum():
#   - object columns: Name, Sex, Ticket, Cabin, Embarked
#   - missing values: Age 177, Cabin 687, Embarked 2
# Sex is encoded numerically right away (male -> 0, female -> 1); Series.map
# turns any value outside this mapping into NaN.
train = pd.read_csv('train.csv').assign(
    Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1})
)

In [None]:
# Class balance of the target column.
ax = sns.countplot(data=train, x='Survived')
ax.set(title='Survived (0 = Died, 1 = Survived)', xlabel='Survived', ylabel='Count')
# Show readable names instead of the raw 0/1 tick labels.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Died', 'Survived'])
plt.show()

In [None]:
# Passenger counts per Sex code (0 = male, 1 = female), split by outcome.
ax = sns.countplot(data=train, x='Sex', hue='Survived')
ax.set(title='Survival by Sex', xlabel='Sex', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Passenger counts per SibSp value, split by outcome.
ax = sns.countplot(data=train, x='SibSp', hue='Survived')
ax.set(title='Survival by SibSp', xlabel='SibSp', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Passenger counts per Parch value, split by outcome.
ax = sns.countplot(data=train, x='Parch', hue='Survived')
ax.set(title='Survival by Parch', xlabel='Parch', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Passenger counts per Pclass value, split by outcome.
ax = sns.countplot(data=train, x='Pclass', hue='Survived')
ax.set(title='Survival by Pclass', xlabel='Pclass', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Impute missing Age in two passes; each pass fills NaN with the mean Age of
# the passenger's group:
#   1. groups sharing Sex, SibSp and Parch (the author noted 7 NaN remained);
#   2. groups sharing Sex and Pclass (the author noted no NaN remained).
# A group whose ages are all missing has a NaN mean, so its rows stay NaN and
# are picked up by the next pass.
for group_keys in (['Sex', 'SibSp', 'Parch'], ['Sex', 'Pclass']):
    train['Age'] = train.groupby(group_keys)['Age'].transform(
        lambda ages: ages.fillna(ages.mean())
    )
# print(train['Age'].isnull().sum())  # re-check the remaining NaN count

In [None]:
# Passenger counts per distinct (imputed) Age value, split by outcome.
ax = sns.countplot(data=train, x='Age', hue='Survived')
ax.set(title='Survival by Age', xlabel='Age', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Bucket Age into 10-year groups (integer part of Age / 10) and then drop the
# raw Age column, which is replaced by AgeGroup_10.
train = train.assign(
    AgeGroup_10=train['Age'].floordiv(10).astype(int)
).drop(columns=['Age'])

# Passenger counts per age group, split by outcome.
ax = sns.countplot(data=train, x='AgeGroup_10', hue='Survived')
ax.set(title='Survival by AgeGroup_10', xlabel='AgeGroup_10', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Replace missing Embarked values with the most frequent port.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on train['Embarked'] is a chained in-place update
# that acts on a temporary Series under pandas Copy-on-Write and leaves
# `train` unchanged (pandas 2.x only emits a FutureWarning, which this
# notebook silences).
most_common_embarked = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(most_common_embarked)
assert train['Embarked'].notna().all(), "Embarked still contains NaN after imputation"
# train.info() # Cabin 204 non-null; Name, Ticket, Cabin, Embarked are object columns

# One-hot encode Embarked; drop_first=True omits the first category's column.
embarked_dummies = pd.get_dummies(train['Embarked'], prefix='Embarked', drop_first=True)
# Append the dummy columns to the training frame.
train = pd.concat([train, embarked_dummies], axis=1)
# Drop the original Embarked column now that it is encoded.
train = train.drop(['Embarked'], axis=1)

# Passenger counts per Embarked_Q flag, split by outcome.
sns.countplot(x='Embarked_Q', hue='Survived', data=train)
plt.title('Survival by Embarked_Q')
plt.xlabel('Embarked_Q')
plt.ylabel('Count')
plt.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Passenger counts per Embarked_S flag, split by outcome.
ax = sns.countplot(data=train, x='Embarked_S', hue='Survived')
ax.set(title='Survival by Embarked_S', xlabel='Embarked_S', ylabel='Count')
ax.legend(title='Survived', labels=['Died', 'Survived'])
plt.show()

In [None]:
# Notes recorded while inspecting train at this point (891 rows x 15 columns):
#   - Name, Ticket and Cabin are still object columns
#   - Cabin has only 204 non-null values (687 NaN)
# Cabin is dropped because it is mostly missing; Name is dropped as well.
train = train.drop(columns=['Cabin', 'Name'])
train.info()

In [None]:
# Number of passengers per distinct Ticket value.
ticket_counts = train.loc[:, 'Ticket'].value_counts()

In [None]:
# Extract the ticket prefix: the first whitespace-separated token of the
# ticket, or 'NONE' when the whole ticket consists of digits only.
train['TicketPrefix'] = [
    'NONE' if ticket.isdigit() else ticket.split()[0]
    for ticket in train['Ticket']
]

# One-hot encode the prefix and append the resulting Ticket_* columns.
ticket_prefix_dummies = pd.get_dummies(train['TicketPrefix'], prefix='Ticket')
train = train.join(ticket_prefix_dummies)

# train.info()
# The raw Ticket and the intermediate TicketPrefix columns are no longer needed.
train = train.drop(columns=['Ticket', 'TicketPrefix'])
train.info()

In [None]:
# Absolute correlation of every column with Survived, strongest first.
corr = (
    train.corr()
    .loc[:, 'Survived']
    .abs()
    .sort_values(ascending=False)
)
print(corr)  # inspect each coefficient
# Position 0 is skipped as Survived itself; keep the next 14 columns.
best = corr.iloc[1:15].index
print(best)

In [None]:
# Pairwise correlation between the selected features, to spot redundant ones.
repeat_reduce = train.loc[:, best].corr()

# seaborn provides the annotated heatmap.
sns.heatmap(data=repeat_reduce, cmap="coolwarm", annot=True, fmt=".2f")
plt.show()

In [None]:
# Author's note: no redundant features were apparent; check for NaN values.
# Feature matrix (the selected columns) and target vector.
X = train.loc[:, best]
y = train.loc[:, 'Survived']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression with default settings (fit returns the estimator).
S_model = LogisticRegression().fit(train_X, train_y)

# Evaluate accuracy on the held-out rows.
predict_y = S_model.predict(test_X)
accuracy = accuracy_score(y_true=test_y, y_pred=predict_y)
print(f"\n\n\nLogistic Regression Model's Accuracy : {accuracy:.4f}\n\n\n")

In [None]:
# Load the real test set; PassengerId is kept aside for the submission file.
test = pd.read_csv('test.csv')
test_ids = test.loc[:, 'PassengerId']

# Notes recorded from inspecting the test set:
#   - object columns: Name, Sex, Ticket, Cabin, Embarked
#   - missing values: Age 86, Fare 1, Cabin 327

print(test.loc[:, 'Age'].value_counts())

In [None]:
# Preprocessing, mirroring the training data: encode Sex as male -> 0, female -> 1.
test = test.assign(Sex=test['Sex'].map({'male': 0, 'female': 1}))

In [None]:
# Impute missing Age in two passes, filling NaN with the mean Age of the
# passenger's group: first groups sharing Sex, SibSp and Parch, then groups
# sharing Sex and Pclass for anything still missing.
for group_keys in (['Sex', 'SibSp', 'Parch'], ['Sex', 'Pclass']):
    test['Age'] = test.groupby(group_keys)['Age'].transform(
        lambda ages: ages.fillna(ages.mean())
    )

# Bucket Age into 10-year groups and drop the raw Age column.
test = test.assign(
    AgeGroup_10=test['Age'].floordiv(10).astype(int)
).drop(columns=['Age'])

# Remaining missing values per column (displayed as the cell output).
test.isna().sum()

In [None]:
# Replace missing Embarked values with the most frequent port of the test set.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on test['Embarked'] is a chained in-place update
# that acts on a temporary Series under pandas Copy-on-Write and leaves
# `test` unchanged (pandas 2.x only emits a FutureWarning, which this
# notebook silences).
most_common_embarked = test['Embarked'].mode()[0]
test['Embarked'] = test['Embarked'].fillna(most_common_embarked)
# One-hot encode Embarked; drop_first=True omits the first category's column.
embarked_dummies = pd.get_dummies(test['Embarked'], prefix='Embarked', drop_first=True)
# Append the dummy columns to the test frame.
test = pd.concat([test, embarked_dummies], axis=1)
# Drop the original Embarked column now that it is encoded.
test = test.drop(['Embarked'], axis=1)
# Cabin is dropped because it is mostly missing; Name is dropped as well.
test = test.drop(['Cabin','Name'], axis=1)
# Extract the ticket prefix: the first whitespace-separated token, or 'NONE'
# when the ticket consists of digits only.
test['TicketPrefix'] = test['Ticket'].apply(lambda x: x.split()[0] if not x.isdigit() else 'NONE')
# One-hot encode the ticket prefix and append the Ticket_* columns.
ticket_prefix_dummies = pd.get_dummies(test['TicketPrefix'], prefix='Ticket')
test = pd.concat([test, ticket_prefix_dummies], axis=1)

In [None]:
# Fill the missing Fare in the test set (the author noted 1 NaN) with the
# most frequent fare.
# The result is assigned back to the column: fillna(..., inplace=True) on
# test['Fare'] is a chained in-place update that does not modify `test` under
# pandas Copy-on-Write, which would leave a NaN in the Fare column.
most_common_Fare = test['Fare'].mode()[0]
test['Fare'] = test['Fare'].fillna(most_common_Fare)
assert test['Fare'].notna().all(), "Fare still contains NaN after imputation"

In [None]:
# Select the model's feature columns (X.columns) from the test frame.
# The one-hot columns were built separately for train and test, so a category
# that occurs only in train (e.g. a rare ticket prefix) has no column in test,
# and test[X.columns] would raise KeyError. Such a dummy is legitimately all
# zero for every test passenger, so it is added as 0. Any other missing
# feature is a real preprocessing mismatch and is reported instead of hidden.
missing_cols = X.columns.difference(test.columns)
unexpected_missing = [
    col for col in missing_cols
    if not col.startswith(('Ticket_', 'Embarked_'))
]
if unexpected_missing:
    raise KeyError(f"test data is missing feature columns: {unexpected_missing}")
test_X = test.reindex(columns=X.columns, fill_value=0)

# Predict survival for the test passengers.
submission_preds = S_model.predict(test_X)

# Build the submission table: one row per PassengerId with its prediction.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': submission_preds
})
# Alternative output path, left disabled:
# submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.to_csv('submission.csv', index=False)
print("submission.csv 저장 완료!")