In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

# Create the LabelEncoder object (re-fitted for each categorical column below).
encoder = LabelEncoder()

# 1. Load the data
file_path = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/refs/heads/main/titanic.csv"
df = pd.read_csv(file_path)

# 2. Check for missing values (per-column NaN counts)
print(df.isnull().sum())
print("================================================================")

# 3. Handle missing values (missing Age is replaced by the mean age).
# The filled column is assigned back rather than calling
# fillna(inplace=True) on df['Age']: the in-place call operates on a column
# selection, which is deprecated chained assignment and does not update df
# when pandas Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Drop every row that still contains a missing value in any column.
# NOTE(review): the null counts printed in step 2 decide how many rows survive
# this call; if a column such as Cabin is mostly empty, most rows are removed
# here. Confirm this is intended.
df.dropna(inplace=True)
print(df)
print("================================================================")

# 4. Inspect the label distribution (survivor counts)
print(df['Survived'].value_counts())
print("================================================================")

# 5. Drop columns that are not needed
df.drop(["Name", "Ticket", "Embarked"], axis=1, inplace=True)
print(df)
print("================================================================")

# 6. Encoding: convert every remaining object-dtype column to integer codes so
# that all values are numeric.
for column in df.select_dtypes(include=['object']).columns:
    # fit_transform re-fits the shared encoder, so each column gets its own
    # independent code mapping.
    df[column] = encoder.fit_transform(df[column])
print(df)
print("================================================================")

# Seed shared by the data split and the randomized estimators so that the
# printed metrics are reproducible from run to run.
RANDOM_STATE = 42

# Select the features and the target
X = df.drop('Survived', axis=1)
Y = df["Survived"]

# Split into training and test data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=RANDOM_STATE
)

# Define the models. The tree-based estimators are randomized, so they get the
# same seed as the split; without it their scores change on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Machine': SVR(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsRegressor(),
}

for model_name, model in models.items():
    model.fit(X_train, y_train)  # train
    y_pred = model.predict(X_test)  # predict

    # The regressors output continuous values; threshold them at 0.5 into
    # 0/1 so that accuracy and the confusion matrix can be computed.
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)

    print(f"{model_name} 모델")
    print("정확도(Accuracy):", accuracy_score(y_test, y_pred_binary))
    print("혼동 행렬(Confusion Matrix):\n", confusion_matrix(y_test, y_pred_binary))
    print("="*50)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
     PassengerId  Survived  Pclass  \
1              2         1       1   
3              4         1       1   
6              7         0       1   
10            11         1       3   
11            12         1       1   
..           ...       ...     ...   
871          872         1       1   
872          873         0       1   
879          880         1       1   
887          888         1       1   
889          890         1       1   

                                                  Name     Sex   Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
6                              McCarthy, Mr. Timothy J    male  5