
## **인공지능개론 2주차 실습과제**

 **타이타닉 데이터** 를 가지고 **여러가지 고려사항(결측치,레이블,엔코딩)** 을 확인한뒤  
  **5가지 방법(RF, DT, LR, KNN, SVM)** 으로 분류하고 **accuracy 및 confusion matrix** 확인


### **데이터 가져오기**

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

# CSV 파일 읽기
file_path = "C:/Users/Admin/Desktop/20251학기 공부/인공지능개론/week2/titanic.csv"
df = pd.read_csv(file_path)
# 데이터 확인
print(df.head())
print(df.columns)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
In

### **결측치 확인 및 제거**

In [3]:
# 결측치 확인
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Age의 빈곳을 평균나이로 채우기
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [7]:
# 결측치 재확인
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### **레이블 확인**

In [9]:
# 레이블 확인
print(df['Survived'].value_counts())

Survived
0    549
1    342
Name: count, dtype: int64


### **불필요한 컬럼 제거**

In [11]:
# 불필요한 컬럼 제거
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])

# 데이터 재확인
print(df.head())
print(df.columns)

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare
0            1         0       3    male  22.0      1      0   7.2500
1            2         1       1  female  38.0      1      0  71.2833
2            3         1       3  female  26.0      0      0   7.9250
3            4         1       1  female  35.0      1      0  53.1000
4            5         0       3    male  35.0      0      0   8.0500
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare'],
      dtype='object')


### **엔코딩(숫자)**

In [13]:
from sklearn.preprocessing import LabelEncoder

print(df['Sex'].value_counts()) 

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 'Sex' 컬럼을 숫자로 변환
df['Sex'] = label_encoder.fit_transform(df['Sex'])

print(df['Sex'].value_counts())

Sex
male      577
female    314
Name: count, dtype: int64
Sex
1    577
0    314
Name: count, dtype: int64


### **분류**

In [15]:
# 타겟 변수와 독립 변수 분리
X = df.drop('Survived', axis=1)
y = df['Survived']

# 훈련용 데이터와 테스트용 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
from sklearn.tree import DecisionTreeClassifier

# DT 모델 생성
dt_model = DecisionTreeClassifier(random_state=42)

# 모델 학습
dt_model.fit(X_train, y_train)

# 예측
y_pred = dt_model.predict(X_test)

# 정확도 확인
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# 혼동 행렬 확인
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 75.75%
Confusion Matrix:
[[124  33]
 [ 32  79]]


In [25]:
from sklearn.ensemble import RandomForestClassifier

# RF모델 생성 (n_estimators: 트리 개수 설정)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 모델 학습
rf_model.fit(X_train, y_train)

# 예측
y_pred_rf = rf_model.predict(X_test)

# 정확도 출력
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy_rf * 100:.2f}%")

# 혼동 행렬 계산
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(cm_rf)

Accuracy: 80.97%
Confusion Matrix:
[[142  15]
 [ 36  75]]


In [27]:
from sklearn.linear_model import LogisticRegression

# LR모델 생성
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# 모델 학습
lr_model.fit(X_train, y_train)

# 예측
y_pred_lr = lr_model.predict(X_test)

# 정확도 출력
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy_lr * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:")
print(cm_lr)

Accuracy: 80.97%
Confusion Matrix:
[[138  19]
 [ 32  79]]


In [29]:
from sklearn.svm import SVC

# SVM모델 생성 (커널은 RBF 사용)
svm_model = SVC(kernel='rbf', random_state=42)

# 모델 학습
svm_model.fit(X_train, y_train)

# 예측
y_pred_svm = svm_model.predict(X_test)

# 정확도 출력
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(cm_svm)

Accuracy: 60.82%
Confusion Matrix:
[[152   5]
 [100  11]]


In [31]:
from sklearn.neighbors import KNeighborsClassifier

# KNN모델 생성
knn_model = KNeighborsClassifier(n_neighbors=5)

# 모델 학습
knn_model.fit(X_train, y_train)

# 예측
y_pred_knn = knn_model.predict(X_test)

# 정확도 출력
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy: {accuracy_knn * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix:")
print(cm_knn)


Accuracy: 64.55%
Confusion Matrix:
[[135  22]
 [ 73  38]]
