# 09.09
- 지도학습
  - 분류 알고리즘
  - 회귀 알고리즘

## SVM(Support Vector Machine)

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

# Load the Titanic passenger data from the Data Science Dojo GitHub repository.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)
# Print the column schema / non-null counts, then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
# Explore the data: count the missing (NaN) values in every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [93]:
# Distinct values present in the Sex column.
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [94]:
# Distinct values present in the Embarked column (NaN is reported as its own entry).
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [95]:
# Handle missing values: Age gets the column mean, Embarked gets its most
# frequent value. Each fill value depends only on its own column.
fill_values = {
    'Age': df['Age'].mean(),
    'Embarked': df['Embarked'].mode()[0],
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)
# Re-count the missing values; Cabin is not imputed in this cell.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [96]:
# Type conversion and derived column in a single step:
#   - Age is cast to int (astype(int) drops any fractional part),
#   - Family is created as the sum of SibSp and Parch.
df = df.assign(
    Age=df['Age'].astype(int),
    Family=df['SibSp'] + df['Parch'],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Family       891 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 90.6+ KB


In [97]:
# One-hot encode the categorical Sex and Embarked columns.
onehot_sex = pd.get_dummies(df['Sex'])
onehot_embarked = pd.get_dummies(df['Embarked'])

# Make the cell idempotent: if it has already been run, df still carries the
# dummy columns from the previous run. Drop them before appending the fresh
# ones so a re-run does not create duplicate female/male/C/Q/S columns
# (duplicate labels would make later df[[...]] selections return extra columns).
# On the first run nothing matches and errors='ignore' makes the drop a no-op.
stale_dummy_columns = [*onehot_sex.columns, *onehot_embarked.columns]
df = pd.concat(
    [df.drop(columns=stale_dummy_columns, errors='ignore'), onehot_sex, onehot_embarked],
    axis=1,
)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1,True,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0,True,False,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1,True,False,False,False,True
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,0,False,True,False,False,True


In [98]:
# Categorical encoding via LabelEncoder -- alternative approach, currently disabled.
# Sex and Embarked are one-hot encoded with pd.get_dummies earlier in the notebook.
# df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
# df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

In [99]:
# Split the frame into the independent variables (features) and the
# dependent variable (target).
feature_columns = ['Pclass', 'Age', 'Fare', 'Family', 'female', 'male', 'C', 'Q', 'S']
X = df[feature_columns]
y = df['Survived']
# Plain-text dump of both objects, one after the other.
print(X, y, sep='\n')

     Pclass  Age     Fare  Family  female   male      C      Q      S
0         3   22   7.2500       1   False   True  False  False   True
1         1   38  71.2833       1    True  False   True  False  False
2         3   26   7.9250       0    True  False  False  False   True
3         1   35  53.1000       1    True  False  False  False   True
4         3   35   8.0500       0   False   True  False  False   True
..      ...  ...      ...     ...     ...    ...    ...    ...    ...
886       2   27  13.0000       0   False   True  False  False   True
887       1   19  30.0000       0    True  False  False  False   True
888       3   29  23.4500       3    True  False  False  False   True
889       1   26  30.0000       0   False   True   True  False  False
890       3   32   7.7500       0   False   True  False   True  False

[891 rows x 9 columns]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [100]:
# Correlation analysis helper
def correlation_analysis(X, y):
    """Rank the columns of ``X`` by absolute correlation with ``y``.

    A copy of ``X`` is extended with ``y`` as a column named ``'target'``
    (``X`` itself is not modified), the pairwise correlation matrix is
    computed with ``DataFrame.corr()``, and the ``'target'`` column of that
    matrix is returned without its self-correlation entry.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns to evaluate.
    y : array-like or pandas.Series
        Target values, assigned to the ``'target'`` column.

    Returns
    -------
    pandas.Series
        Absolute correlation of every feature with the target, indexed by
        feature name, named ``'target'`` and sorted in descending order.
        The sign of each correlation is discarded.
    """
    combined = X.assign(target=y)
    target_corr = combined.corr().loc[:, 'target']
    # Keep only the feature rows; the target's correlation with itself is noise.
    feature_corr = target_corr[target_corr.index != 'target']
    return feature_corr.abs().sort_values(ascending=False)
# Rank the features currently in X by absolute correlation with y (Survived)
# and show the ranking as the cell output.
correlation_results = correlation_analysis(X, y)
correlation_results

female    0.543351
male      0.543351
Pclass    0.338481
Fare      0.257307
C         0.168240
S         0.149683
Age       0.067809
Family    0.016639
Q         0.003650
Name: target, dtype: float64

In [101]:
# Re-select X based on the correlation ranking: Age, Family and Q (the three
# weakest correlations with Survived) are left out.
selected_features = ['Pclass', 'Fare', 'female', 'male', 'C', 'S']
X = df[selected_features]
# Train/test split: 30% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=11
)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)
# Choose the algorithm (instantiate the estimator). The string below is a
# Korean-language note on the C, kernel and gamma hyper-parameters of SVC;
# it is a bare string expression and has no effect at runtime.
'''
1. C (Regularization Parameter)
역할: 분류 오류(마진 위반)를 얼마나 허용할지를 조절합니다.
작동 원리:
C가 크다 → 오류를 최대한 줄이려 함 → 마진이 좁아지고, 데이터에 과적합(Overfitting)될 수 있음.
C가 작다 → 오류를 더 허용 → 마진이 넓어지고, 데이터에 덜 맞을 수 있지만 일반화 성능이 좋아질 수 있음.

2. Kernel (커널 함수)
SVM은 원래 선형 분류기이지만, 커널을 이용해 비선형 문제에도 적용할 수 있습니다.
linear: 선형 결정 경계
poly: 다항식 커널 → 차수(degree)에 따라 복잡한 경계 형성
rbf (Gaussian Radial Basis Function): 가장 많이 쓰이는 커널, 국소적 유사도를 반영
sigmoid: 신경망의 활성화 함수와 유사한 형태

3. Gamma (γ)
적용되는 경우: RBF, Poly, Sigmoid 커널에서 사용
역할: 데이터 포인트 하나가 얼마나 넓은 범위에 영향을 미칠지를 조절합니다.

작동 원리:
γ가 크다 → 한 점이 매우 좁은 영역에만 영향 → 복잡한 경계 → 과적합 위험
γ가 작다 → 한 점이 넓은 영역에 영향 → 단순한 경계 → 과소적합 위험
'''
# NOTE(review): the features are fed to the RBF SVC unscaled, and Fare spans a
# much wider range than the 0/1 dummy columns. SVMs are not scale invariant, so
# consider standardising the features (and re-tuning C / gamma) in a follow-up.
model = svm.SVC(C=10, kernel='rbf', gamma=0.01, random_state=11)
# Fit on the training split, then predict the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)
# Evaluate: confusion matrix followed by per-class precision / recall / F1.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

(623, 6)
(268, 6)
(623,)
(268,)
[[149  27]
 [ 27  65]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       176
           1       0.71      0.71      0.71        92

    accuracy                           0.80       268
   macro avg       0.78      0.78      0.78       268
weighted avg       0.80      0.80      0.80       268

