#### 의사결정나무

In [117]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [118]:
# Load the Titanic passenger table from the course data repository.
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

# Impute the gaps in one pass: column mean for 'Age', most frequent value
# for 'Embarked' (mode() returns a Series, hence the [0]).
age_mean = df['Age'].mean()
Embarked_mode = df['Embarked'].mode()[0]
df.fillna({'Age': age_mean, 'Embarked': Embarked_mode}, inplace=True)

# Replace each categorical column with integer codes; every column gets its
# own freshly fitted encoder.
for categorical in ('Sex', 'Embarked'):
    df[categorical] = LabelEncoder().fit_transform(df[categorical])

# Derived feature: element-wise sum of the SibSp and Parch columns.
df['FamilySize'] = df['SibSp'].add(df['Parch'])

X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

In [119]:
# Sanity check: show the (rows, columns) shape of each split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 6), (179, 6), (712,), (179,))

In [120]:
# Build a decision tree with a fixed random_state and fit it on the
# training split.
dt = DecisionTreeClassifier(random_state=11)
dt.fit(X_train, y_train)

In [121]:
# Predict the held-out rows and print the fraction classified correctly.
pred = dt.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)

0.7877094972067039


In [122]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions (scikit-learn convention:
# rows are true classes, columns are predicted classes).
mat = confusion_matrix(y_test, pred)
print(mat)

[[98 20]
 [18 43]]


In [123]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the test predictions.
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       118
           1       0.68      0.70      0.69        61

    accuracy                           0.79       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



#### KNN

In [124]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [125]:
# Load the iris data set from the course data repository (rebinds `df`).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

In [126]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the four measurement columns with MinMaxScaler. The scaler works
# feature by feature, so one fit over all columns yields the same values as
# fitting each column separately, and it leaves `scaler` fitted on every
# feature (re-fitting per column kept only the last column's statistics).
# NOTE(review): the scaler is fitted on the whole data set before the
# train/test split in the next cell, so the test rows influence the scaling.
# Fit on the training rows only if a strict hold-out evaluation is required.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [127]:
# Features are the four measurement columns; the target is the species label.
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [128]:
# k-nearest-neighbours classifier using the 3 closest training samples.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [129]:
# Predict the species of the held-out rows.
pred = knn.predict(X_test)

In [130]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted species matches the true one.
acc = accuracy_score(y_test, pred)
print(acc)

0.9333333333333333


In [131]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions (scikit-learn convention:
# rows are true classes, columns are predicted classes).
mat = confusion_matrix(y_test, pred)
print(mat)

[[ 9  0  0]
 [ 0 10  0]
 [ 0  2  9]]


In [132]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the test predictions.
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.83      1.00      0.91        10
   virginica       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



#### SVM

In [133]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.model_selection import train_test_split

In [134]:
# Reload the raw Titanic table (rebinds `df`, discarding earlier preprocessing).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

In [135]:
# Impute missing values and derive the family-size feature.
age_mean = df['Age'].mean()
df.fillna({'Age': age_mean}, inplace=True)

# Series.mode() returns a Series (ties produce several entries), so take the
# first entry to obtain a scalar. Passing the Series itself to fillna is
# aligned on the index: only rows whose label also appears in the mode
# Series would be filled and every other missing 'Embarked' stays NaN.
embarked_mode = df['Embarked'].mode()[0]
df.fillna({'Embarked': embarked_mode}, inplace=True)

# Derived feature: sum of the SibSp and Parch columns.
df['FamilySize'] = df['SibSp'] + df['Parch']

In [136]:
# One-hot encode the two categorical columns, then append both sets of
# indicator columns to the frame in a single concatenation.
ohe_sex = pd.get_dummies(df['Sex'])
ohe_embarked = pd.get_dummies(df['Embarked'])

df = pd.concat([df, ohe_sex, ohe_embarked], axis='columns')

In [137]:
# Numeric columns plus the indicator columns appended by the one-hot
# encoding cell ('female'/'male' from Sex, 'C'/'Q'/'S' from Embarked).
X = df[['Pclass', 'Age', 'Fare', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
y = df['Survived']

In [138]:
# Hold out 30% of the rows for evaluation (fixed seed), then show the
# shape of each split as a sanity check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((623, 9), (268, 9), (623,), (268,))

In [139]:
# Support vector classifier with an RBF kernel and otherwise default settings.
# NOTE(review): the features are passed in unscaled (no scaler is applied in
# this section); RBF kernels are distance based, so consider standardizing
# the inputs - confirm whether leaving them raw is intentional.
sv = svm.SVC(kernel='rbf')
sv.fit(X_train, y_train)

In [140]:
# Predict survival for the held-out rows.
pred = sv.predict(X_test)

In [141]:
from sklearn.metrics import accuracy_score
# Fraction of test rows classified correctly.
acc = accuracy_score(y_test, pred)
print(acc)

0.7201492537313433


In [142]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions (scikit-learn convention:
# rows are true classes, columns are predicted classes).
mat = confusion_matrix(y_test, pred)
print(mat)

[[167   7]
 [ 68  26]]


In [143]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the test predictions.
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.71      0.96      0.82       174
           1       0.79      0.28      0.41        94

    accuracy                           0.72       268
   macro avg       0.75      0.62      0.61       268
weighted avg       0.74      0.72      0.67       268



In [144]:
# Alternative SVC configurations. Each line rebinds `sv`, so only the last
# estimator survives this cell, and none of them is fitted here - after this
# cell `sv` no longer refers to the model trained above.
sv = svm.SVC(kernel='rbf')
# NOTE(review): the scikit-learn docs describe gamma as the coefficient for
# the 'rbf', 'poly' and 'sigmoid' kernels, so gamma=0.1 should have no effect
# with kernel='linear' - confirm and drop it if so.
sv = svm.SVC(kernel='linear', C=1, gamma=0.1)
sv = svm.SVC(kernel='rbf', C=0.1, gamma=0.1)

#### 로지스틱 회귀

In [145]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [146]:
# Reload the raw iris data set (rebinds `df`).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

In [147]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the four measurement columns with MinMaxScaler. The scaler works
# feature by feature, so one fit over all columns yields the same values as
# fitting each column separately, and it leaves `scaler` fitted on every
# feature (re-fitting per column kept only the last column's statistics).
# NOTE(review): the scaler is fitted on the whole data set before the
# train/test split in the next cell, so the test rows influence the scaling.
# Fit on the training rows only if a strict hold-out evaluation is required.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [148]:
# Features are the four measurement columns; the target is the species label.
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [149]:
# Logistic regression with the library's default settings, fitted on the
# training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [150]:
# Predict the species of the held-out rows.
pred = lr.predict(X_test)

In [151]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted species matches the true one.
acc = accuracy_score(y_test, pred)
print(acc)

0.8333333333333334


#### 랜덤 포레스트

In [152]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [153]:
# Reload the raw Titanic table (rebinds `df`, discarding earlier preprocessing).
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv")

In [154]:
# Compute both replacement values first, then fill the gaps in one pass:
# column mean for 'Age', most frequent value for 'Embarked' (mode() returns
# a Series, hence the [0]).
age_mean = df['Age'].mean()
Embarked_mode = df['Embarked'].mode()[0]
df.fillna({'Age': age_mean, 'Embarked': Embarked_mode}, inplace=True)

In [155]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes; every column gets its
# own freshly fitted encoder.
for categorical in ('Sex', 'Embarked'):
    df[categorical] = LabelEncoder().fit_transform(df[categorical])

In [156]:
# Derived feature: sum of the SibSp and Parch columns.
df['FamilySize'] = df['SibSp']+df['Parch']

In [157]:
# Model inputs (encoded/imputed columns plus FamilySize) and the target.
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [158]:
# Train a 50-tree forest with depth capped at 3 (fit() returns the estimator
# itself, so the call can be chained), then predict the held-out rows.
rf = RandomForestClassifier(
    n_estimators=50, max_depth=3, random_state=20
).fit(X_train, y_train)
pred = rf.predict(X_test)

In [159]:
from sklearn.metrics import accuracy_score
# Fraction of test rows classified correctly.
acc = accuracy_score(y_test, pred)
print(acc)

0.8603351955307262
