In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the dataset from the CSV file in the working directory into a DataFrame.
df_dropna = pd.read_csv(filepath_or_buffer='fixed_dropna.csv')

In [None]:
# Encode the categorical 'Origin' column as integer codes.
mapping = {'river': 0, 'lake': 1, 'ocean': 2}

# Use a per-value lookup instead of Series.replace(mapping): replace() on an
# object column relies on silent downcasting, which pandas has deprecated
# (it emits a FutureWarning). Values that are not keys of `mapping` pass
# through unchanged, so re-running this cell on already-encoded data is a no-op.
df_dropna['Origin'] = df_dropna['Origin'].map(lambda origin: mapping.get(origin, origin))

# 'Unnamed: 0' is presumably a leftover index column from an earlier export — TODO confirm.
# errors='ignore' keeps the cell re-runnable once the column has already been dropped.
df_dropna = df_dropna.drop(columns=['Unnamed: 0'], errors='ignore')

  df_dropna['Origin'] = df_dropna['Origin'].replace(mapping)


In [None]:
# Preview the first five rows of the frame.
df_dropna.head(n=5)

Unnamed: 0,Origin,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Hardness,ph,Survivability
0,1,17978.98634,6.5466,310.135738,398.410813,11.558279,3.496447,4.075075,181.101509,9.092223,0
1,1,28748.68774,7.544869,326.678363,280.467916,8.399735,4.023884,2.559708,188.313324,5.584087,0
2,1,28749.71654,7.513408,393.663395,283.651634,13.789695,8.899454,2.672989,248.071735,10.223862,0
3,0,13672.09176,4.563009,303.309771,474.607645,12.363817,4.155727,4.401425,203.361523,8.635849,0
4,2,25484.50849,9.0772,404.041635,563.885481,17.927806,4.290139,4.370562,227.231469,11.180284,0


In [None]:
# Separate the target column from the feature columns.
y_DN = df_dropna['Survivability']  # target data
X_DN = df_dropna.drop('Survivability', axis=1)  # feature data

# Hold out 20% of the rows for testing (train : test = 8 : 2) with a fixed seed.
X_train_DN, X_test_DN, y_train_DN, y_test_DN = train_test_split(
    X_DN,
    y_DN,
    test_size=0.2,
    random_state=42,
)

Linear Regression

In [None]:
# Fit a linear regression (with intercept) on the unscaled training features.
LRmodel = LinearRegression(fit_intercept=True).fit(X_train_DN, y_train_DN)

# Continuous predictions for the held-out rows; they are thresholded in the next cell.
y_pred_DN = LRmodel.predict(X_test_DN)

In [None]:
# Model evaluation
# Turn the continuous regression outputs into 0/1 labels: 1 when the prediction exceeds 0.5.
y_pred_DN = (y_pred_DN > 0.5).astype(int)

lr_accuracy = accuracy_score(y_test_DN, y_pred_DN)
lr_report = classification_report(y_test_DN, y_pred_DN)

print("Accuracy:", lr_accuracy)
print("Linear Regression Classification Report:")
print(lr_report)

Accuracy: 0.6335078534031413
Linear Regression Classification Report:
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       239
           1       0.80      0.03      0.05       143

    accuracy                           0.63       382
   macro avg       0.72      0.51      0.41       382
weighted avg       0.69      0.63      0.50       382



Logistic Regression

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted scaler transforms both the training and the test split.
scaler = StandardScaler().fit(X_train_DN)
X_train_scaled = scaler.transform(X_train_DN)
X_test_scaled = scaler.transform(X_test_DN)

# One-vs-rest wrapper around a default-configured LogisticRegression.
Logistic_model = OneVsRestClassifier(LogisticRegression()).fit(X_train_scaled, y_train_DN)
y_pred_DN_logi = Logistic_model.predict(X_test_scaled)

# Report accuracy and the per-class metrics on the test split.
ovr_accuracy = accuracy_score(y_test_DN, y_pred_DN_logi)
ovr_report = classification_report(y_test_DN, y_pred_DN_logi)

print("Accuracy:", ovr_accuracy)
print("Logistic Regression Classification Report:")
print(ovr_report)

Accuracy: 0.6335078534031413
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       239
           1       0.80      0.03      0.05       143

    accuracy                           0.63       382
   macro avg       0.72      0.51      0.41       382
weighted avg       0.69      0.63      0.50       382



In [None]:
# Cast the targets to int up front so the fit and the metrics use integer labels.
y_train_int = y_train_DN.astype(int)
y_test_int = y_test_DN.astype(int)

# Plain logistic regression on the standardized features (lbfgs solver, up to 1000 iterations).
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train_int
)

y_pred_lr = logreg.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test_int, y_pred_lr))
print("Classification Report:\n", classification_report(y_test_int, y_pred_lr))

Accuracy: 0.6335078534031413
Classification Report:
               precision    recall  f1-score   support

           0       0.63      1.00      0.77       239
           1       0.80      0.03      0.05       143

    accuracy                           0.63       382
   macro avg       0.72      0.51      0.41       382
weighted avg       0.69      0.63      0.50       382



Neural Network

In [None]:
# Neural network: multi-layer perceptron with two hidden layers of 100 units each.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train_DN.astype(int))

y_pred_NN = mlp.predict(X_test_scaled)

# Score against the integer-cast test labels.
nn_true_labels = y_test_DN.astype(int)
print("Accuracy:", accuracy_score(nn_true_labels, y_pred_NN))
print("Classification Report:\n", classification_report(nn_true_labels, y_pred_NN))

Accuracy: 0.6649214659685864
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.76      0.74       239
           1       0.56      0.51      0.53       143

    accuracy                           0.66       382
   macro avg       0.64      0.63      0.64       382
weighted avg       0.66      0.66      0.66       382



Random Forest

In [None]:
# Random forest classifier
# 100 trees with a fixed random_state; trained on the same standardized
# features as the other classifiers in this notebook.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train_DN.astype(int))

# Predict the held-out rows and report accuracy plus per-class metrics.
y_pred_RF = rf_classifier.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test_DN.astype(int), y_pred_RF))
print("Classification Report:\n", classification_report(y_test_DN.astype(int), y_pred_RF))

Accuracy: 0.7120418848167539
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.89      0.79       239
           1       0.69      0.42      0.52       143

    accuracy                           0.71       382
   macro avg       0.70      0.65      0.66       382
weighted avg       0.71      0.71      0.69       382

