In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression,SGDRegressor, SGDClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import PolynomialFeatures
import time
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

In [51]:
# 데이터 로드
data = pd.read_csv('Stateofsewage.csv')

In [52]:
# 위도와 경도를 표준화
scaler = StandardScaler()
data[['Latitude', 'Longitude']] = scaler.fit_transform(data[['Geographical Location (Latitude)', 'Geographical Location (Longitude)']])

In [53]:


# 라벨 데이터 매핑 (Good -> 2, Moderate -> 1, Poor -> 0)
label_mapping = {'Good': 2, 'Moderate': 1, 'Poor': 0}
data['Label'] = data['State of Sewage System'].map(label_mapping)

In [54]:
y_regression = data[['Nitrogen (mg/L)', 'Phosphorus (mg/L)']]
y_classification = data['Label']

In [55]:
X_direct = y_regression.values  # Nitrogen과 Phosphorus를 입력 데이터로 사용
y_direct = y_classification

# 데이터 분리
X_train_direct, X_test_direct, y_train_direct, y_test_direct = train_test_split(
    X_direct, y_direct, test_size=0.2, random_state=42
)

softmax_direct_model = SGDClassifier(loss='log_loss', learning_rate='constant', eta0=0.01, random_state=42)
start_time_softmax = time.time()

# Softmax 학습 (Mini-batch SGD)
for i in range(0, len(X_train_direct), 64):
    X_batch = X_train_direct[i:i + 64]
    y_batch = y_train_direct.iloc[i:i + 64]
    softmax_direct_model.partial_fit(X_batch, y_batch, classes=np.unique(y_train_direct))

end_time_softmax = time.time()
softmax_train_time = end_time_softmax - start_time_softmax

# Softmax 테스트
softmax_pred = softmax_direct_model.predict(X_test_direct)
softmax_accuracy = accuracy_score(y_test_direct, softmax_pred)
softmax_report = classification_report(y_test_direct, softmax_pred, target_names=['Poor', 'Moderate', 'Good'])

print("--- Softmax Regression ---")
print(f"Test Accuracy: {softmax_accuracy:.4f}")
print(softmax_report)


--- Softmax Regression ---
Test Accuracy: 0.5029
              precision    recall  f1-score   support

        Poor       0.00      0.00      0.00      1466
    Moderate       0.00      0.00      0.00      2519
        Good       0.50      1.00      0.67      4031

    accuracy                           0.50      8016
   macro avg       0.17      0.33      0.22      8016
weighted avg       0.25      0.50      0.34      8016



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# KNN 직접 분류
k = 3
knn_train_times = {}
knn_accuracies = {}
knn_reports = {}

knn_model_direct = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
start_time_knn = time.time()

# KNN 학습
knn_model_direct.fit(X_train_direct, y_train_direct)
end_time_knn = time.time()

knn_train_times[k] = end_time_knn - start_time_knn

# KNN 테스트
knn_pred = knn_model_direct.predict(X_test_direct)
knn_accuracies[k] = accuracy_score(y_test_direct, knn_pred)
knn_reports[k] = classification_report(y_test_direct, knn_pred, target_names=['Poor', 'Moderate', 'Good'])


print(f"--- KNN (k={k}) ---")
print(f"Test Accuracy: {knn_accuracies[k]:.4f}")
print(knn_reports[k])

--- KNN (k=3) ---
Test Accuracy: 0.9993
              precision    recall  f1-score   support

        Poor       1.00      1.00      1.00      1270
    Moderate       1.00      1.00      1.00      4747
        Good       1.00      1.00      1.00      1999

    accuracy                           1.00      8016
   macro avg       1.00      1.00      1.00      8016
weighted avg       1.00      1.00      1.00      8016



In [46]:
def add_noise_to_features(X, noise_level=0.1):
    noise = np.random.normal(0, noise_level, X.shape)
    return X + noise

# 라벨 일부를 변경
def add_label_noise(y, noise_ratio=0.1):
    y = y.reset_index(drop=True)  # 인덱스 재설정
    num_noisy = int(len(y) * noise_ratio)
    noisy_indices = np.random.choice(len(y), num_noisy, replace=False)
    y_noisy = y.copy()
    for idx in noisy_indices:
        possible_labels = [label for label in np.unique(y) if label != y[idx]]
        y_noisy.iloc[idx] = np.random.choice(possible_labels)
    return y_noisy

# 노이즈 추가된 데이터 생성
X_train = add_noise_to_features(X_train_direct, noise_level=0.05)  # feature 노이즈 추가
y_train = add_label_noise(y_train_direct, noise_ratio=0.1)  # 라벨 노이즈 추가


In [49]:
# KNN 직접 분류
k = 3
knn_model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
start_time_knn = time.time()

# KNN 학습
knn_model.fit(X_train, y_train)
end_time_knn = time.time()

# KNN 테스트
knn_pred = knn_model.predict(X_test_direct)
accuracy = accuracy_score(y_test_direct, knn_pred)
report = classification_report(y_test_direct, knn_pred, target_names=['Poor', 'Moderate', 'Good'])

print(f"--- KNN (k={k}) ---")
print(f"Training Time: {end_time_knn - start_time_knn:.4f} seconds")
print(f"Test Accuracy: {accuracy:.4f}")
print(report)


--- KNN (k=3) ---
Training Time: 0.0286 seconds
Test Accuracy: 0.9672
              precision    recall  f1-score   support

        Poor       0.88      0.99      0.93      1270
    Moderate       0.99      0.97      0.98      4747
        Good       0.98      0.96      0.97      1999

    accuracy                           0.97      8016
   macro avg       0.95      0.97      0.96      8016
weighted avg       0.97      0.97      0.97      8016

