In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# Read the labelled training set and the unlabelled test set from disk.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Split the training frame into the label column and the remaining feature
# columns; the test frame is used unchanged as the feature matrix to predict on.
y = train_df['Human Waste']
X = train_df.drop(columns=['Human Waste'])
X_test = test_df

# Standardise the features. The scaler is fitted on the training features
# only, and that same fitted transform is then applied to both sets.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Candidate classifiers are collected in `models` as (display name, estimator)
# pairs. Later cells append further candidates to this list.

# Logistic regression with balanced class weights and a raised iteration cap.
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)

# Random forest of 5000 trees with balanced class weights and a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=5000,
    class_weight='balanced',
    random_state=200,
)

models = [
    ('Logistic Regression', lr_model),
    ('Random Forest', rf_model),
]

# Feed-forward neural networks (FNNs)
# The three networks differ only in their hidden-layer widths, so a single
# private builder holds the shared architecture and compile settings. The
# public factories are registered in `models` as callables (not instances):
# every call returns a newly constructed, untrained network.
def _build_fnn(hidden_units):
    """Build and compile a binary-classification feed-forward network.

    Args:
        hidden_units: Sequence of ints, one per hidden ReLU layer, giving the
            width of each layer in order. Must contain at least one entry so
            that the input shape can be attached to the first layer.

    Returns:
        A compiled ``Sequential`` model: the requested hidden layers followed
        by a single sigmoid output unit, using Adam (learning rate 0.05),
        binary cross-entropy loss and the accuracy metric.

    Note:
        The input width is read from the global ``X_scaled`` at call time.
    """
    n_features = X_scaled.shape[1]
    first_units, *other_units = hidden_units

    # Only the first layer carries the input shape.
    layers = [Dense(first_units, activation='relu', input_shape=(n_features,))]
    layers.extend(Dense(units, activation='relu') for units in other_units)
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(optimizer=Adam(learning_rate=0.05),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# 2-layer FNN
def create_2_layer_fnn():
    """Return a fresh compiled network: Dense(64) -> Dense(1, sigmoid)."""
    return _build_fnn((64,))

models.append(('2-layer FNN', create_2_layer_fnn))

# 3-layer FNN
def create_3_layer_fnn():
    """Return a fresh compiled network: Dense(64) -> Dense(32) -> Dense(1, sigmoid)."""
    return _build_fnn((64, 32))

models.append(('3-layer FNN', create_3_layer_fnn))

# 4-layer FNN
def create_4_layer_fnn():
    """Return a fresh compiled network: Dense(64) -> Dense(32) -> Dense(16) -> Dense(1, sigmoid)."""
    return _build_fnn((64, 32, 16))

models.append(('4-layer FNN', create_4_layer_fnn))

# Remaining scikit-learn candidates. Each estimator is created first and then
# all of them are registered in one step, in the order they are evaluated.
gnb_model = GaussianNB()                                   # Gaussian Naive Bayes
cart_model = DecisionTreeClassifier(random_state=250)      # CART (single decision tree)
ksvm_model = SVC(kernel='rbf', class_weight='balanced', random_state=250)  # Kernel SVM
bagging_model = BaggingClassifier(random_state=250)        # Bagging, default base estimator
qda_model = QuadraticDiscriminantAnalysis()                # QDA

models.extend([
    ('Gaussian Naive Bayes', gnb_model),
    ('CART', cart_model),
    ('Kernel SVM', ksvm_model),
    ('Bagging', bagging_model),
    ('QDA', qda_model),
])




In [20]:
# Evaluate models using 5-fold cross-validation
# Every entry in `models` is scored on the same stratified, shuffled 5-fold
# split of the training data. The entry with the highest mean F1 is kept in
# `best_model` / `best_model_name` for the final fit in the next cell.
best_model = None
# Start below any attainable F1 so that a winner is always recorded, even when
# every candidate scores exactly 0 (otherwise `best_model` would stay None and
# the final-training cell would fail).
best_f1 = float('-inf')
best_model_name = ''

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model_fn in models:
    scores = []
    for train_idx, val_idx in kfold.split(X_scaled, y):
        if 'FNN' in name:
            # FNN entries are factories: build a new, untrained network per fold.
            model = model_fn()
            model.fit(X_scaled[train_idx], y.iloc[train_idx], epochs=50, batch_size=64, verbose=0)
            # The sigmoid output is a column of probabilities: threshold at 0.5
            # and flatten to 1-D class labels. verbose=0 keeps Keras' per-call
            # progress bars out of the cell output.
            probabilities = model.predict(X_scaled[val_idx], verbose=0)
            y_pred = (probabilities > 0.5).astype(int).ravel()
        else:
            # scikit-learn entries are estimator instances; calling fit() again
            # re-estimates the model on this fold's training rows.
            model = model_fn
            model.fit(X_scaled[train_idx], y.iloc[train_idx])
            y_pred = model.predict(X_scaled[val_idx])
        score = f1_score(y.iloc[val_idx], y_pred)
        scores.append(score)
    mean_f1 = sum(scores) / len(scores)

    print(f'{name}: Cross-validated F1 score = {mean_f1:.3f}')

    # Strict comparison: on a tie the earlier entry in `models` is kept.
    if mean_f1 > best_f1:
        best_f1 = mean_f1
        best_model = model_fn
        best_model_name = name

print(f'\nBest model: {best_model_name}')
print(f'Cross-validated F1 score: {best_f1:.3f}')



Logistic Regression: Cross-validated F1 score = 0.832
Random Forest: Cross-validated F1 score = 0.909
2-layer FNN: Cross-validated F1 score = 0.864
3-layer FNN: Cross-validated F1 score = 0.873
4-layer FNN: Cross-validated F1 score = 0.876
Gaussian Naive Bayes: Cross-validated F1 score = 0.601
CART: Cross-validated F1 score = 0.867
Kernel SVM: Cross-validated F1 score = 0.856
Bagging: Cross-validated F1 score = 0.895
QDA: Cross-validated F1 score = 0.574

Best model: Random Forest
Cross-validated F1 score: 0.909


In [17]:
# Train the best model on the full training set and predict the test set.
#
# `best_model` is either a Keras factory function (FNN entries) or a
# scikit-learn estimator instance. The fitted model is bound to `final_model`
# rather than rebinding `best_model`, so this cell can be re-run safely (the
# factory is never replaced by the model it produced).
if 'FNN' in best_model_name:
    # Build exactly one fresh network and train it with the same epochs and
    # batch size used in the cross-validation loop, so the submitted model is
    # the configuration whose F1 score was actually measured.
    final_model = best_model()
    final_model.fit(X_scaled, y, epochs=50, batch_size=64, verbose=0)
    # The sigmoid output is a column of probabilities: threshold at 0.5 and
    # flatten to 1-D integer class labels.
    y_pred = (final_model.predict(X_test_scaled, verbose=0) > 0.5).astype(int).ravel()
else:
    final_model = best_model
    final_model.fit(X_scaled, y)
    y_pred = final_model.predict(X_test_scaled)



In [18]:
# Write the best model's test-set predictions to disk: one 'Human Waste'
# column, with the row position exported as an index column named 'Index'.
predicted_labels = y_pred.flatten()
submission_df = pd.Series(predicted_labels, name='Human Waste').to_frame()
submission_df.to_csv('submission.csv', index_label='Index')

print(f'Predictions from {best_model_name} saved to submission.csv')

Predictions from Random Forest saved to submission.csv
