In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedKFold


In [2]:
# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Separate features and target
X_train = train_df.drop(['Human Waste'], axis=1)
y_train = train_df['Human Waste']
X_test = test_df

In [4]:
# Preprocess data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Define models
models = []

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
models.append(('Logistic Regression', lr_model))

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
models.append(('Random Forest', rf_model))

# 2-layer FNN
model_2_layer = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(1, activation='sigmoid')
])
model_2_layer.compile(optimizer=Adam(learning_rate=0.001),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
models.append(('2-layer FNN', model_2_layer))

# 3-layer FNN
model_3_layer = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model_3_layer.compile(optimizer=Adam(learning_rate=0.001),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
models.append(('3-layer FNN', model_3_layer))

# 4-layer FNN
model_4_layer = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])
model_4_layer.compile(optimizer=Adam(learning_rate=0.001),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
models.append(('4-layer FNN', model_4_layer))

In [6]:
# Evaluate models using cross-validation
best_model = None
best_f1 = 0
best_model_name = ''

for name, model in models:
    if 'FNN' in name:
        scores = []
        for _ in range(5):
            model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)
            y_pred = (model.predict(X_train_scaled) > 0.5).astype(int)
            score = f1_score(y_train, y_pred)
            scores.append(score)
        mean_f1 = sum(scores) / len(scores)
    else:
        f1_scorer = make_scorer(f1_score)
        scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=f1_scorer)
        mean_f1 = scores.mean()
    
    print(f'{name}: Cross-validated F1 score = {mean_f1:.3f}')
    
    if mean_f1 > best_f1:
        best_f1 = mean_f1
        best_model = model
        best_model_name = name

print(f'\nBest model: {best_model_name}')
print(f'Cross-validated F1 score: {best_f1:.3f}')

Logistic Regression: Cross-validated F1 score = 0.875
Random Forest: Cross-validated F1 score = 0.911
2-layer FNN: Cross-validated F1 score = 0.965
3-layer FNN: Cross-validated F1 score = 0.992
4-layer FNN: Cross-validated F1 score = 0.997

Best model: 4-layer FNN
Cross-validated F1 score: 0.997


In [7]:
# Train the best model on the full training set
if 'FNN' in best_model_name:
    best_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)
else:
    best_model.fit(X_train_scaled, y_train)

In [8]:
# Make predictions on the test set
if 'FNN' in best_model_name:
    y_pred = (best_model.predict(X_test_scaled) > 0.5).astype(int)
else:
    y_pred = best_model.predict(X_test_scaled)



In [9]:
# Save predictions from the best model
submission_df = pd.DataFrame({'Human Waste': y_pred.flatten()})
submission_df.to_csv('submission.csv', index_label='Index')

print(f'Predictions from {best_model_name} saved to submission.csv')

Predictions from 4-layer FNN saved to submission.csv
