In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Read the dataset from the project-relative CSV file and preview its
# first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='data/df.csv')
df.head(n=5)

In [None]:
# Separate the 'stunting' target from the remaining columns, then hold out
# 30% of the rows as a test set (seeded so the split is reproducible).
y = df['stunting']
X = df.drop(columns='stunting')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a default Random Forest on the training rows only and use its
# feature_importances_ to rank the columns; keep the five largest.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
features = X.columns
feature_importances = rf.feature_importances_
important_features = pd.Series(feature_importances, index=features).nlargest(5)
important_features

In [None]:
# Restrict both splits to the features ranked highest by the Random Forest
# (the entries of `important_features`).
selected_features = important_features.index.tolist()
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Candidate classifiers, keyed by the display name used in the result table
# and in the plots that follow.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Ridge Regression': RidgeClassifier(random_state=42),
    # sklearn's `Lasso` is a *regressor*: its predict() returns continuous
    # values, which makes accuracy_score() raise ValueError and gives it no
    # class probabilities. The classification counterpart of LASSO is an
    # L1-penalised logistic regression (liblinear supports the L1 penalty).
    # NOTE(review): the regularisation strength is left at the default C;
    # there is no exact mapping from the previous Lasso alpha=0.1 - tune if needed.
    'LASSO': LogisticRegression(
        penalty='l1', solver='liblinear', max_iter=1000, random_state=42
    ),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier()
}

# Fit every model on the training split and record its accuracy on the
# held-out test split. `results` maps display name -> accuracy.
results = {}
for name, model in models.items():
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

results

In [None]:
# Bar chart of the test-set accuracy stored in `results`, one bar per model,
# in the order the models were evaluated.
model_names = list(results)
accuracies = [results[label] for label in model_names]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=model_names, y=accuracies, ax=ax)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# Plotting ROC curves for each model
def _roc_scores(model, X):
    """Return one continuous score per row of ``X`` for ROC analysis.

    Not every estimator in ``models`` exposes ``predict_proba`` (for example
    ``RidgeClassifier`` does not), so the score is taken from the first of
    these that the fitted model provides:

    1. ``predict_proba(X)[:, 1]`` - the second probability column, as before;
    2. ``decision_function(X)`` - non-thresholded decision values;
    3. ``predict(X)`` - last resort (continuous for regressors).

    ``roc_curve`` only needs scores that rank the samples, so any of the
    three is a valid ``y_score``. Assumes a binary target, like the
    ``[:, 1]`` indexing it replaces.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X)
    return model.predict(X)


plt.figure(figsize=(12, 8))

for name, model in models.items():
    # Score the held-out rows with whatever continuous output the model has.
    scores = _roc_scores(model, X_test_selected)
    fpr, tpr, _ = roc_curve(y_test, scores)  # thresholds are not needed here
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (name, roc_auc))

# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## Model Recommendation
Based on the accuracy results and the AUC values from the ROC curves, we can make a recommendation on the best model for predicting stunting.

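The model that achieves both the highest accuracy and the highest AUC will be considered the best model for this task. If no single model leads on both metrics, the recommendation should state the trade-off between them explicitly. Let's analyze the results and provide a recommendation.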