# Used SMOTE to balance the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [2]:
malnutrition = pd.read_csv('Malnutrition data.csv')

In [3]:
df = malnutrition

In [4]:
# List of columns to be removed
columns_to_remove = ["Low Income", "Lower Middle Income", "Upper Middle Income"]

# Create a new DataFrame with the specified columns removed
df_filtered = df.drop(columns=columns_to_remove, axis=1)

# Display the resulting DataFrame
print(df_filtered.head())

   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Assuming df_filtered is your DataFrame without the specified columns

# Separate features (X) and target variable (y)
X = df_filtered.drop("Status", axis=1)  # Replace "TargetColumn" with your actual target column name
y = df_filtered["Status"]

# Instantiate SMOTE with the default n_neighbors value (usually 5)
smote = SMOTE(k_neighbors=3,random_state=42)

# Apply SMOTE to the whole dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame with the resampled data
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name="Status")], axis=1)

# Display the resulting DataFrame
print(df_resampled.head())


   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [6]:
# Print the number of samples after applying SMOTE
print("\nNumber of samples after SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Create a new DataFrame with the resampled data
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name="Status")], axis=1)

# Display the resulting DataFrame
print("\nResampled DataFrame:")
print(df_resampled.head())


Number of samples after SMOTE:
Stunting       696
Overweight     696
Underweight    696
Wasting        696
Name: Status, dtype: int64

Resampled DataFrame:
   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [7]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y) from the resampled DataFrame
X_resampled = df_resampled.drop("Status", axis=1)
y_resampled = df_resampled["Status"]

# Split the resampled data into training (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Split the temp data into validation (50% of temp, or 15% of the original) and testing (50% of temp, or 15% of the original)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the number of samples in each set
print("\nNumber of samples in the training set:")
print(y_train.value_counts())

print("\nNumber of samples in the validation set:")
print(y_val.value_counts())

print("\nNumber of samples in the testing set:")
print(y_test.value_counts())



Number of samples in the training set:
Wasting        498
Overweight     493
Underweight    479
Stunting       478
Name: Status, dtype: int64

Number of samples in the validation set:
Overweight     110
Wasting        106
Underweight    104
Stunting        98
Name: Status, dtype: int64

Number of samples in the testing set:
Stunting       120
Underweight    113
Overweight      93
Wasting         92
Name: Status, dtype: int64


In [8]:
# Using the resampled dataset to generate Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
print("\nValidation Set:")
print(f"Accuracy: {accuracy_val:.2%}")
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("\nTest Set:")
print(f"Accuracy: {accuracy_test:.2%}")
print(classification_report(y_test, y_test_pred))



Validation Set:
Accuracy: 83.01%
              precision    recall  f1-score   support

  Overweight       0.82      0.78      0.80       110
    Stunting       0.77      0.58      0.66        98
 Underweight       0.78      0.94      0.85       104
     Wasting       0.94      1.00      0.97       106

    accuracy                           0.83       418
   macro avg       0.83      0.83      0.82       418
weighted avg       0.83      0.83      0.82       418


Test Set:
Accuracy: 85.41%
              precision    recall  f1-score   support

  Overweight       0.77      0.84      0.80        93
    Stunting       0.88      0.68      0.77       120
 Underweight       0.84      0.94      0.89       113
     Wasting       0.93      0.99      0.96        92

    accuracy                           0.85       418
   macro avg       0.86      0.86      0.85       418
weighted avg       0.86      0.85      0.85       418



In [10]:
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

# # Create an SVM Classifier with a pipeline including standard scaling
# svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))

# # Train the classifier on the training data
# svm_classifier.fit(X_train, y_train)

# # Make predictions on the validation set
# y_val_pred = svm_classifier.predict(X_val)

# # Evaluate the model on the validation set
# accuracy_val = accuracy_score(y_val, y_val_pred)
# print("\nValidation Set:")
# print(f"Accuracy: {accuracy_val:.2%}")
# print(classification_report(y_val, y_val_pred))

# # Make predictions on the test set
# y_test_pred = svm_classifier.predict(X_test)

# # Evaluate the model on the test set
# accuracy_test = accuracy_score(y_test, y_test_pred)
# print("\nTest Set:")
# print(f"Accuracy: {accuracy_test:.2%}")
# print(classification_report(y_test, y_test_pred))


In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'svc__C': [0.1, 1, 10, 100],  # C parameter for regularization
    'svc__gamma': [0.01, 0.1, 1, 10],  # gamma parameter for the RBF kernel
}

# Create an SVM Classifier with a pipeline including standard scaling
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=3, scoring='accuracy')

# Train the classifier using grid search
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Make predictions on the validation set using the best model
y_val_pred = grid_search.predict(X_val)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
print("\nValidation Set:")
print(f"Accuracy: {accuracy_val:.2%}")
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set using the best model
y_test_pred = grid_search.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("\nTest Set:")
print(f"Accuracy: {accuracy_test:.2%}")
print(classification_report(y_test, y_test_pred))


Best Parameters: {'svc__C': 10, 'svc__gamma': 10}

Validation Set:
Accuracy: 85.17%
              precision    recall  f1-score   support

  Overweight       0.81      0.86      0.83       110
    Stunting       0.82      0.61      0.70        98
 Underweight       0.82      0.92      0.87       104
     Wasting       0.95      0.99      0.97       106

    accuracy                           0.85       418
   macro avg       0.85      0.85      0.84       418
weighted avg       0.85      0.85      0.85       418


Test Set:
Accuracy: 87.80%
              precision    recall  f1-score   support

  Overweight       0.80      0.87      0.84        93
    Stunting       0.91      0.73      0.81       120
 Underweight       0.87      0.95      0.91       113
     Wasting       0.94      0.99      0.96        92

    accuracy                           0.88       418
   macro avg       0.88      0.89      0.88       418
weighted avg       0.88      0.88      0.88       418



In [12]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from sklearn.model_selection import train_test_split
# from sklearn.datasets import make_classification
# from sklearn.metrics import classification_report

# # Generate a sample dataset
# X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build the neural network model
# model = Sequential()
# model.add(Dense(64, input_dim=10, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# # Make predictions on the test set
# y_pred_probs = model.predict(X_test)
# y_pred = (y_pred_probs > 0.5).astype(int)

# # Evaluate the model
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
