# Used SMOTE to balance the dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
malnutrition = pd.read_csv('Malnutrition data.csv')

In [3]:
df = malnutrition

In [4]:
# List of columns to be removed
columns_to_remove = ["Low Income", "Lower Middle Income", "Upper Middle Income"]

# Create a new DataFrame with the specified columns removed
df_filtered = df.drop(columns=columns_to_remove, axis=1)

# Display the resulting DataFrame
print(df_filtered.head())

   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Assuming df_filtered is your DataFrame without the specified columns

# Separate features (X) and target variable (y)
X = df_filtered.drop("Status", axis=1)  # Replace "TargetColumn" with your actual target column name
y = df_filtered["Status"]

# Instantiate SMOTE with the default n_neighbors value (usually 5)
smote = SMOTE(k_neighbors=3,random_state=42)

# Apply SMOTE to the whole dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new DataFrame with the resampled data
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name="Status")], axis=1)

# Display the resulting DataFrame
print(df_resampled.head())


   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [6]:
# Print the number of samples after applying SMOTE
print("\nNumber of samples after SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Create a new DataFrame with the resampled data
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name="Status")], axis=1)

# Display the resulting DataFrame
print("\nResampled DataFrame:")
print(df_resampled.head())


Number of samples after SMOTE:
Stunting       696
Overweight     696
Underweight    696
Wasting        696
Name: Status, dtype: int64

Resampled DataFrame:
   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [7]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y) from the resampled DataFrame
X_resampled = df_resampled.drop("Status", axis=1)
y_resampled = df_resampled["Status"]

# Split the resampled data into training (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Split the temp data into validation (50% of temp, or 15% of the original) and testing (50% of temp, or 15% of the original)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the number of samples in each set
print("\nNumber of samples in the training set:")
print(y_train.value_counts())

print("\nNumber of samples in the validation set:")
print(y_val.value_counts())

print("\nNumber of samples in the testing set:")
print(y_test.value_counts())



Number of samples in the training set:
Wasting        498
Overweight     493
Underweight    479
Stunting       478
Name: Status, dtype: int64

Number of samples in the validation set:
Overweight     110
Wasting        106
Underweight    104
Stunting        98
Name: Status, dtype: int64

Number of samples in the testing set:
Stunting       120
Underweight    113
Overweight      93
Wasting         92
Name: Status, dtype: int64


In [8]:
# Using the resampled dataset to generate Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
print("\nValidation Set:")
print(f"Accuracy: {accuracy_val:.2%}")
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("\nTest Set:")
print(f"Accuracy: {accuracy_test:.2%}")
print(classification_report(y_test, y_test_pred))



Validation Set:
Accuracy: 83.01%
              precision    recall  f1-score   support

  Overweight       0.82      0.78      0.80       110
    Stunting       0.77      0.58      0.66        98
 Underweight       0.78      0.94      0.85       104
     Wasting       0.94      1.00      0.97       106

    accuracy                           0.83       418
   macro avg       0.83      0.83      0.82       418
weighted avg       0.83      0.83      0.82       418


Test Set:
Accuracy: 85.41%
              precision    recall  f1-score   support

  Overweight       0.77      0.84      0.80        93
    Stunting       0.88      0.68      0.77       120
 Underweight       0.84      0.94      0.89       113
     Wasting       0.93      0.99      0.96        92

    accuracy                           0.85       418
   macro avg       0.86      0.86      0.85       418
weighted avg       0.86      0.85      0.85       418

