# Models after performing SMOTE

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
malnutrition = pd.read_csv('Malnutrition data.csv')

In [3]:
df = malnutrition

In [4]:
# List of columns to be removed
columns_to_remove = ["Low Income", "Lower Middle Income", "Upper Middle Income"]

# Create a new DataFrame with the specified columns removed
df_filtered = df.drop(columns=columns_to_remove, axis=1)

# Display the resulting DataFrame
print(df_filtered.head())


   Sex  Age  Height  Weight    Status
0    1    5      75      17  Stunting
1    0    4     101      13  Stunting
2    0    4      71      17  Stunting
3    0    3      81      13  Stunting
4    0    1      79      16  Stunting


In [5]:
X = df_filtered.drop("Status", axis=1)  # Features
y = df_filtered["Status"]  # Labels

In [6]:
from sklearn.model_selection import train_test_split

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the 'temp' sets into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [7]:
# Check the number of entries in each split
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])



Training set size: 585
Validation set size: 126
Test set size: 126


In [8]:
# Check the number of columns in each split
print("Number of columns in training set:", X_train.shape[1])
print("Number of columns in validation set:", X_val.shape[1])
print("Number of columns in test set:", X_test.shape[1])



Number of columns in training set: 4
Number of columns in validation set: 4
Number of columns in test set: 4


In [9]:
from imblearn.over_sampling import SMOTE

# Ensure that you have enough samples in the minority class for the neighbors
# If not, you can reduce the k_neighbors parameter
smote = SMOTE(k_neighbors=1, random_state=42)  # Adjust k_neighbors as needed

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# The rest of your code goes here, using X_train_resampled and y_train_resampled for training




In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Assuming your data is already loaded and encoded as mentioned before

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the 'temp' sets into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Calculate the number of new samples
num_original_samples = len(X_train)
num_resampled_samples = len(X_train_resampled)
num_new_samples = num_resampled_samples - num_original_samples

print(f"Number of original samples in training set: {num_original_samples}")
print(f"Number of resampled samples in training set: {num_resampled_samples}")
print(f"Number of new synthetic samples generated by SMOTE: {num_new_samples}")

# The rest of your code goes here, using X_train_resampled and y_train_resampled for training


Number of original samples in training set: 585
Number of resampled samples in training set: 1940
Number of new synthetic samples generated by SMOTE: 1355


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the SMOTE-transformed training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
conf_matrix_val = confusion_matrix(y_val, y_val_pred)
classification_report_val = classification_report(y_val, y_val_pred)

print("Validation Set:")
print(f"Validation Accuracy: {accuracy_val:.2f}")
print("Confusion Matrix:")
print(conf_matrix_val)
print("Classification Report:")
print(classification_report_val)

# Make predictions on the test set
y_test_pred = rf_classifier.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
classification_report_test = classification_report(y_test, y_test_pred)

print("\nTest Set:")
print(f"Test Accuracy: {accuracy_test:.2f}")
print("Confusion Matrix:")
print(conf_matrix_test)
print("Classification Report:")
print(classification_report_test)



Validation Set:
Validation Accuracy: 0.75
Confusion Matrix:
[[ 2  8  0  0]
 [12 92  6  0]
 [ 1  4  0  0]
 [ 0  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.13      0.20      0.16        10
    Stunting       0.88      0.84      0.86       110
 Underweight       0.00      0.00      0.00         5
     Wasting       0.00      0.00      0.00         1

    accuracy                           0.75       126
   macro avg       0.25      0.26      0.25       126
weighted avg       0.78      0.75      0.76       126


Test Set:
Test Accuracy: 0.63
Confusion Matrix:
[[ 1 16  1]
 [11 79 11]
 [ 1  6  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.08      0.06      0.06        18
    Stunting       0.78      0.78      0.78       101
 Underweight       0.00      0.00      0.00         7

    accuracy                           0.63       126
   macro avg       0.29      0.28      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Using SVM

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Create a Support Vector Machine Classifier with a linear kernel
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier on the SMOTE-transformed training data
svm_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
y_val_pred = svm_classifier.predict(X_val)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
conf_matrix_val = confusion_matrix(y_val, y_val_pred)
classification_report_val = classification_report(y_val, y_val_pred)

print("Validation Set:")
print(f"Validation Accuracy: {accuracy_val:.2f}")
print("Confusion Matrix:")
print(conf_matrix_val)
print("Classification Report:")
print(classification_report_val)

# Make predictions on the test set
y_test_pred = svm_classifier.predict(X_test)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
classification_report_test = classification_report(y_test, y_test_pred)

print("\nTest Set:")
print(f"Test Accuracy: {accuracy_test:.2f}")
print("Confusion Matrix:")
print(conf_matrix_test)
print("Classification Report:")
print(classification_report_test)


Validation Set:
Validation Accuracy: 0.24
Confusion Matrix:
[[ 3  1  5  1]
 [31 25 46  8]
 [ 2  0  2  1]
 [ 0  0  1  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.08      0.30      0.13        10
    Stunting       0.96      0.23      0.37       110
 Underweight       0.04      0.40      0.07         5
     Wasting       0.00      0.00      0.00         1

    accuracy                           0.24       126
   macro avg       0.27      0.23      0.14       126
weighted avg       0.85      0.24      0.33       126


Test Set:
Test Accuracy: 0.21
Confusion Matrix:
[[ 4  5  7  2]
 [27 21 46  7]
 [ 3  3  1  0]
 [ 0  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.12      0.22      0.15        18
    Stunting       0.72      0.21      0.32       101
 Underweight       0.02      0.14      0.03         7
     Wasting       0.00      0.00      0.00         0

    accuracy    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']}
svm_classifier = SVC(random_state=42)
grid_search = GridSearchCV(svm_classifier, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions on the validation set using the best model
best_svm_classifier = grid_search.best_estimator_
y_val_pred = best_svm_classifier.predict(X_val_scaled)

# Evaluate the model on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
conf_matrix_val = confusion_matrix(y_val, y_val_pred)
classification_report_val = classification_report(y_val, y_val_pred)

print("Validation Set:")
print(f"Validation Accuracy: {accuracy_val:.2f}")
print("Confusion Matrix:")
print(conf_matrix_val)
print("Classification Report:")
print(classification_report_val)

# Make predictions on the test set using the best model
y_test_pred = best_svm_classifier.predict(X_test_scaled)

# Evaluate the model on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
classification_report_test = classification_report(y_test, y_test_pred)

print("\nTest Set:")
print(f"Test Accuracy: {accuracy_test:.2f}")
print("Confusion Matrix:")
print(conf_matrix_test)
print("Classification Report:")
print(classification_report_test)


Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}
Validation Set:
Validation Accuracy: 0.56
Confusion Matrix:
[[ 3  5  2  0]
 [30 67 13  0]
 [ 2  3  0  0]
 [ 1  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.08      0.30      0.13        10
    Stunting       0.89      0.61      0.72       110
 Underweight       0.00      0.00      0.00         5
     Wasting       0.00      0.00      0.00         1

    accuracy                           0.56       126
   macro avg       0.24      0.23      0.21       126
weighted avg       0.79      0.56      0.64       126


Test Set:
Test Accuracy: 0.48
Confusion Matrix:
[[ 5 12  1  0]
 [28 56 15  2]
 [ 3  4  0  0]
 [ 0  0  0  0]]
Classification Report:
              precision    recall  f1-score   support

  Overweight       0.14      0.28      0.19        18
    Stunting       0.78      0.55      0.65       101
 Underweight       0.00      0.00      0.00         7
     Wasting       0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import train_test_split

# # Split the dataset into training, validation, and test sets
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# # Split the 'temp' sets into validation and test sets
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# # Check the number of samples in the minority class
# min_class_samples = min(y_train.value_counts())

# # Apply SMOTE only to the training set with k_neighbors
# k_neighbors = min(6, min_class_samples - 1)  # Ensure k_neighbors is less than min_class_samples
# smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # Continue with model training and evaluation using X_train_resampled, y_train_resampled, X_val, y_val, X_test, y_test


In [16]:
# # Print the number of instances before and after SMOTE
# print("Number of instances in the original training set:", len(X_train), len(y_train))
# print("Number of instances after SMOTE:", len(X_train_resampled), len(y_train_resampled))

# # Print the class distribution before and after SMOTE
# print("Class distribution in the original training set:")
# print(y_train.value_counts())

# print("\nClass distribution after SMOTE:")
# print(pd.Series(y_train_resampled).value_counts())


In [17]:
# from sklearn.preprocessing import LabelEncoder

# # Apply label encoding to target variables
# le = LabelEncoder()
# y_train_resampled_encoded = le.fit_transform(y_train_resampled)
# y_val_encoded = le.transform(y_val)
# y_test_encoded = le.transform(y_test)




In [18]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# # Create a Random Forest Classifier
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the classifier on the transformed training data
# rf_classifier.fit(X_train_resampled, y_train_resampled_encoded)

# # Make predictions on the validation set
# y_val_pred = rf_classifier.predict(X_val)

# # Evaluate the model on the validation set
# accuracy = accuracy_score(y_val_encoded, y_val_pred)
# print(f"Validation Accuracy: {accuracy:.2f}")

# # Print classification report
# print("Classification Report:")
# print(classification_report(y_val_encoded, y_val_pred))


In [19]:
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.preprocessing import LabelEncoder

# # Assuming your data is already loaded and encoded as mentioned before

# # Apply SMOTE to the training data
# smote = SMOTE(random_state=42)
# X_train_resampled_smote, y_train_resampled_smote = smote.fit_resample(X_train_resampled, y_train_resampled_encoded)

# # Create a Random Forest Classifier
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the classifier on the SMOTE-transformed training data
# rf_classifier.fit(X_train_resampled_smote, y_train_resampled_smote)

# # Make predictions on the validation set
# y_val_pred = rf_classifier.predict(X_val)

# # Evaluate the model on the validation set
# accuracy = accuracy_score(y_val_encoded, y_val_pred)
# print(f"Validation Accuracy: {accuracy:.2f}")

# # Print classification report
# print("Classification Report:")
# print(classification_report(y_val_encoded, y_val_pred))


In [20]:
# X_train_resampled_smote.info()

In [21]:
# y_train_resampled_smote.info()