In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#Import packages
import pandas as pd
import numpy as np

In [3]:
#Load dataset into a dataframe
data = pd.read_csv('../data/raw/GlobalDietaryDatabase_V2.csv')

In [4]:
#Make a copy of the dataframe(df) to be cleaned
df_cleaned = data.copy()

In [5]:
# Define features and target variable
features = df_cleaned.drop(['Unnamed: 0', 'Entity', 'superregion2', 'iso3', 'Year', 'Continent', 'Unit'], axis=1)
target = df_cleaned['Diabetes prevalence (% of population ages 20 to 79)']

In [6]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [7]:
# Standardize features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Define the Neural Network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

In [12]:
# Train the model
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

In [13]:
# Predict on the test set
y_pred = model.predict(X_test_scaled).flatten()

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

RMSE: 3.0473655499593906
MAE: 2.4644309636708854


The Neural Network model's RMSE (3.0474) is higher than that of the Random Forest (0.3046) and Gradient Boosting (0.2058) models but lower than that of the XGBoost (4.0491) and SVM (4.0601) models.

The Neural Network model's MAE (2.4644) is higher than that of the Random Forest (0.1671) and Gradient Boosting (0.1078) models but lower than that of the SVM (2.6563) model.

Overall, the Neural Network model's performance falls between that of the best-performing models (Random Forest and Gradient Boosting) and the worst-performing models (XGBoost and SVM).

# **Further Optimisation**

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, RegressorMixin
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import reciprocal
import numpy as np

In [32]:
# Define features and target variable
features = df_cleaned.drop(['Unnamed: 0', 'Entity', 'superregion2', 'iso3', 'Year', 'Continent', 'Unit'], axis=1)
target = df_cleaned['Diabetes prevalence (% of population ages 20 to 79)']

In [33]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [34]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
# Define the Keras model function
def create_keras_model(neurons=64, activation='relu', optimizer='adam'):
    model = Sequential([
        Dense(neurons, activation=activation, input_shape=(X_train_scaled.shape[1],)),
        Dense(neurons, activation=activation),
        Dense(1)
    ])
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [36]:
# Wrap the Keras model for use with scikit-learn
class KerasRegressorWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, create_model_fn=create_keras_model, **kwargs):
        self.create_model_fn = create_model_fn
        self.model = self.create_model_fn(**kwargs)
    
    def fit(self, X, y, **kwargs):
        self.model.fit(X, y, **kwargs)
        return self
    
    def predict(self, X):
        return np.squeeze(self.model.predict(X))

In [37]:
# Define hyperparameters to search
param_dist = {
    'neurons': [32, 64, 128],
    'activation': ['relu', 'tanh'],
    'optimizer': ['adam', 'rmsprop']
}

In [38]:
# Perform Randomized Search for hyperparameter tuning
keras_regressor = KerasRegressorWrapper()
random_search = RandomizedSearchCV(keras_regressor, param_distributions=param_dist, cv=3, n_iter=10, scoring='neg_mean_squared_error')
random_search.fit(X_train_scaled, y_train)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid parameter 'optimizer' for estimator KerasRegressorWrapper(). Valid parameters are: ['create_model_fn'].

In [15]:
# Define the Neural Network model
def create_model(neurons=64, activation='relu', optimizer='adam'):
    model = Sequential([
        Dense(neurons, activation=activation, input_shape=(X_train_scaled.shape[1],)),
        Dense(neurons, activation=activation),
        Dense(1)
    ])
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [30]:
# Get the best model
best_model = random_search.best_estimator_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Evaluate the best model on the test set
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)