In [1]:
#Launch commands to automatically reload modules:
#load the autoreload extension, then use mode 2 so imported modules are
#re-imported before each cell runs and edits to local .py files are picked up
%load_ext autoreload
%autoreload 2

In [2]:
#Load libraries to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Load dataset into a dataframe (the path is relative, so it resolves against
#the notebook's working directory)
data = pd.read_csv('../data/raw/GlobalDietaryDatabase_V2.csv')

In [4]:
#Display dimensions of the dataframe(df) as (rows, columns)
data.shape

(185, 74)

In [6]:
# Display the first few rows of the dataset (head() defaults to 5 rows)
data.head()

Unnamed: 0.1,Unnamed: 0,Entity,superregion2,iso3,Fruits,Non-starchy vegetables,Potatoes,Other starchy vegetables,Beans and legumes,Nuts and seeds,...,"Supply_ Fish, shellfish and their products",Supply_ Meat and meat products,Supply_ Vegetables and their products,Supply_ Fruits and their products,Supply_ Fats and oils,Supply_ Sweets and sugars,Supply_ Spices and condiments,Supply_ Beverages,Supply_ Foods for particular nutritional uses,Supply_ Miscellaneous
0,1,Afghanistan,SAARC,AFG,65.697036,98.080124,21.195808,44.89214,103.830266,31.205695,...,1,48,27,76,222,134,3,1,0,1
1,2,Angola,SSA,AGO,119.74698,308.837004,346.377109,52.662482,37.799557,11.612246,...,28,106,14,112,267,135,0,85,4,2
2,3,Albania,FSU,ALB,138.87436,129.083757,185.783632,73.031694,40.181361,12.272771,...,16,263,211,393,297,308,2,83,1,19
3,4,United Arab Emirates,MENA,ARE,107.57782,123.284747,182.972287,57.068126,40.563807,31.729379,...,48,353,106,176,718,223,34,36,6,9
4,5,Argentina,LAC,ARG,93.995787,123.803276,51.336122,7.010755,3.575009,1.564769,...,11,601,52,91,552,400,3,125,0,0


In [7]:
# Count the missing values in every column of the raw data
print(data.isna().sum())

Unnamed: 0                                       0
Entity                                           0
superregion2                                     0
iso3                                             0
Fruits                                           0
                                                ..
Supply_ Sweets and sugars                        0
Supply_ Spices and condiments                    0
Supply_ Beverages                                0
Supply_ Foods for particular nutritional uses    0
Supply_ Miscellaneous                            0
Length: 74, dtype: int64


# **DATA CLEANING**

In [8]:
#Make a copy of the dataframe(df) to be cleaned, so the raw `data` frame stays untouched.
#NOTE(review): the following cell rebuilds df_cleaned directly from `data`
#(df_cleaned = data.drop(...)), so this copy is overwritten before it is ever
#used - confirm whether this cell is still needed.
df_cleaned = data.copy()

In [9]:
#Drop the leftover index column plus every 'Supply_' column from the raw data
columns_to_drop = ['Unnamed: 0', *(name for name in data.columns if name.startswith('Supply_'))]
df_cleaned = data.drop(columns_to_drop, axis=1)

#Impute missing numeric values with the mean of their own column
#(non-numeric columns are not touched by this step)
df_cleaned = df_cleaned.fillna(value=df_cleaned.mean(numeric_only=True))

#Per-column count of values that are still missing after the imputation
missing_values_check = df_cleaned.isna().sum()

#Column dtypes, for a brief overview
data_types = df_cleaned.dtypes

#Show a preview of the cleaned frame together with both summaries
(df_cleaned.head(), missing_values_check, data_types)

(                 Entity superregion2 iso3      Fruits  Non-starchy vegetables   
 0           Afghanistan        SAARC  AFG   65.697036               98.080124  \
 1                Angola          SSA  AGO  119.746980              308.837004   
 2               Albania          FSU  ALB  138.874360              129.083757   
 3  United Arab Emirates         MENA  ARE  107.577820              123.284747   
 4             Argentina          LAC  ARG   93.995787              123.803276   
 
      Potatoes  Other starchy vegetables  Beans and legumes  Nuts and seeds   
 0   21.195808                 44.892140         103.830266       31.205695  \
 1  346.377109                 52.662482          37.799557       11.612246   
 2  185.783632                 73.031694          40.181361       12.272771   
 3  182.972287                 57.068126          40.563807       31.729379   
 4   51.336122                  7.010755           3.575009        1.564769   
 
    Refined grains  ...  Vitam

In [10]:
#List the columns that remain after the cleaning step
df_cleaned.columns

Index(['Entity', 'superregion2', 'iso3', 'Fruits', 'Non-starchy vegetables',
       'Potatoes', 'Other starchy vegetables', 'Beans and legumes',
       'Nuts and seeds', 'Refined grains', 'Whole grains',
       'Total processed meats', 'Unprocessed red meats', 'Total seafoods',
       'Eggs', 'Cheese', 'Yoghurt (including fermented milk)',
       'Sugar-sweetened beverages', 'Fruit juices', 'Coffee', 'Tea',
       'Total carbohydrates', 'Total protein', 'Saturated fat',
       'Monounsaturated fatty acids', 'Total omega-6 fat',
       'Seafood omega-3 fat', 'Plant omega-3 fat', 'Dietary fiber',
       'Added sugars', 'Calcium', 'Dietary sodium', 'Iodine', 'Iron',
       'Magnesium', 'Potassium', 'Selenium', 'Vitamin A w/ supplements',
       'Vitamin B1', 'Vitamin B2', 'Vitamin B3', 'Vitamin B6',
       'Vitamin B9 (Folate)', 'Vitamin B12', 'Vitamin C', 'Vitamin D',
       'Vitamin E', 'Zinc', 'Total Milk', 'Year',
       'Diabetes prevalence (% of population ages 20 to 79)', 'Continen

In [12]:
# Old column name -> new, shorter column name
# (rename() leaves any key that is not an existing column untouched)
column_name_mapping = dict([
    ('Entity', 'Country'),
    ('Population (2021)', 'Population'),
    ('Gross National Income Per Capita (2021)', 'Gross Income Per Capita'),
    ('Diabetes prevalence (% of population ages 20 to 79)', 'Diabetes prevalence'),
])

# Apply the mapping to the cleaned dataframe
df_cleaned = df_cleaned.rename(columns=column_name_mapping)

# **MODELLING**

In [13]:
#Define target feature.
#pop() returns the 'Diabetes prevalence' column AND removes it from df_cleaned
#in place, so the columns left in df_cleaned are the model inputs.
#Because of that in-place removal this cell cannot be run twice in a row:
#a second run raises KeyError until df_cleaned has been rebuilt by the cells above.
target = df_cleaned.pop('Diabetes prevalence')

In [14]:
#Split numerical and categorical columns.
#Both lists follow df_cleaned's own column order. The categorical list used to
#be built from a set difference, whose iteration order changes between kernel
#sessions (string hashing is randomised), which made the encoder's column
#order - and therefore the saved feature files - differ from run to run.
#NOTE(review): cat_cols includes the per-row identifiers 'Country' and 'iso3';
#one-hot encoding them adds roughly one column per country. Consider excluding
#them from the model inputs.
num_cols = list(df_cleaned.select_dtypes('number').columns)
cat_cols = [col for col in df_cleaned.columns if col not in num_cols]

In [17]:
#Import the one-hot encoder and the standard scaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#NOTE(review): both transformers below are fitted on the full dataset, i.e.
#before the train/validation/test split, so statistics from the held-out rows
#leak into the features. Consider fitting on the training split only.

#Instantiate oneHot encoder (dense output; the first level of every
#categorical column is dropped)
ohe = OneHotEncoder(sparse_output=False, drop='first')

#Instantiate standard scaler
scaler = StandardScaler()

#Fit and apply the encoder to the categorical columns and the scaler to the
#numerical columns. Each result is wrapped in a dataframe that keeps
#df_cleaned's row index, so rows stay aligned by label as well as by position,
#and the two parts are joined in a single concat (encoded columns first, then
#scaled numerical columns - the same column order as before).
features = pd.concat(
    [
        pd.DataFrame(
            ohe.fit_transform(df_cleaned[cat_cols]),
            columns=ohe.get_feature_names_out(),
            index=df_cleaned.index,
        ),
        pd.DataFrame(
            scaler.fit_transform(df_cleaned[num_cols]),
            columns=num_cols,
            index=df_cleaned.index,
        ),
    ],
    axis=1,
)

In [18]:
#Import dump from joblib, plus Path to prepare the output folder
from pathlib import Path
from joblib import dump

#Create the model folder if it is missing, so the dumps below do not fail on
#a fresh checkout where the folder does not exist yet
Path('../models').mkdir(parents=True, exist_ok=True)

#Save one-hot encoder and scaler into model folder
dump(ohe, '../models/ohe.joblib')
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

# **Split Data**

In [19]:
#Import the splitting helper
from sklearn.model_selection import train_test_split

#Both splits hold out 20% of their input and use the same fixed seed
split_kwargs = dict(test_size=0.2, random_state=8)

#First split: hold out the test set from the full data
X_data, X_test, y_data, y_test = train_test_split(features, target, **split_kwargs)

#Second split: hold out the validation set from the remaining rows
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, **split_kwargs)

In [20]:
#Display data split dimensions for X, in the order train, validation, test
for X_split in (X_train, X_val, X_test):
    print(X_split.shape)

(118, 429)
(30, 429)
(37, 429)


In [21]:
#Display data split dimensions for y, in the order train, validation, test
for y_split in (y_train, y_val, y_test):
    print(y_split.shape)

(118,)
(30,)
(37,)


In [22]:
#Save sets
from pathlib import Path

#Create the output folder if it is missing, so the writes below do not fail on
#a fresh checkout where the folder does not exist yet
Path('../data/processed').mkdir(parents=True, exist_ok=True)

#Write every split to ../data/processed/<name>.csv without the row index
for split_name, split_data in {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}.items():
    split_data.to_csv(f'../data/processed/{split_name}.csv', index=False)

# **Baseline**

In [23]:
#Calculate average of target variable on the training split only;
#this single value is what the baseline model predicts for every row
pred_value = y_train.mean()

In [24]:
#Generate numpy array of baseline predictions: one row per training sample,
#shaped (len(y_train), 1), with every entry set to the training-set mean
y_base = np.full((len(y_train), 1), pred_value)

In [30]:
#Compute RMSE and MAE scores of baseline model (on the training split)
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

#RMSE is taken as sqrt(MSE) instead of mse(..., squared=False): the `squared`
#argument was deprecated in scikit-learn 1.4 and removed in 1.6, while the
#square root of the MSE gives the same value on every version.
#Arguments follow the (y_true, y_pred) convention of the metric functions.
rmse_score = np.sqrt(mse(y_train, y_base))
mae_score = mae(y_train, y_base)

print(f"RMSE: {rmse_score}")
print(f"MAE: {mae_score}")

RMSE: 4.411347697591873
MAE: 3.3704969836253955


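On the training set, the baseline's RMSE (≈4.41) is higher than its MAE (≈3.37). RMSE is always at least as large as MAE, so the difference alone is expected; the size of the gap suggests that a few observations have large errors (possible outliers), which RMSE penalises more heavily. We will try to reduce both scores by training other models, feature selection and/or hyperparameter tuning.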