In [1]:
# Import the modules
import numpy as np 
import pandas as pd
import hvplot.pandas 
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import findspark and initialise. 
import findspark
findspark.init()

In [None]:
# Start Spark session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DataFrame Basics").getOrCreate()

In [None]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
spark.sparkContext.addFile("Resources/ObesityDataSet.csv")
df = spark.read.csv(SparkFiles.get("ObesityDataSet.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+------+---+------+------+------------------------------+----+----+---+----------+-----+----+---+---+---+----------+--------------------+-------------------+
|Gender|Age|Height|Weight|family_history_with_overweight|FAVC|FCVC|NCP|      CAEC|SMOKE|CH2O|SCC|FAF|TUE|      CALC|              MTRANS|         NObeyesdad|
+------+---+------+------+------------------------------+----+----+---+----------+-----+----+---+---+---+----------+--------------------+-------------------+
|Female| 21|  1.62|    64|                           yes|  no|   2|  3| Sometimes|   no|   2| no|  0|  1|        no|Public_Transporta...|      Normal_Weight|
|Female| 21|  1.52|    56|                           yes|  no|   3|  3| Sometimes|  yes|   3|yes|  3|  0| Sometimes|Public_Transporta...|      Normal_Weight|
|  Male| 23|   1.8|    77|                           yes|  no|   2|  3| Sometimes|   no|   2| no|  2|  1|Frequently|Public_Transporta...|      Normal_Weight|
|  Male| 27|   1.8|    87|                          

In [None]:
df.count()

2111

In [None]:
df1=df.dropDuplicates()

In [None]:
df1.count()

2087

In [None]:
df1.write.csv('Resources/Cleaned_ObesityDataSet.csv', mode='overwrite',header=True)

In [2]:
df_obesity=pd.read_csv("Resources/Cleaned_ObesityDataSet.csv/part-00000-a468d7b4-c0ae-4050-af17-3db69e0aec6f-c000.csv")
df_obesity

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,23.000000,1.670000,75.000000,yes,yes,2.000000,3.000000,Frequently,yes,2.000000,no,0.000000,2.000000,no,Walking,Overweight_Level_I
1,Female,22.993680,1.741377,54.877111,yes,yes,3.000000,3.000000,Frequently,no,2.009796,no,2.000000,0.071317,no,Public_Transportation,Insufficient_Weight
2,Female,38.952866,1.568441,62.855073,yes,yes,2.002796,3.000000,Sometimes,no,2.526775,no,0.271174,0.806069,Sometimes,Automobile,Overweight_Level_I
3,Female,18.549437,1.545196,72.467862,yes,no,3.000000,3.014808,Sometimes,no,2.000000,no,1.997529,1.000000,Sometimes,Public_Transportation,Overweight_Level_II
4,Female,35.456326,1.651812,79.437921,yes,yes,2.156065,2.909117,Sometimes,no,1.221281,no,0.503279,1.796136,no,Automobile,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,Male,19.693804,1.800000,85.000000,yes,yes,2.188722,3.000000,Sometimes,no,2.721356,no,1.528968,1.000000,Sometimes,Public_Transportation,Overweight_Level_I
2083,Female,18.000000,1.647971,68.818893,yes,yes,2.000000,1.411685,Sometimes,no,1.859089,no,0.000000,1.306000,no,Public_Transportation,Overweight_Level_I
2084,Female,38.692265,1.548178,62.341438,yes,yes,2.956671,2.965494,Sometimes,no,2.868132,no,0.000000,0.549250,Sometimes,Automobile,Overweight_Level_I
2085,Male,20.986834,1.677178,80.379575,yes,yes,2.000000,2.961706,Sometimes,no,2.000000,no,1.661556,1.114716,no,Public_Transportation,Overweight_Level_II


In [None]:
df_obesity.dtypes

Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [None]:
df_obesity.count()

Gender                            2111
Age                               2111
Height                            2111
Weight                            2111
family_history_with_overweight    2111
FAVC                              2111
FCVC                              2111
NCP                               2111
CAEC                              2111
SMOKE                             2111
CH2O                              2111
SCC                               2111
FAF                               2111
TUE                               2111
CALC                              2111
MTRANS                            2111
NObeyesdad                        2111
dtype: int64

In [3]:
# Drop duplicates
df_obesity.drop_duplicates(inplace=True)
df_obesity

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,23.000000,1.670000,75.000000,yes,yes,2.000000,3.000000,Frequently,yes,2.000000,no,0.000000,2.000000,no,Walking,Overweight_Level_I
1,Female,22.993680,1.741377,54.877111,yes,yes,3.000000,3.000000,Frequently,no,2.009796,no,2.000000,0.071317,no,Public_Transportation,Insufficient_Weight
2,Female,38.952866,1.568441,62.855073,yes,yes,2.002796,3.000000,Sometimes,no,2.526775,no,0.271174,0.806069,Sometimes,Automobile,Overweight_Level_I
3,Female,18.549437,1.545196,72.467862,yes,no,3.000000,3.014808,Sometimes,no,2.000000,no,1.997529,1.000000,Sometimes,Public_Transportation,Overweight_Level_II
4,Female,35.456326,1.651812,79.437921,yes,yes,2.156065,2.909117,Sometimes,no,1.221281,no,0.503279,1.796136,no,Automobile,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,Male,19.693804,1.800000,85.000000,yes,yes,2.188722,3.000000,Sometimes,no,2.721356,no,1.528968,1.000000,Sometimes,Public_Transportation,Overweight_Level_I
2083,Female,18.000000,1.647971,68.818893,yes,yes,2.000000,1.411685,Sometimes,no,1.859089,no,0.000000,1.306000,no,Public_Transportation,Overweight_Level_I
2084,Female,38.692265,1.548178,62.341438,yes,yes,2.956671,2.965494,Sometimes,no,2.868132,no,0.000000,0.549250,Sometimes,Automobile,Overweight_Level_I
2085,Male,20.986834,1.677178,80.379575,yes,yes,2.000000,2.961706,Sometimes,no,2.000000,no,1.661556,1.114716,no,Public_Transportation,Overweight_Level_II


In [None]:
df_obesity['NObeyesdad'].unique()

array(['Overweight_Level_I', 'Insufficient_Weight', 'Overweight_Level_II',
       'Obesity_Type_II', 'Normal_Weight', 'Obesity_Type_I',
       'Obesity_Type_III'], dtype=object)

## DATA PROCESSING
# Categorical encoding

In [4]:
# Encoding the NObeyesdad column using a custom function
def encode_NObeyesdad(NObeyesdad):
    if NObeyesdad == "Obesity_Type_I" or NObeyesdad =="Obesity_Type_III" or NObeyesdad =="Obesity_Type_II":
        return 1
    if NObeyesdad == "Overweight_Level_II" or NObeyesdad == "Overweight_Level_I" :
        return 2
    else:
        return 0

# Encoding the family_history_with_overweight column using a custom function       
def encode_family_history_with_overweight(family_history_with_overweight):
    if family_history_with_overweight == "yes" :
        return 1
    else:
        return 0
# Encoding the SMOKE column using a custom function       
def encode_SMOKE(SMOKE):
    if SMOKE == "yes" :
        return 1
    else:
        return 0
# Encoding the FAVC column using a custom function       
def encode_FAVC(FAVC):
    if FAVC == "yes" :
        return 1
    else:
        return 0
# Encoding the SCC column using a custom function       
def encode_SCC(SCC):
    if SCC == "yes" :
        return 1
    else:
        return 0

# Call the function on the  NObeyesdad  column
df_obesity["NObeyesdad"] = df_obesity["NObeyesdad"].apply(encode_NObeyesdad)
df_obesity["family_history_with_overweight"] = df_obesity["family_history_with_overweight"].apply(encode_family_history_with_overweight)
df_obesity["SMOKE"] = df_obesity["SMOKE"].apply(encode_SMOKE)
df_obesity["FAVC"] = df_obesity["FAVC"].apply(encode_FAVC)
df_obesity["SCC"] = df_obesity["SCC"].apply(encode_SCC)
df_obesity.rename(columns={"NObeyesdad":"obesity_risk"}, inplace= True)
# Review the DataFrame 
df_obesity

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,obesity_risk
0,Female,23.000000,1.670000,75.000000,1,1,2.000000,3.000000,Frequently,1,2.000000,0,0.000000,2.000000,no,Walking,2
1,Female,22.993680,1.741377,54.877111,1,1,3.000000,3.000000,Frequently,0,2.009796,0,2.000000,0.071317,no,Public_Transportation,0
2,Female,38.952866,1.568441,62.855073,1,1,2.002796,3.000000,Sometimes,0,2.526775,0,0.271174,0.806069,Sometimes,Automobile,2
3,Female,18.549437,1.545196,72.467862,1,0,3.000000,3.014808,Sometimes,0,2.000000,0,1.997529,1.000000,Sometimes,Public_Transportation,2
4,Female,35.456326,1.651812,79.437921,1,1,2.156065,2.909117,Sometimes,0,1.221281,0,0.503279,1.796136,no,Automobile,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,Male,19.693804,1.800000,85.000000,1,1,2.188722,3.000000,Sometimes,0,2.721356,0,1.528968,1.000000,Sometimes,Public_Transportation,2
2083,Female,18.000000,1.647971,68.818893,1,1,2.000000,1.411685,Sometimes,0,1.859089,0,0.000000,1.306000,no,Public_Transportation,2
2084,Female,38.692265,1.548178,62.341438,1,1,2.956671,2.965494,Sometimes,0,2.868132,0,0.000000,0.549250,Sometimes,Automobile,2
2085,Male,20.986834,1.677178,80.379575,1,1,2.000000,2.961706,Sometimes,0,2.000000,0,1.661556,1.114716,no,Public_Transportation,2


# One Hot encoding

In [5]:
# Encode (convert to dummy variables) the CAEC,CALC and MTRANS column
df_obesity_dummies = pd.get_dummies(df_obesity[["Gender","CAEC","CALC","MTRANS"]]).astype(int)
df_obesity_dummies 


Unnamed: 0,Gender_Female,Gender_Male,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
2,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0
3,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0
4,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0
2083,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0
2084,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0
2085,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0


In [6]:
#droping Gender,CAEC,CALC and MTRANS column
df = df_obesity.drop(columns=["Gender","CAEC","CALC","MTRANS"], axis=0)


In [7]:
#concatenating dataframes
df_obesity_concat = pd.concat([df, df_obesity_dummies], axis=1)
df_obesity_concat

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,SMOKE,CH2O,SCC,...,CAEC_no,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,23.000000,1.670000,75.000000,1,1,2.000000,3.000000,1,2.000000,0,...,0,0,0,0,1,0,0,0,0,1
1,22.993680,1.741377,54.877111,1,1,3.000000,3.000000,0,2.009796,0,...,0,0,0,0,1,0,0,0,1,0
2,38.952866,1.568441,62.855073,1,1,2.002796,3.000000,0,2.526775,0,...,0,0,0,1,0,1,0,0,0,0
3,18.549437,1.545196,72.467862,1,0,3.000000,3.014808,0,2.000000,0,...,0,0,0,1,0,0,0,0,1,0
4,35.456326,1.651812,79.437921,1,1,2.156065,2.909117,0,1.221281,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,19.693804,1.800000,85.000000,1,1,2.188722,3.000000,0,2.721356,0,...,0,0,0,1,0,0,0,0,1,0
2083,18.000000,1.647971,68.818893,1,1,2.000000,1.411685,0,1.859089,0,...,0,0,0,0,1,0,0,0,1,0
2084,38.692265,1.548178,62.341438,1,1,2.956671,2.965494,0,2.868132,0,...,0,0,0,1,0,1,0,0,0,0
2085,20.986834,1.677178,80.379575,1,1,2.000000,2.961706,0,2.000000,0,...,0,0,0,0,1,0,0,0,1,0


In [42]:
X = df_obesity_concat.drop(columns="obesity_risk")
y = df_obesity_concat['obesity_risk']
y.unique()

array([2, 0, 1], dtype=int64)

In [26]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [27]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

# CLASSIFICATION USING LOGISTIC REGRESSION

In [28]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [29]:
classifier.fit(X_train_scaled, y_train)

In [30]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.9814696485623003
Testing Data Score: 0.975095785440613


In [45]:
predictions = classifier.predict(X_test_scaled)
target_names = ["Normal", "Obese" ,"Over_weight"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.98      0.96      0.97       139
       Obese       0.99      0.99      0.99       238
 Over_weight       0.95      0.97      0.96       145

    accuracy                           0.98       522
   macro avg       0.97      0.97      0.97       522
weighted avg       0.98      0.98      0.98       522



# CLASSIFICATION USING RANDOM FOREST CLASSIFIER

In [32]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [33]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [34]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [46]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1","Actual 2"], columns=["Predicted 0", "Predicted 1","Predicted 2"]
)
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.975095785440613

# Confusion matrix and classification report

In [47]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,134,0,5
Actual 1,0,235,3
Actual 2,3,2,140


Accuracy Score : 0.975095785440613
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       139
           1       0.99      0.99      0.99       238
           2       0.95      0.97      0.96       145

    accuracy                           0.98       522
   macro avg       0.97      0.97      0.97       522
weighted avg       0.98      0.98      0.98       522



# CLASSIFICATION USING KNN

In [37]:
# Instantiate the model with k = 3 neighbours
model = KNeighborsClassifier(n_neighbors=3)

In [38]:
# Train the model
model.fit(X_train_scaled, y_train)

In [39]:
# Create predictions
y_pred = model.predict(X_test_scaled)

In [40]:
score = model.score(X_test_scaled, y_test)
score

0.8850574712643678

In [41]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       142
           1       0.95      0.93      0.94       242
           2       0.81      0.85      0.83       138

    accuracy                           0.89       522
   macro avg       0.87      0.87      0.87       522
weighted avg       0.89      0.89      0.89       522



# optimization of Logistic regression by GridSearchCV

In [48]:
classifier.get_params(deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 200,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 1,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [49]:
params = {'penalty':['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-10, 10, 50), 
    'solver':['lbfgs','newton-cg','liblinear','sag','saga'],
             }

In [50]:
from sklearn.model_selection import GridSearchCV
grid1 = GridSearchCV(classifier, params)
grid1 = grid1.fit(X_train_scaled, y_train)

In [51]:
lr_tuned_g = grid1.best_estimator_

In [52]:
lr_tuned_g.fit(X_train_scaled, y_train)

In [53]:
print('Best Penalty:', grid1.best_estimator_.get_params()['penalty'])
print('Best C:', grid1.best_estimator_.get_params()['C'])
print('Best solver:', grid1.best_estimator_.get_params()['solver'])

Best Penalty: l2
Best C: 49417.13361323838
Best solver: newton-cg


In [54]:
predictions1 = lr_tuned_g.predict(X_test_scaled)
target_names = ["Normal", "Obese" ,"Over_weight"]
print(classification_report(y_test, predictions1, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.99      0.99      0.99       139
       Obese       1.00      0.99      0.99       238
 Over_weight       0.98      0.98      0.98       145

    accuracy                           0.99       522
   macro avg       0.99      0.99      0.99       522
weighted avg       0.99      0.99      0.99       522



In [56]:
print(f"% improved: {lr_tuned_g.score(X_train_scaled, y_train)/classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {lr_tuned_g.score(X_test_scaled, y_test)}")

% improved: 1.0188802083333333
Testing Data Score: 0.9885057471264368
