# Assignment 3 - Data Science
# Name: Ibrahim Abuhahhash
# Student ID: 1221140 

## Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import set_config
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error;
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report;
from sklearn.compose import ColumnTransformer
set_config(transform_output='pandas')

### Loading Dataset (diabetes.csv)

In [2]:
# Read the diabetes dataset from the working directory and preview the first rows.
df = pd.read_csv("diabetes.csv") 
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,WeightGroup,AgeGroup,Gender,Outcome
0,4,189.0,110.0,31.0,,0_68,overweight,18 - 44,F,0
1,2,157.0,74.0,35.0,440.0,0.134,obese_2,18 - 44,F,0
2,6,98.0,58.0,33.0,190.0,0.43,obese_1,18 - 44,F,0
3,6,111.0,64.0,39.0,,0.26,obese_1,18 - 44,F,0
4,3,106.0,72.0,,,0_207,overweight,18 - 44,F,0


## Data Inspection

### Columns

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'WeightGroup', 'AgeGroup', 'Gender',
       'Outcome'],
      dtype='object')

### Information

In [4]:
# Per-column dtype and non-null count; columns with fewer non-null entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               642 non-null    int64  
 1   Glucose                   638 non-null    float64
 2   BloodPressure             616 non-null    float64
 3   SkinThickness             455 non-null    float64
 4   Insulin                   331 non-null    float64
 5   DiabetesPedigreeFunction  642 non-null    object 
 6   WeightGroup               642 non-null    object 
 7   AgeGroup                  642 non-null    object 
 8   Gender                    642 non-null    object 
 9   Outcome                   642 non-null    int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 50.3+ KB


### Description

In [5]:
# Summary statistics for the numeric columns (useful for spotting implausible min/max values).
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,Outcome
count,642.0,638.0,616.0,455.0,331.0,642.0
mean,5.453271,124.043887,72.397727,29.483516,158.996979,0.422118
std,39.460225,30.75698,12.306886,10.527458,117.255057,0.494282
min,0.0,56.0,24.0,7.0,14.0,0.0
25%,1.0,100.0,64.0,22.0,82.5,0.0
50%,3.0,121.0,72.0,30.0,130.0,0.0
75%,6.0,144.0,80.0,36.5,192.5,1.0
max,1000.0,199.0,114.0,99.0,846.0,1.0


# Data Cleaning

## Drop Duplicates

In [6]:
# Report how many rows are exact duplicates, then remove them.
duplicate_count = df.duplicated().sum()
print(f'number of duplicates:  {duplicate_count}')
df = df.drop_duplicates()

number of duplicates:  3


## Validating data types

###  DiabetesPedigreeFunction

In [7]:
# Some readings use '_' in place of the decimal point (e.g. "0_68"), which makes the
# column load as text. Normalise the separator, then cast the column to float.
df['DiabetesPedigreeFunction'] = (
    df['DiabetesPedigreeFunction']
    .astype(str)
    .str.replace('_', '.')
    .astype(float)
)

## Handling inconsistency

### Gender

In [8]:
# Inspect the raw labels, then merge the lowercase 'm' variant into 'M'.
gender_counts = df['Gender'].value_counts()
print(gender_counts)
df['Gender'] = df['Gender'].replace("m", "M")

Gender
F    563
M     59
m     17
Name: count, dtype: int64


### Weight Group

In [9]:
# Inspect the raw labels before cleaning them.
weight_group_counts = df['WeightGroup'].value_counts()
print(weight_group_counts)

# 'MISSING' is a placeholder rather than a real category, so store it as NaN.
df['WeightGroup'] = df['WeightGroup'].replace('MISSING', np.nan)

# Normalise the misspelled / inconsistently spaced labels.
weight_label_fixes = {"obsese_3": "obese_3", "healthy weight": "healthyweight"}
df['WeightGroup'] = df['WeightGroup'].replace(weight_label_fixes)

WeightGroup
obese_1           195
overweight        146
obese_2           126
obsese_3           86
healthy weight     74
MISSING             8
underweight         4
Name: count, dtype: int64


The 'MISSING' label would be misleading if I kept it as it is during preprocessing, because it would be treated as a real weight category. So I converted each 'MISSING' value into NaN.

### Age Group

In [10]:
# Inspect the raw labels, then fold the ambiguous '<65' label into '18 - 44'.
age_group_counts = df['AgeGroup'].value_counts()
print(age_group_counts)
df['AgeGroup'] = df['AgeGroup'].replace("<65", "18 - 44")

AgeGroup
18 - 44    526
45 - 64    102
>65          8
<65          3
Name: count, dtype: int64


`<65` is ambiguous: it could fall within either the 18 - 44 or the 45 - 64 range. I assigned it to 18 - 44, since that is the most common group.

## Handling extreme values

In [11]:
def detect_outliers(df, col, threshold=3):
    """Flag the values of ``df[col]`` whose z-score magnitude reaches ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    col : str
        Label of a numeric column in ``df``.
    threshold : float, default 3
        Minimum absolute z-score for a value to count as an outlier.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``df``; True marks an outlier.

    Notes
    -----
    Missing values are never flagged. A column with zero (or undefined)
    standard deviation yields NaN z-scores, so none of its values are flagged.
    """
    values = df[col]
    # Standardise with the column's own mean and sample standard deviation.
    z_scores = (values - values.mean()) / values.std()
    # Comparisons against NaN evaluate to False, so NaN z-scores are not flagged.
    return z_scores.abs() >= threshold

# Only numeric columns can be z-scored.
num_cols = df.select_dtypes(include="number").columns

# Add one boolean '<column>_outlier' flag column per numeric column.
for col in num_cols:
    flag_name = col + '_outlier'
    df[flag_name] = detect_outliers(df, col)


## Pregnancy Outliers

In [12]:
# Show the rows whose 'Pregnancies' value was flagged as an outlier.
df.loc[df['Pregnancies_outlier']]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,WeightGroup,AgeGroup,Gender,Outcome,Pregnancies_outlier,Glucose_outlier,BloodPressure_outlier,SkinThickness_outlier,Insulin_outlier,DiabetesPedigreeFunction_outlier,Outcome_outlier
371,1000,163.0,72.0,41.0,114.0,0.817,obese_3,45 - 64,F,1,True,False,False,False,False,False,False


In [13]:
# Treat the implausible value 1000 as a data-entry error for 10.
df['Pregnancies'] = df['Pregnancies'].replace({1000: 10})

We can see that there is an unrealistic value (almost certainly a data-entry error) in Pregnancies: one female is recorded as having 1000 pregnancies. We could either remove the row or fix the value. I chose to fix it, because it seems that it was meant to be 10, not 1000.

# Prepare the data appropriately for modeling

## Implement a preprocessing pipeline to avoid data leakage.

In [14]:
# Target: the binary 'Outcome' label.
y = df['Outcome']

# Features: every column except the target and the helper '*_outlier' flag columns.
# The flags were only added for inspection, and 'Outcome_outlier' is derived from
# the target, so none of them belong in the feature matrix.
outlier_flag_cols = [c for c in df.columns if str(c).endswith('_outlier')]
x = df.drop(columns=['Outcome', *outlier_flag_cols])

# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [15]:
# Group the feature columns by dtype so each group can get its own preprocessing.
cat_cols = X_train.select_dtypes(include="object").columns  # text / categorical columns
num_cols = X_train.select_dtypes(include="number").columns  # numeric columns

print(cat_cols)
num_cols

Index(['WeightGroup', 'AgeGroup', 'Gender'], dtype='object')


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction'],
      dtype='object')

## Check for missing values.

### Check for missing values

In [16]:
# Count the missing (NaN) values in each column of the cleaned frame.
df.isna().sum()

Pregnancies                           0
Glucose                               4
BloodPressure                        26
SkinThickness                       185
Insulin                             309
DiabetesPedigreeFunction              0
WeightGroup                           8
AgeGroup                              0
Gender                                0
Outcome                               0
Pregnancies_outlier                   0
Glucose_outlier                       0
BloodPressure_outlier                 0
SkinThickness_outlier                 0
Insulin_outlier                       0
DiabetesPedigreeFunction_outlier      0
Outcome_outlier                       0
dtype: int64

## Handle missing values by using an appropriate imputation strategy (instead of dropping rows).


In [17]:
# Categorical columns: fill missing entries with the most common category of each column.
# `fill_value` is only used with strategy='constant', so it is not passed here.
impute_na = SimpleImputer(strategy='most_frequent')

# Numeric columns: fill missing entries (NaN) with the column mean.
impute_mean = SimpleImputer(strategy='mean')

- I used the 'most_frequent' strategy for the categorical columns, because it works well for categorical data: it keeps the values consistent with the existing categories and fills gaps with the most common one.

- I used mean imputation for the numerical columns, because it is a simple and common choice for numerical data. Replacing missing values with the column average keeps each column's mean unchanged and helps maintain the overall data distribution.

In [18]:
# Learn the imputation values from the training split only, so that nothing
# from the test split influences the fill values.
impute_na.fit(X_train[cat_cols])
impute_mean.fit(X_train[num_cols])

In [19]:
# Apply the already-fitted imputers to both splits (no refitting on the test split).
X_train_cat_imputed, X_test_cat_imputed = (
    impute_na.transform(split[cat_cols]) for split in (X_train, X_test)
)
X_train_num_imputed, X_test_num_imputed = (
    impute_mean.transform(split[num_cols]) for split in (X_train, X_test)
)
X_test_cat_imputed

Unnamed: 0,WeightGroup,AgeGroup,Gender
260,obese_1,18 - 44,F
468,obese_1,18 - 44,F
525,overweight,18 - 44,F
441,obese_1,18 - 44,F
482,obese_1,18 - 44,M
...,...,...,...
397,obese_1,18 - 44,F
38,obese_2,18 - 44,M
304,obese_2,45 - 64,F
207,overweight,18 - 44,F


In [20]:
# Columns with a natural order, and the category order (lowest to highest) for each,
# listed in the same order as ordinal_cols.
ordinal_cols = ['WeightGroup', 'AgeGroup']
weight_categories = ['underweight', 'healthyweight', 'overweight', 'obese_1', 'obese_2', 'obese_3']
age_categories = ['18 - 44','45 - 64', '>65']
ordinal_encoder = OrdinalEncoder(categories=[weight_categories, age_categories])

# Fit the encoder on the training split, then reuse it unchanged on the test split.
train_ordinal = ordinal_encoder.fit_transform(X_train_cat_imputed[ordinal_cols])
X_train_cat_imputed[ordinal_cols] = train_ordinal
test_ordinal = ordinal_encoder.transform(X_test_cat_imputed[ordinal_cols])
X_test_cat_imputed[ordinal_cols] = test_ordinal

print(X_test_cat_imputed.head())


     WeightGroup  AgeGroup Gender
260          3.0       0.0      F
468          3.0       0.0      F
525          2.0       0.0      F
441          3.0       0.0      F
482          3.0       0.0      M


I decided to use ordinal encoding for both weight group and age group, since they are ordinal data: age group orders people from the youngest to the oldest, while weight group orders people from the lowest to the highest weight. For nominal data such as gender, I used one-hot encoding to ensure equal treatment of the categories without making up a meaningless order.

In [21]:
# Categorical columns that were not ordinal-encoded are one-hot encoded instead.
ohe_cols = cat_cols.drop(ordinal_cols)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_cat_ohe = ohe.fit_transform(X_train_cat_imputed[ohe_cols])
X_test_cat_ohe = ohe.transform(X_test_cat_imputed[ohe_cols])

# Standardise the imputed numeric columns with statistics learned from the training split.
scaler = StandardScaler()
scaler.fit(X_train_num_imputed)
X_train_num_scaled = scaler.transform(X_train_num_imputed)
X_test_num_scaled = scaler.transform(X_test_num_imputed)

We used a one-hot encoder because the 'Gender' column is nominal data. 'Gender' cannot be ordinal: if we encoded it as ordinal, that would mean treating 'Male' as higher than 'Female'. There should be no ranking, so both categories are treated equally.

In [22]:
nominal_cols = ['Gender']

# One small pipeline per column group: impute first, then encode or scale.
cat_pipe_ordinal = make_pipeline(impute_na, ordinal_encoder)
cat_pipe_onehot = make_pipeline(impute_na, ohe)
num_pipe = make_pipeline(impute_mean, scaler)

# (name, transformer, columns) entry for the numeric columns.
num_tuple = ('numeric', num_pipe, num_cols)

# Route each column group through its own pipeline and keep the plain feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', cat_pipe_ordinal, ordinal_cols),
        ('onehot', cat_pipe_onehot, nominal_cols),
        num_tuple,
    ],
    verbose_feature_names_out=False,
)


# Training and evaluation



## Create your model

In [23]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

## Put your preprocessing and models into pipelines

In [24]:
# Chain preprocessing and the classifier so that fitting on X_train learns both the
# preprocessing statistics and the model from the training split only.
pipeline = make_pipeline(preprocessor, model)
pipeline.fit(X_train, y_train)

The target variable in this project represents a binary outcome: whether a person has diabetes or not, encoded as 0 (no diabetes) and 1 (diabetes). I used a classification model because the goal is to predict whether a person has diabetes or not, and classification models are designed for this type of task, where the outcome has fixed categories.

# Model evaluation

In [25]:
# Predict the class label (0 or 1) for every row of the held-out test split.
test_preds = pipeline.predict(X_test)
test_preds

array([0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0], dtype=int64)

## Predicted outcome vs Actual Outcome

In [26]:
# Compare the first few predictions with the true labels side by side.
n_preview = 10
print(f"Predicted: {test_preds[:n_preview]}")
print(f"Actual   : {y_test.iloc[:n_preview].to_numpy()}")

Predicted: [0 1 1 0 0 1 1 0 1 0]
Actual   : [0 1 1 1 1 1 1 1 0 0]


In [27]:
# Precision / recall / F1 per class on the held-out test split.
test_report = classification_report(y_test, test_preds)
print("Test Data: ")
print(test_report)

Test Data: 
              precision    recall  f1-score   support

           0       0.80      0.85      0.82        93
           1       0.77      0.70      0.73        67

    accuracy                           0.79       160
   macro avg       0.78      0.78      0.78       160
weighted avg       0.79      0.79      0.79       160



In [28]:
# Same report on the training split, to compare against the test scores.
train_preds = pipeline.predict(X_train)
train_report = classification_report(y_train, train_preds)
print("Train Data: ")
print(train_report)

Train Data: 
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       278
           1       0.72      0.64      0.68       201

    accuracy                           0.74       479
   macro avg       0.74      0.73      0.73       479
weighted avg       0.74      0.74      0.74       479



Conclusion: I'd say my model is acceptable. The scores in both reports are similar, with only a small difference (accuracy of 0.79 on the test data vs. 0.74 on the training data), so there is no clear sign of overfitting or underfitting.