This notebook generates summary statistics for all the variables in the dataset and transforms the categorical variables into integer codes using the LabelEncoder class in scikit-learn.

Finally, the missing values of the bmi variable are imputed with grouped means: each missing bmi is replaced by the mean bmi of the rows that share the same values of the other (encoded or binned) variables.

Note also that the only variable in this dataset with missing values is the bmi variable.

The dataset has been obtained from https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

In [24]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [25]:
# Read the stroke dataset from the working directory and preview the first six rows
csv_path = "healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)
print(df.head(6))

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   
5  56669    Male  81.0             0              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   
5        Private          Urban             186.21  29.0  for

In [26]:
# Number of missing entries in each column
print(df.isna().sum())

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [27]:
# Drop the id column: it only identifies the row and is irrelevant for
# meaningful predictions of any outcome variable in this dataset.
# The column is removed by name rather than by position so that the cell is
# safe to re-run -- a second positional slice (iloc[:, 1:]) would silently
# drop 'gender' as well. errors='ignore' makes a re-run a no-op.
df = df.drop(columns='id', errors='ignore')
print(df.head(6))

   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   
3  Female  49.0             0              0          Yes        Private   
4  Female  79.0             1              0          Yes  Self-employed   
5    Male  81.0             0              0          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0          Urban             228.69  36.6  formerly smoked       1  
1          Rural             202.21   NaN     never smoked       1  
2          Rural             105.92  32.5     never smoked       1  
3          Urban             171.23  34.4           smokes       1  
4          Rural             174.12  24.0     never smoked       1  
5          Urban             186.21  29.0  formerly s

In [28]:
# Inspect the dtype of every column as read from the CSV
print (df.dtypes)

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [29]:
# Cast the text-valued columns to the pandas category dtype in a single assignment
categorical_names = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df[categorical_names] = df[categorical_names].astype('category')

# Confirm the conversion
print(df.dtypes)

gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object


In [30]:
# For every category-typed variable, print how many rows fall into each level.
category_columns = df.select_dtypes(include='category').columns

for col in category_columns:
    print(f"\nCount of each level in '{col}':")
    for category, count in df[col].value_counts().items():
        print(f"{category}: {count}")


Count of each level in 'gender':
Female: 2994
Male: 2115
Other: 1

Count of each level in 'ever_married':
Yes: 3353
No: 1757

Count of each level in 'work_type':
Private: 2925
Self-employed: 819
children: 687
Govt_job: 657
Never_worked: 22

Count of each level in 'Residence_type':
Urban: 2596
Rural: 2514

Count of each level in 'smoking_status':
never smoked: 1892
Unknown: 1544
formerly smoked: 885
smokes: 789


In [31]:
# Summary statistics for the float64 variables in this dataset.
# Only float64 columns are selected, so the int64 columns
# (hypertension, heart_disease, stroke) are not part of this table.

numeric_columns = df.select_dtypes(include=['float64']).columns
numeric_summary = df[numeric_columns].describe()
print(numeric_summary)


               age  avg_glucose_level          bmi
count  5110.000000        5110.000000  4909.000000
mean     43.226614         106.147677    28.893237
std      22.612647          45.283560     7.854067
min       0.080000          55.120000    10.300000
25%      25.000000          77.245000    23.500000
50%      45.000000          91.885000    28.100000
75%      61.000000         114.090000    33.100000
max      82.000000         271.740000    97.600000


In [32]:
#Obtain the count information for the values in the stroke, hypertension, and  heart_disease variables

def get_counts(df, column_name):
    """Print how often each value occurs in an integer-coded column.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to tabulate.

    Raises:
        ValueError: If the column contains values that cannot be converted
            to numbers (``pd.to_numeric`` is called with ``errors='raise'``).

    The counts are printed ordered by value. Nothing is returned and ``df``
    is left unchanged.
    """
    # Convert a local Series only: a reporting helper should not rewrite the
    # caller's column. The int64 cast gives a clean integer index in the output.
    values = pd.to_numeric(df[column_name], errors='raise').astype('int64')

    # Sort by index so the rows are ordered by value rather than by frequency.
    value_counts = values.value_counts().sort_index()
    print(f"Frequency counts for '{column_name}':\n{value_counts}\n")

for flag_column in ('stroke', 'hypertension', 'heart_disease'):
    get_counts(df, flag_column)

Frequency counts for 'stroke':
stroke
0    4861
1     249
Name: count, dtype: int64

Frequency counts for 'hypertension':
hypertension
0    4612
1     498
Name: count, dtype: int64

Frequency counts for 'heart_disease':
heart_disease
0    4834
1     276
Name: count, dtype: int64



In [None]:
#Age
#Create a banded version of age, Age_temp, whose only purpose is to help
#impute the missing values of the bmi variable.
#
#Other binnings that were tried while choosing the band width:
#   [0, 10, 20, 30, 40, 50, 60, 70, 80, inf]   (labels 0-8)
#   [0, 15, 30, 45, 60, 75, inf]               (labels 0-5)
#
#Note that the minimum value for Age is 0.08 and the maximum value is 82

# Bands [0, 20), [20, 40), [40, 60), [60, inf) are coded 0-3.
bins = [0, 20, 40, 60,  float('inf')]
labels = [0, 1, 2, 3]

# right=False closes each band on the left, so an age of exactly 20 is coded 1.
# The integer labels from pd.cut are used directly as the codes. Passing them
# through LabelEncoder is unnecessary and fragile: the encoder renumbers from 0
# over the classes it actually sees, so an empty band would shift later codes.
df['Age_temp'] = pd.cut(df['age'], bins=bins, labels=labels, right=False).astype('int64')
print(df.head(3))





   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  
0          Urban             228.69  36.6  formerly smoked       1         3  
1          Rural             202.21   NaN     never smoked       1         3  
2          Rural             105.92  32.5     never smoked       1         3  


In [34]:
#avg_glucose_level

#Create a banded version of avg_glucose_level, avg_glucose_level_temp, whose
#only purpose is to help impute the missing values of the bmi variable.

#Note that the minimum value for avg_glucose_level is 55.12 and  the maximum value is 271.74

#Other binnings that were tried while choosing the band width:
#   [50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250, 270, inf]   (labels 1-12)
#   [50, 90, 130, 170, 210, inf]                                     (labels 1-5)
#   [50, 100, 150, 200, inf]                                         (labels 1-4)
#   [50, 150, inf]                                                   (labels 1-2)

# Bands [50, 120), [120, 190), [190, inf) are coded 1-3.
bins = [50, 120, 190, float('inf')]
labels = [1, 2, 3]

# The integer labels from pd.cut are used directly as the codes. The earlier
# LabelEncoder round trip (with "+ 1" to re-align) only reproduces the labels
# when every band is populated. A value below 50 would fall outside every band
# and make the int64 cast fail loudly instead of being coded silently.
df['avg_glucose_level_temp'] = pd.cut(
    df['avg_glucose_level'], bins=bins, labels=labels, right=False
).astype('int64')

# Print the DataFrame with the new banded column
print(df.head(3))

   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  \
0          Urban             228.69  36.6  formerly smoked       1         3   
1          Rural             202.21   NaN     never smoked       1         3   
2          Rural             105.92  32.5     never smoked       1         3   

   avg_glucose_level_temp  
0                       3  
1                       3  
2                       1  


In [35]:
#Frequency counts of the newly created variables Age_temp and avg_glucose_level_temp
for derived_column in ('Age_temp', 'avg_glucose_level_temp'):
    get_counts(df, derived_column)

Frequency counts for 'Age_temp':
Age_temp
0     966
1    1204
2    1564
3    1376
Name: count, dtype: int64

Frequency counts for 'avg_glucose_level_temp':
avg_glucose_level_temp
1    3991
2     589
3     530
Name: count, dtype: int64



In [36]:
#Gender variable
#Transform the Gender variable into a numerical variable using the label encoder method

# Encode the 'gender' levels as integers and report how often each code occurs
label_encoder = LabelEncoder()
df['gender_encoded'] = label_encoder.fit_transform(df['gender'])
get_counts(df,'gender_encoded')

# Pair every original level with the integer code assigned to it
class_labels = {
    level: code
    for level, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Show the level -> code mapping
print("Gender Mapping:")
for key, value in class_labels.items():
    print(f"{key}: {value}")

Frequency counts for 'gender_encoded':
gender_encoded
0    2994
1    2115
2       1
Name: count, dtype: int64

Gender Mapping:
Female: 0
Male: 1
Other: 2


In [37]:
#ever_married
#Transform the ever_married variable into a numerical variable using the label encoder method

# Encode the 'ever_married' levels as integers in a new column
label_encoder = LabelEncoder()
df['ever_married_encoded'] = label_encoder.fit_transform(df['ever_married'])

# Preview the frame with the encoded column appended
print(df.head(3))

# Pair every original level with the integer code assigned to it
class_labels = {
    level: code
    for level, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Show the level -> code mapping
print("ever_married Mapping:")
for key, value in class_labels.items():
    print(f"{key}: {value}")


   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  \
0          Urban             228.69  36.6  formerly smoked       1         3   
1          Rural             202.21   NaN     never smoked       1         3   
2          Rural             105.92  32.5     never smoked       1         3   

   avg_glucose_level_temp  gender_encoded  ever_married_encoded  
0                       3               1                     1  
1                       3               0                     1  
2                       1               1                     1  
ever_married Mapping:
No: 0
Yes: 1


In [38]:
#Work type

#Transform the Work type variable into a numerical variable using the label encoder method

# Encode the 'work_type' levels as integers in a new column
label_encoder = LabelEncoder()
df['work_type_encoded'] = label_encoder.fit_transform(df['work_type'])

# Preview the frame with the encoded column appended
print(df.head(3))

# Pair every original level with the integer code assigned to it
class_labels = {
    level: code
    for level, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Show the level -> code mapping
print("work_type Mapping:")
for key, value in class_labels.items():
    print(f"{key}: {value}")




   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  \
0          Urban             228.69  36.6  formerly smoked       1         3   
1          Rural             202.21   NaN     never smoked       1         3   
2          Rural             105.92  32.5     never smoked       1         3   

   avg_glucose_level_temp  gender_encoded  ever_married_encoded  \
0                       3               1                     1   
1                       3               0                     1   
2                       1               1                     1   

   work_type_encoded  
0                  2  
1                  3  
2                  2  
work_type Map

In [39]:
#Residence type

#Transform the Residence type variable into a numerical variable using the label encoder method

# Encode the 'Residence_type' levels as integers in a new column
label_encoder = LabelEncoder()
df['Residence_type_encoded'] = label_encoder.fit_transform(df['Residence_type'])

# Preview the frame with the encoded column appended
print(df.head(3))

# Pair every original level with the integer code assigned to it
class_labels = {
    level: code
    for level, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}

# Show the level -> code mapping
print("Residence_type Mapping:")
for key, value in class_labels.items():
    print(f"{key}: {value}")



   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  \
0          Urban             228.69  36.6  formerly smoked       1         3   
1          Rural             202.21   NaN     never smoked       1         3   
2          Rural             105.92  32.5     never smoked       1         3   

   avg_glucose_level_temp  gender_encoded  ever_married_encoded  \
0                       3               1                     1   
1                       3               0                     1   
2                       1               1                     1   

   work_type_encoded  Residence_type_encoded  
0                  2                       1  
1          

In [40]:
#smoking status
#Transform the smoking status variable into a numerical variable using the label encoder method

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'smoking_status' column
df['smoking_status_encoded'] = label_encoder.fit_transform(df['smoking_status'])

# Print the DataFrame with the new encoded column
print(df.head(3))

# Get the unique classes and their corresponding labels
class_labels = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# The header names the variable being mapped; it previously read
# "Residence_type Mapping:", copied over from the Residence_type cell.
print("smoking_status Mapping:")
for key, value in class_labels.items():
    print(f"{key}: {value}")


   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  Age_temp  \
0          Urban             228.69  36.6  formerly smoked       1         3   
1          Rural             202.21   NaN     never smoked       1         3   
2          Rural             105.92  32.5     never smoked       1         3   

   avg_glucose_level_temp  gender_encoded  ever_married_encoded  \
0                       3               1                     1   
1                       3               0                     1   
2                       1               1                     1   

   work_type_encoded  Residence_type_encoded  smoking_status_encoded  
0                  2              

In [41]:
# Inspect the dtypes again now that the *_temp and *_encoded columns have been added
print (df.dtypes)

gender                    category
age                        float64
hypertension                 int64
heart_disease                int64
ever_married              category
work_type                 category
Residence_type            category
avg_glucose_level          float64
bmi                        float64
smoking_status            category
stroke                       int64
Age_temp                     int64
avg_glucose_level_temp       int64
gender_encoded               int64
ever_married_encoded         int64
work_type_encoded            int64
Residence_type_encoded       int64
smoking_status_encoded       int64
dtype: object


In [42]:

#  Imputes missing BMI values using grouped means based on other relevant variables in this dataframe
def impute_bmi_with_grouped_means(df, grouping_columns=None):
    """Fill missing ``bmi`` values with the mean bmi of each row's group.

    Args:
        df: DataFrame holding a ``bmi`` column and every grouping column.
        grouping_columns: Column names that define the groups. When omitted,
            the five encoded categorical variables plus ``Age_temp`` and
            ``avg_glucose_level_temp`` are used.

    Returns:
        The DataFrame that was passed in; its ``bmi`` column is overwritten
        with the imputed values.

    A row keeps its missing bmi when no row in its group has an observed bmi,
    because the mean of that group is itself missing.
    """
    if grouping_columns is None:
        grouping_columns = [
            'gender_encoded',
            'ever_married_encoded', 'work_type_encoded',
            'Residence_type_encoded', 'smoking_status_encoded',
            'Age_temp', 'avg_glucose_level_temp',
        ]

    # transform('mean') returns one value per row, aligned with df's index,
    # so it can be handed straight to fillna.
    grouped_means = df.groupby(grouping_columns)['bmi'].transform('mean')

    # Only the missing entries are replaced; observed bmi values are kept.
    df['bmi'] = df['bmi'].fillna(grouped_means)
    return df

# Impute missing BMI values
# (the function assigns the filled column to df['bmi'] and returns the same frame)
df = impute_bmi_with_grouped_means(df)
print (df.head(3))


    



   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   

  Residence_type  avg_glucose_level        bmi   smoking_status  stroke  \
0          Urban             228.69  36.600000  formerly smoked       1   
1          Rural             202.21  30.007143     never smoked       1   
2          Rural             105.92  32.500000     never smoked       1   

   Age_temp  avg_glucose_level_temp  gender_encoded  ever_married_encoded  \
0         3                       3               1                     1   
1         3                       3               0                     1   
2         3                       1               1                     1   

   work_type_encoded  Residence_type_encoded  smoking_status_encoded  
0             

In [43]:
# Re-count the missing entries per column after the imputation
print(df.isna().sum())

gender                    0
age                       0
hypertension              0
heart_disease             0
ever_married              0
work_type                 0
Residence_type            0
avg_glucose_level         0
bmi                       4
smoking_status            0
stroke                    0
Age_temp                  0
avg_glucose_level_temp    0
gender_encoded            0
ever_married_encoded      0
work_type_encoded         0
Residence_type_encoded    0
smoking_status_encoded    0
dtype: int64


In [44]:
# List the rows whose BMI is still missing, if there are any
missing_bmi_rows = df.loc[df['bmi'].isna()]
if missing_bmi_rows.empty:
    print("No missing BMI values found.")
else:
    print("Rows with missing BMI values:")
    print(missing_bmi_rows)

Rows with missing BMI values:
      gender   age  hypertension  heart_disease ever_married work_type  \
19      Male  57.0             0              1           No  Govt_job   
1102  Female  23.0             0              0           No   Private   
1596    Male  47.0             0              0           No   Private   
2752  Female  38.0             0              0          Yes   Private   

     Residence_type  avg_glucose_level  bmi smoking_status  stroke  Age_temp  \
19            Urban             217.08  NaN        Unknown       1         2   
1102          Rural             193.22  NaN         smokes       0         1   
1596          Rural             237.17  NaN        Unknown       0         2   
2752          Rural             217.55  NaN         smokes       0         1   

      avg_glucose_level_temp  gender_encoded  ever_married_encoded  \
19                         3               1                     0   
1102                       3               0              

In [45]:
# Imputing again with coarser groups: the binned glucose variable
# (avg_glucose_level_temp) used earlier is left out of the grouping.
def impute_bmi_with_grouped_means(df, grouping_columns=None):
    """Fill missing ``bmi`` values with the mean bmi of each row's group.

    Args:
        df: DataFrame holding a ``bmi`` column and every grouping column.
        grouping_columns: Column names that define the groups. When omitted,
            the five encoded categorical variables plus ``Age_temp`` are used.

    Returns:
        The DataFrame that was passed in; its ``bmi`` column is overwritten
        with the imputed values.

    A row keeps its missing bmi when no row in its group has an observed bmi,
    because the mean of that group is itself missing.
    """
    if grouping_columns is None:
        grouping_columns = [
            'gender_encoded',
            'ever_married_encoded', 'work_type_encoded',
            'Residence_type_encoded', 'smoking_status_encoded',
            'Age_temp',
        ]

    # transform('mean') returns one value per row, aligned with df's index,
    # so it can be handed straight to fillna.
    grouped_means = df.groupby(grouping_columns)['bmi'].transform('mean')

    # Only the missing entries are replaced; observed bmi values are kept.
    df['bmi'] = df['bmi'].fillna(grouped_means)
    return df

# Impute the BMI values that are still missing, using the function as redefined in this cell
df = impute_bmi_with_grouped_means(df)

In [46]:
# Final count of missing entries per column
print(df.isna().sum())

gender                    0
age                       0
hypertension              0
heart_disease             0
ever_married              0
work_type                 0
Residence_type            0
avg_glucose_level         0
bmi                       0
smoking_status            0
stroke                    0
Age_temp                  0
avg_glucose_level_temp    0
gender_encoded            0
ever_married_encoded      0
work_type_encoded         0
Residence_type_encoded    0
smoking_status_encoded    0
dtype: int64
