In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored on Drive can be read from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load the data

In [None]:
# Read the raw stroke dataset from Google Drive. The path is anchored at the
# absolute mount point ('/content/drive') so that loading does not depend on
# the kernel's current working directory.
stroke_data = pd.read_csv('/content/drive/My Drive/healthcare-dataset-stroke-data (2).csv')

# MANUAL PREPROCESSING STEPS

# Step 1: Remove records with 'Other' gender

In [None]:
# Keep every record whose gender is not 'Other'. Boolean indexing returns a
# slice of the loaded frame; taking an explicit copy makes `stroke_data` an
# independent DataFrame, so later writes to it do not raise
# SettingWithCopyWarning.
stroke_data = stroke_data[stroke_data['gender'] != 'Other'].copy()
print("Records after removing 'Other' gender:", len(stroke_data))

Records after removing 'Other' gender: 5109


# Step 2: Handle missing BMI values manually

In [None]:
# Compute mean BMI manually (sum / count) over the non-missing values only
bmi_observed = stroke_data['bmi'].dropna()
bmi_count = len(bmi_observed)
if bmi_count == 0:
    # With no observed BMI the mean is undefined; fail with a clear message
    # instead of a ZeroDivisionError.
    raise ValueError("Cannot impute BMI: column 'bmi' has no non-missing values")
bmi_sum = bmi_observed.sum()

bmi_mean = bmi_sum / bmi_count
print(f"Manually calculated BMI mean: {bmi_mean:.2f}")

# Fill missing BMI values with the mean in one vectorized step. assign()
# builds a new frame, so nothing is written into a slice of another
# DataFrame, and re-running the cell gives the same result.
stroke_data = stroke_data.assign(bmi=stroke_data['bmi'].fillna(bmi_mean))

Manually calculated BMI mean: 28.89


# Step 3: One-hot encode categorical variables manually

In [None]:
# Function to one-hot encode a categorical column
def manual_one_hot_encode(df, column_name, prefix):
    """One-hot encode a single categorical column by hand.

    For every distinct, non-missing value ``v`` in ``df[column_name]`` a new
    0/1 integer column named ``f"{prefix}_{v}"`` is added (in order of first
    appearance), and the original column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the categorical column to encode.
    prefix : str
        Prefix used for the generated indicator column names.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``column_name`` and with one indicator
        column per category. Rows whose value is missing get 0 in every
        indicator column.
    """
    # unique() reports NaN/None as a "value" too. Nothing compares equal to
    # NaN, so encoding it would only add an all-zero "<prefix>_nan" column;
    # missing values are therefore excluded from the category list.
    categories = df[column_name].dropna().unique()
    result = df.copy()

    for value in categories:
        # Boolean membership mask converted to a 0/1 integer indicator.
        result[f"{prefix}_{value}"] = (df[column_name] == value).astype("int64")

    # Drop the original column
    return result.drop(columns=column_name)

# Columns to expand into 0/1 indicator columns, processed in this order.
categorical_columns = [
    'hypertension',
    'heart_disease',
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Work on a separate copy so `stroke_data` keeps its original columns, then
# encode one column per pass, using the column's own name as the prefix.
stroke_data_encoded = stroke_data.copy()
for categorical_col in categorical_columns:
    stroke_data_encoded = manual_one_hot_encode(
        stroke_data_encoded,
        categorical_col,
        categorical_col,
    )

# Step 4: Rename specific columns manually

In [None]:
# Map the generated 0/1 indicator names of the two binary flags to
# readable no_/yes_ names.
renaming = dict(
    hypertension_0="no_hypertension",
    hypertension_1="yes_hypertension",
    heart_disease_0="no_heart_disease",
    heart_disease_1="yes_heart_disease",
)

# Apply the mapping to the column labels; columns not listed keep their name.
stroke_data_encoded = stroke_data_encoded.rename(renaming, axis="columns")

# Step 5: Drop the ID column

In [None]:
# Drop the 'id' column. errors='ignore' keeps this cell re-runnable: on a
# second execution 'id' is already gone and a plain drop would raise KeyError.
stroke_data_encoded = stroke_data_encoded.drop(columns='id', errors='ignore')

# Step 6: Standardize numerical features manually

In [None]:
numeric_columns = ['age', 'avg_glucose_level', 'bmi']

# Calculate mean and std for each numeric column "by hand": the formulas are
# spelled out (sum / n and sqrt(sum of squared deviations / n)) but evaluated
# on whole arrays instead of one Python-level element at a time.
stats = {}
for col in numeric_columns:
    # Plain float array: np.sum() does not skip NaN, so a missing value still
    # surfaces as a NaN mean rather than being silently ignored.
    values = stroke_data_encoded[col].to_numpy(dtype=float)
    col_count = len(values)
    if col_count == 0:
        # The mean of zero values is undefined; fail with a clear message
        # instead of a ZeroDivisionError.
        raise ValueError(f"Cannot standardize '{col}': the column is empty")

    col_sum = np.sum(values)
    col_mean = col_sum / col_count

    # Calculate standard deviation (population form: divides by n, not n - 1)
    sum_squared_diff = np.sum((values - col_mean) ** 2)
    col_std = np.sqrt(sum_squared_diff / col_count)

    stats[col] = {'mean': col_mean, 'std': col_std}
    print(f"Column {col}: mean = {col_mean:.2f}, std = {col_std:.2f}")

# Apply standardization one whole column at a time
for col in numeric_columns:
    # A constant column has std == 0, and dividing by it would fill the column
    # with NaN/inf. Scale by 1 instead so the centred column becomes all zeros.
    scale = stats[col]['std'] if stats[col]['std'] > 0 else 1.0
    stroke_data_encoded[col] = (stroke_data_encoded[col] - stats[col]['mean']) / scale


Column age: mean = 43.23, std = 22.61
Column avg_glucose_level: mean = 106.14, std = 45.28
Column bmi: mean = 28.89, std = 7.70


In [None]:
# Inspect the preprocessed frame; as the last expression of the cell it is
# rendered as a table (pandas truncates the display for a large frame).
stroke_data_encoded

Unnamed: 0,age,avg_glucose_level,bmi,stroke,no_hypertension,yes_hypertension,yes_heart_disease,no_heart_disease,gender_Male,gender_Female,...,work_type_Self-employed,work_type_Govt_job,work_type_children,work_type_Never_worked,Residence_type_Urban,Residence_type_Rural,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,smoking_status_Unknown
0,1.051242,2.706450,1.001034e+00,1,1,0,1,0,1,0,...,0,0,0,0,1,0,1,0,0,0
1,0.785889,2.121652,1.476935e-14,1,1,0,0,1,0,1,...,1,0,0,0,0,1,0,1,0,0
2,1.626174,-0.004867,4.683922e-01,1,1,0,1,0,1,0,...,0,0,0,0,0,1,0,1,0,0
3,0.255182,1.437473,7.152261e-01,1,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
4,1.581949,1.501297,-6.358651e-01,1,0,1,0,1,0,1,...,1,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1.626174,-0.494481,1.476935e-14,0,0,1,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
5106,1.670400,0.420922,1.442737e+00,0,1,0,0,1,0,1,...,1,0,0,0,1,0,0,1,0,0
5107,-0.363976,-0.511266,2.215582e-01,0,1,0,0,1,0,1,...,1,0,0,0,0,1,0,1,0,0
5108,0.343633,1.328375,-4.280049e-01,0,1,0,0,1,1,0,...,0,0,0,0,0,1,1,0,0,0


In [None]:
# Write the processed frame to a CSV file in the working directory; the row
# index is left out of the file.
processed_csv = 'stroke_data_processed.csv'
stroke_data_encoded.to_csv(processed_csv, index=False)

# Hand the file that was just written to Colab's download helper.
from google.colab import files
files.download(processed_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>