In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, roc_auc_score


In [2]:
# reading my csv file of stroke
dataset = pd.read_csv('stroke_data.csv')
dataset

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
40905,1.0,38.0,0,0,0,4,1,120.94,29.7,1,0
40906,0.0,53.0,0,0,1,4,0,77.66,40.8,0,0
40907,1.0,32.0,0,0,1,2,0,231.95,33.2,0,0
40908,1.0,42.0,0,0,1,3,0,216.38,34.5,0,0


In [23]:
# code to know different type of values in dataset.
work_types = dataset['smoking_status'].unique()
work_types

array([1, 0])

In [3]:
# Identify columns with null values
# This identifies all columns in the dataset that contain at least one null (missing) value
null_columns = dataset.columns[dataset.isnull().any()]
# .any() is used to summarize whether a column contains any null values (True) or not (False).

# Print the names of the columns and their count of null values
print("Columns with null values and their count:")

# This calculates the total number of null (missing) values for each identified column and prints it
print(dataset[null_columns].isnull().sum())


Columns with null values and their count:
sex    3
dtype: int64


In [4]:
dataset.columns

Index(['sex', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [5]:
dataset.shape

(40910, 11)

In [6]:
dataset.size

450010

In [7]:
dataset.describe()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,40907.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0
mean,0.555162,51.327255,0.213835,0.127719,0.82134,3.461134,0.514886,122.075901,30.406355,0.488609,0.500122
std,0.496954,21.623969,0.410017,0.333781,0.383072,0.780919,0.499784,57.561531,6.835072,0.499876,0.500006
min,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,55.12,11.5,0.0,0.0
25%,0.0,35.0,0.0,0.0,1.0,3.0,0.0,78.75,25.9,0.0,0.0
50%,1.0,52.0,0.0,0.0,1.0,4.0,1.0,97.92,29.4,0.0,1.0
75%,1.0,68.0,0.0,0.0,1.0,4.0,1.0,167.59,34.1,1.0,1.0
max,1.0,103.0,1.0,1.0,1.0,4.0,1.0,271.74,92.0,1.0,1.0


In [8]:
clm = dataset.columns[dataset.isnull().any()]
clm

Index(['sex'], dtype='object')

In [9]:
# code to print rows only which is having NAN value

missing_value = dataset[dataset['sex'].isnull()]
missing_value

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
22478,,39.0,0,0,1,4,1,70.56,28.6,1,0
28908,,36.0,0,0,1,4,1,70.56,28.6,1,0
35184,,77.0,0,0,1,4,1,70.56,28.6,1,0


In [10]:
# filling NAN value using forward fill method
# using inplace=True, the operation modifies the object (DataFrame or Series) in place without creating a copy.made change in original copy 
dataset['sex'].fillna(method = 'ffill', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['sex'].fillna(method = 'ffill', inplace = True)
  dataset['sex'].fillna(method = 'ffill', inplace = True)


In [11]:
# now checking NAN value in dataset after using ffil method.
dataset.isnull().sum()
# no one column is having null value


sex                  0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [12]:
# separating and dependent coloumns of dataset.

x = dataset.iloc[:,0:10]
y = dataset.iloc[:,-1]


In [13]:
# spliting into train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=4)


In [14]:
# traintraining model using logistic regrassion

model = LogisticRegression( class_weight = 'balanced', random_state = 4, max_iter = 500, solver='saga')

# saga: Good for large datasets and supports L1 regularization.

# L1 Regularization (also called Lasso Regularization) is a technique used in machine
# learning to prevent overfitting and enhance model generalization by penalizing the absolute magnitude of the
# coefficients of a model. It's commonly applied in linear models like Logistic Regression and Linear Regression.

# max_iter=500 allows the optimization algorithm to perform up to 500 iterations to find the best model weights.
# It’s useful to prevent premature stopping when the default limit (100) isn’t enough for convergence in complex datasets.

model.fit(x_train,y_train)



# 1) class_weight='balanced':

# Addresses class imbalance in the dataset.

# In an imbalanced dataset (e.g., fewer cases of stroke 1 compared to no stroke 0), the model tends to favor the majority class.

# The balanced option automatically adjusts the weights inversely proportional to the class frequencies:

# Weight of Class = Total Samples/(2 × Number of Samples in Class)
 
# This ensures the model pays equal attention to both classes.

# 2) random_state=42:

# Sets a seed for the random number generator.
# Ensures reproducibility of results.
# Any fixed integer can be used (e.g., 42 is commonly used).



In [15]:
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.66      0.76      0.70      6067
           1       0.72      0.61      0.66      6206

    accuracy                           0.68     12273
   macro avg       0.69      0.69      0.68     12273
weighted avg       0.69      0.68      0.68     12273

ROC-AUC Score: 0.7510627512595546


In [16]:
import pickle
pickle.dump(model,open('stroke.pkl','wb'))


In [17]:
model1 = pickle.load(open('stroke.pkl','rb'))
prd = model1.predict([[1.0, 63.0, 0, 1, 1, 4, 1, 228.69,	36.6, 1]])
prd



array([1])