In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer


In [10]:
data=pd.read_csv('dataset.csv')
data.columns=[column.strip() for column in data.columns]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


In [11]:

# Drop 'id' column as it's likely not useful for prediction
data = data.drop('id', axis=1)

In [12]:
# Split the data into features (X) and target variable (y)
X = data.drop('stroke', axis=1)
y = data['stroke']


In [13]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define columns for different preprocessing steps
# Numeric features for normalization
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Categorical features for one-hot encoding
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [15]:
# Create transformers for different preprocessing steps
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),  # Handle missing values using median
    StandardScaler()  # Normalize the numeric features
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  # Handle missing values using the most frequent value
    OneHotEncoder()  # One-hot encode categorical features
)

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create the SVM model with the preprocessor
model = make_pipeline(
    preprocessor,
    SVC(kernel='linear')  # Use a linear kernel for SVM
)

# Fit the model to the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data
accuracy = model.score(X_test, y_test)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 0.9814516129032258
