In [38]:
import os
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [39]:
# Load the dataset.
# The CSV location can be overridden through the STROKE_DATASET_PATH
# environment variable so the notebook is runnable on machines other than the
# one it was written on; when the variable is unset, the original hard-coded
# location is used unchanged.
DATASET_PATH = os.environ.get(
    'STROKE_DATASET_PATH',
    r'C:\\Users\\Nikitha\\Downloads\\FDM_Mini_Project-main\\FDM_Mini_Project-main\\strokeDataset.csv',
)
data = pd.read_csv(DATASET_PATH)

In [40]:
# Number of rows and columns in this dataset, shown as a (rows, columns) tuple
data.shape

(5110, 12)

In [41]:
# Remove the 'id' column from the frame; every other column is kept
data = data.drop(columns=['id'])

In [42]:
# Handle missing values in the 'bmi' column by filling them with the column mean.
#
# Each step assigns its result back to data['bmi'] rather than calling an
# inplace method on the selected column. The inplace form acts on an
# intermediate object: pandas 2.x flags it as chained assignment, and with
# Copy-on-Write enabled it leaves `data` unchanged, so missing values would
# silently remain in the frame.
data['bmi'] = data['bmi'].replace('N/A', np.nan).astype(float)
mean_bmi = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(mean_bmi)


In [43]:
# Label-encoding setup: an (initially empty) registry keyed by column name for
# the fitted encoders, and the list of categorical columns to be encoded.
label_encoders = dict()
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]


In [44]:
# Replace every categorical column with its integer label codes and remember
# the fitted encoder for that column in label_encoders.
for column_name in categorical_columns:
    encoder = LabelEncoder().fit(data[column_name])
    data[column_name] = encoder.transform(data[column_name])
    label_encoders[column_name] = encoder


In [45]:
# Split the frame into the target ('stroke') and the remaining feature columns
y = data['stroke']
X = data.drop(columns=['stroke'])

In [46]:
# Print the feature frame for inspection
print(X)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
1          0  61.0             0              0             1          3   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5105       0  80.0             1              0             1          2   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level        bmi  smoking_status  
0                 

In [47]:
# Print the target column ('stroke') for inspection
print(y)

0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64


In [48]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): no `stratify` argument is passed, so the class proportions in
# the two splits are left to chance — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [49]:
# Handle class imbalance with SMOTE. Only the training split is resampled;
# X_test / y_test are not passed to the sampler.
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(
    X_train,
    y_train,
)

In [50]:
# Build a linear-kernel SVM with balanced class weights and fit it on the
# SMOTE-resampled training data.
# NOTE(review): the features are passed to the SVM without any scaling step —
# confirm that is intended.
svm_params = {
    'kernel': 'linear',
    'class_weight': 'balanced',
    'random_state': 42,
}
svm_model = SVC(**svm_params)
svm_model.fit(X_train_resampled, y_train_resampled)

In [51]:
# Make predictions on the test set: the fitted SVM is applied to the held-out
# features X_test, which were not passed to SMOTE or to fit()
y_pred = svm_model.predict(X_test)


In [52]:
# Score the test-set predictions against the true labels: a text
# classification report, the confusion matrix, and the overall accuracy.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [53]:
# Print each metric under its heading, in the order accuracy, confusion
# matrix, classification report.
for heading, value in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(heading, value)

Accuracy: 0.7788649706457925
Confusion Matrix:
 [[756 204]
 [ 22  40]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.79      0.87       960
           1       0.16      0.65      0.26        62

    accuracy                           0.78      1022
   macro avg       0.57      0.72      0.57      1022
weighted avg       0.92      0.78      0.83      1022



In [54]:
# Predict the class for a single hand-entered record. The values are passed
# positionally, so they must be in the same order as the columns the model was
# trained on.
# Other sample records tried previously (trailing annotations kept as written):
#input_data = (0,81,1,0,1,2,0,80.43,29.7,2) -1
#input_data = (0,79,0,0,1,2,1,110.85,24.1,1) -1
#input_data = (1,67,0,1,1,2,1,228.69,36.6,2) -wrong
input_data = (1,27,0,0,1,3,1,82.9,25,0)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array to a single row as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# A model fitted on a DataFrame records its column names and warns when
# predict() is given a bare array. Attach those names when they are available;
# a model fitted without names still receives the plain array as before.
feature_names = getattr(svm_model, 'feature_names_in_', None)
if feature_names is not None:
    input_features = pd.DataFrame(input_data_reshaped, columns=feature_names)
else:
    input_features = input_data_reshaped

prediction = svm_model.predict(input_features)
print(prediction)

# Label 0 is reported as the negative outcome, anything else as positive
if prediction[0] == 0:
    print('The person is not strok')
else:
    print('The person is strok')

[0]
The person is not strok




In [55]:
# Persist the trained model to disk, then read it back.
# Both files are opened in `with` blocks so the handles are closed (and the
# written bytes flushed) before the next step runs, instead of relying on the
# garbage collector to close them.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

# loading the saved model
# NOTE: unpickling can execute arbitrary code — only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)