In [1]:
import pandas as pd

# Load the stroke dataset. The path is relative to the notebook's working
# directory; pd.read_csv raises FileNotFoundError when the CSV is not there.
stroke_data = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Display the first few rows of the dataset to understand its structure and contents
# (head() instead of the bare frame, so the cell does not dump every row)
stroke_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'healthcare-dataset-stroke-data.csv'

In [None]:
# Per-column summary of the raw data: dtype and number of null entries
data_types = stroke_data.dtypes
missing_values = stroke_data.isna().sum()

# Show both summaries together as a (missing_values, data_types) tuple
(missing_values, data_types)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Build the working frame: the raw data without the 'id' column
data = stroke_data.drop(columns=['id'])

# Fill missing values in 'bmi' with the median of the column.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# data['bmi']: the in-place form operates on a temporary Series, which emits a
# FutureWarning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
bmi_median = stroke_data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)

# Nothing should be missing in 'bmi' after the imputation
assert data['bmi'].notna().all(), "bmi still contains missing values"

# Encoding categorical variables using Label Encoding.
# The same encoder object is refit for every column, so after the loop it
# only holds the class mapping of the last column.
label_encoder = LabelEncoder()
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Displaying the first few rows after encoding
data.head()

In [None]:
# Remove the encoded 'ever_married', 'work_type' and 'Residence_type' columns
# from the preprocessed frame and show the result
columns_to_drop = ['ever_married', 'work_type', 'Residence_type']
df = data.drop(columns=columns_to_drop)
df

In [None]:
# Relative frequency of each 'stroke' class, to see how imbalanced the target is
class_distribution = (
    df['stroke']
    .value_counts(normalize=True)
)
class_distribution


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separating the features and the target variable
X = df.drop('stroke', axis=1)
y = df['stroke']

# Split BEFORE oversampling. SMOTE builds synthetic minority samples by
# interpolating between neighbouring rows, so resampling the full dataset
# first would put points derived from test rows into the training set
# (data leakage) and would fill the test set with synthetic rows.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Applying SMOTE to the training portion only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Keep X_train / y_train pointing at the balanced training data;
# X_test / y_test keep the original (unbalanced) class distribution.
X_train, y_train = X_resampled, y_resampled

# Checking the balance in the resampled training data
balanced_target_distribution = pd.Series(y_resampled).value_counts(normalize=True)
balanced_target_distribution


In [None]:
# Features and target taken from the original (unbalanced) frame
y_original = df['stroke']
X_original = df.drop(columns='stroke')

# 70% train / 30% test split with a fixed seed
split_original = train_test_split(
    X_original, y_original, test_size=0.3, random_state=42)
X_train_original, X_test_original, y_train_original, y_test_original = split_original

# Shapes of the four pieces, in the order they were unpacked
tuple(part.shape for part in split_original)

In [None]:
# 70% train / 30% test split of the SMOTE output (X_resampled / y_resampled
# from the oversampling cell), stored under the *_balanced names.
# NOTE(review): the data was oversampled before this split, so the test part
# contains synthetic rows - confirm that is intended before reporting scores on it.
split_balanced = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
)
(X_train_balanced, X_test_balanced,
 y_train_balanced, y_test_balanced) = split_balanced

# Shapes of the balanced training and testing sets
tuple(part.shape for part in split_balanced)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Split the full feature matrix X and target y (defined in an earlier cell)
# into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the training set only; the test set is left untouched
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Perform feature selection with RFE on the balanced training data
selector = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=5, step=1)
selector = selector.fit(X_train_smote, y_train_smote)

# Keep only the columns RFE selected (selector.support_ is a boolean mask).
# The test set is reduced with the same mask; without this, rf_best would be
# trained on 5 columns but offered every column of X_test at prediction time.
X_train_selected = X_train_smote.iloc[:, selector.support_]
X_test_selected = X_test.iloc[:, selector.support_]

# Check that the feature set and the target set have the same number of samples
assert X_train_selected.shape[0] == y_train_smote.shape[0], "The number of samples should be the same"
# Train and test must expose the same selected features, in the same order
assert list(X_test_selected.columns) == list(X_train_selected.columns), \
    "Train and test sets must share the selected features"

# Grid search with 5-fold cross-validation over the Random Forest hyperparameters.
# NOTE(review): the folds are cut from already-oversampled data, so synthetic
# neighbours of a validation row can sit in the training folds and the CV
# scores are likely optimistic; judge the model on X_test_selected / y_test.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_selected, y_train_smote)

# Best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training set
# (refit=True is the default), using the same estimator and random_state, so
# reuse that fitted model instead of training an identical forest again.
rf_best = grid_search.best_estimator_

# rf_best can now be evaluated with X_test_selected and y_test


In [None]:
# NOTE(review): this entire cell is commented out. It looks like an earlier
# variant of the RFE + grid-search cell that fits on the unbalanced
# X_train / y_train (no SMOTE) and expects `param_grid` to be defined already.
# Kept for reference only - confirm whether it can be deleted.

# from sklearn.feature_selection import RFE
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split, GridSearchCV

# # Assuming X and y are your features and target variable from the dataset
# # Splitting the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Use RFE to select features
# selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)

# # Get the selected features
# selected_features = X_train.columns[selector.support_]

# # Define X_train_selected with the features selected by RFE
# X_train_selected = X_train[selected_features]

# # Now you can perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train_selected, y_train)

# # Best parameters
# best_params = grid_search.best_params_

# # Train the Random Forest model with the best parameters
# rf_best = RandomForestClassifier(**best_params)
# rf_best.fit(X_train_selected, y_train)
