## Saving Training and Testing Sets

This section saves the split training and testing sets into separate CSV files.

In [None]:
# Map each output CSV to the split it stores (training files first, then testing)
split_exports = {
    'X_train.csv': X_train,
    'y_train.csv': y_train,
    'X_test.csv': X_test,
    'y_test.csv': y_test,
}

# Write every split to its own CSV file; index=False keeps the row index out of the file
for csv_name, split in split_exports.items():
    split.to_csv(csv_name, index=False)

# Confirm all four exports only after every write has completed
print(
    "\nTraining features saved as 'X_train.csv'",
    "Training target saved as 'y_train.csv'",
    "Testing features saved as 'X_test.csv'",
    "Testing target saved as 'y_test.csv'",
    sep="\n",
)


Training features saved as 'X_train.csv'
Training target saved as 'y_train.csv'
Testing features saved as 'X_test.csv'
Testing target saved as 'y_test.csv'


In [None]:
# Write the final processed DataFrame to CSV; index=False keeps the row index out of the file
processed_csv = 'stroke_processed_pca.csv'
df_final_pca.to_csv(processed_csv, index=False)

print("Processed dataset saved as 'stroke_processed_pca.csv'")

Processed dataset saved as 'stroke_processed_pca.csv'


In [None]:
import os
import shutil

# Destination directory that collects every exported CSV
output_folder = 'Processed'

# Make sure the destination exists, reporting whether it had to be created
if os.path.exists(output_folder):
    print(f"Folder '{output_folder}' already exists.")
else:
    os.makedirs(output_folder)
    print(f"Created folder: {output_folder}")

# CSV files to relocate, in the order they are processed
files_to_move = [
    'stroke_processed_pca.csv',
    'X_train.csv',
    'y_train.csv',
    'X_test.csv',
    'y_test.csv',
]

for file_name in files_to_move:
    # Report a missing file and carry on instead of aborting the whole cell
    if not os.path.exists(file_name):
        print(f"File '{file_name}' not found.")
        continue

    # Keep the same file name inside the destination folder
    destination = os.path.join(output_folder, file_name)
    shutil.move(file_name, destination)
    print(f"Moved '{file_name}' to '{output_folder}'")

Folder 'Processed' already exists.
Moved 'stroke_processed_pca.csv' to 'Processed'
Moved 'X_train.csv' to 'Processed'
Moved 'y_train.csv' to 'Processed'
Moved 'X_test.csv' to 'Processed'
Moved 'y_test.csv' to 'Processed'


## Summary:

### Data Analysis Key Findings

*   The dataset initially contained 201 missing values in the 'bmi' column, which were successfully imputed with the mean value of approximately 28.89.
*   Categorical variables ('gender', 'ever\_married', 'work\_type', 'Residence\_type', 'smoking\_status') were successfully encoded using one-hot encoding, resulting in the creation of new binary columns.
*   Outliers were identified and handled in the 'avg\_glucose\_level' and 'bmi' columns using the IQR method, capping extreme values. 627 outliers were capped in 'avg\_glucose\_level' and 126 in 'bmi'.
*   Numerical features ('id', 'age', 'avg\_glucose\_level', 'bmi') were scaled using `StandardScaler`, resulting in features with means close to 0 and standard deviations close to 1.
*   Feature selection using a RandomForestClassifier identified 'age', 'bmi', and 'avg\_glucose\_level' as the most important features, and a total of 13 features were selected based on an importance threshold of 0.01.
*   Principal Component Analysis (PCA) was applied to the 13 selected features, and it was determined that 10 principal components retain at least 95% of the data's variance, successfully reducing the dimensionality from 13 to 10.




# **This code deletes the processed data when troubleshooting**

In [None]:
# NOTE: This cleanup cell is disabled (every line is commented out); it is destructive when enabled.
# When uncommented, it deletes every file (os.remove) and every subfolder (shutil.rmtree)
# found directly inside `folder_path`; the folder itself is left in place.
# Double-check `folder_path` before enabling it -- the deletions cannot be undone.

# import os
# import shutil

# folder_path = '/content/Delete Stuff'

# # Check if the folder exists
# if os.path.exists(folder_path):
#     # List all items in the folder
#     for item in os.listdir(folder_path):
#         item_path = os.path.join(folder_path, item)
#         # Check if it's a file and remove it
#         if os.path.isfile(item_path):
#             os.remove(item_path)
#             print(f"Deleted file: {item_path}")
#         # Check if it's a directory and remove it recursively
#         elif os.path.isdir(item_path):
#             shutil.rmtree(item_path)
#             print(f"Deleted directory: {item_path}")
#     print(f"All files and subfolders in '{folder_path}' have been deleted.")
# else:
#     print(f"Folder '{folder_path}' does not exist.")

All files and subfolders in '/content/Delete Stuff' have been deleted.
