In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib
import os

# Create directories if they don't exist
os.makedirs(r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python", exist_ok=True)

# Load data
file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Data\heart.csv"
df = pd.read_csv(file_path)

# Split features and target
X = df.drop("HeartDisease", axis=1)  # Features
y = df["HeartDisease"]              # Target

# One-hot encoding for categorical variables
X = pd.get_dummies(X, drop_first=True)

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model (optional, for verification purposes)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Add a 'Best Guess' column for the test data
y_proba = model.predict_proba(X_test)  # Get predicted probabilities
best_guess = y_proba.argmax(axis=1)   # Index of the max probability (class prediction)

# Create a DataFrame to save with the predictions and best guess
test_data_with_predictions = pd.DataFrame(X_test, columns=X.columns)
test_data_with_predictions['HeartDisease_Prediction'] = y_pred
test_data_with_predictions['Best_Guess'] = best_guess

# Save the predictions and best guess DataFrame to the specified folder
predictions_file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\test_predictions_with_best_guess.csv"
test_data_with_predictions.to_csv(predictions_file_path, index=False)
print(f"Predictions with 'Best Guess' saved to {predictions_file_path}")

# Save the trained model, scaler, and model columns
model_file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\heart_disease_model.pkl"
scaler_file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\scaler.pkl"
model_columns_file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\heart_model_columns.pkl"

# Save the model and scaler to disk
joblib.dump(model, model_file_path)
joblib.dump(scaler, scaler_file_path)

# Save the model columns (the feature names) to disk
joblib.dump(X.columns.tolist(), model_columns_file_path)

print("Model, scaler, and model columns saved successfully!")


Model Accuracy: 0.88
Predictions with 'Best Guess' saved to C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\test_predictions_with_best_guess.csv
Model, scaler, and model columns saved successfully!


In [23]:
import pandas as pd

# Load the predictions file
predictions_file_path = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\test_predictions_with_best_guess.csv"
predictions_df = pd.read_csv(predictions_file_path)

# Display the DataFrame (optional)
print(predictions_df.head())


        Age  RestingBP  Cholesterol  FastingBS     MaxHR   Oldpeak     Sex_M  \
0  1.006537   0.410909    -0.034755  -0.551341  1.658016 -0.832432 -1.938163   
1 -0.054192   0.681120     2.919751  -0.551341 -0.267596 -0.832432  0.515952   
2  1.218683   1.491752    -1.818435   1.813758 -0.581981  0.293283  0.515952   
3  0.264027  -0.129513    -1.818435  -0.551341 -0.581981  0.105664  0.515952   
4  0.051881  -1.318441     1.008012  -0.551341  0.754157 -0.832432  0.515952   

   ChestPainType_ATA  ChestPainType_NAP  ChestPainType_TA  RestingECG_Normal  \
0           2.075177          -0.532838         -0.229679           0.814275   
1          -0.481887           1.876744         -0.229679           0.814275   
2          -0.481887          -0.532838         -0.229679          -1.228087   
3          -0.481887          -0.532838         -0.229679          -1.228087   
4           2.075177          -0.532838         -0.229679           0.814275   

   RestingECG_ST  ExerciseAngina_Y  ST