In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib

# Train a Random Forest heart-disease classifier from heart.csv, write the
# held-out predictions to CSV, and persist the fitted model and scaler.

# All inputs and outputs live in this one folder; keeping it in a single
# constant avoids repeating (and mistyping) the absolute path.
BASE_DIR = r"C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python"

# Load data
file_path = rf"{BASE_DIR}\heart.csv"
df = pd.read_csv(file_path)

# Split features and target
X = df.drop("HeartDisease", axis=1)  # Features
y = df["HeartDisease"]               # Target

# One-hot encode categorical variables; drop_first removes one dummy per
# category so the encoded columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# Train-test split BEFORE scaling. Fitting the scaler on the full dataset
# would let test-set statistics (mean/variance) leak into training and
# make the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept under the
# original name in case a later notebook cell references it.
X_scaled = scaler.transform(X)

# Train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 'Best Guess' = most probable class per test row. predict_proba columns
# are ordered like model.classes_, so map the argmax column index back to
# the actual class label instead of storing the raw index.
y_proba = model.predict_proba(X_test)
best_guess = model.classes_[y_proba.argmax(axis=1)]

# Assemble the (scaled) test features together with both prediction columns
test_data_with_predictions = pd.DataFrame(X_test, columns=X.columns)
test_data_with_predictions['HeartDisease_Prediction'] = y_pred
test_data_with_predictions['Best_Guess'] = best_guess

# Save the predictions and best guess DataFrame to the project folder
predictions_file_path = rf"{BASE_DIR}\test_predictions_with_best_guess.csv"
test_data_with_predictions.to_csv(predictions_file_path, index=False)
print(f"Predictions with 'Best Guess' saved to {predictions_file_path}")

# Save the model and the scaler together: inference code must apply this
# exact scaler to new inputs before calling the model.
joblib.dump(model, rf"{BASE_DIR}\heart_disease_model.pkl")
joblib.dump(scaler, rf"{BASE_DIR}\scaler.pkl")
print("Model and scaler saved successfully!")


Model Accuracy: 0.88
Predictions with 'Best Guess' saved to C:\Capstone Project\Module 1\Data-Analytics-Capstone\Shiny App\rsconnect-python\test_predictions_with_best_guess.csv
Model and scaler saved successfully!
