In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 1. Read the pre-processed dataset from disk
df_ml = pd.read_csv('breast_cancer_ready_for_ml.csv')

# 2. Leakage columns (keep this list in sync with Notebook 04)
# Any column whose name contains one of these keywords would "give away" the
# answer, so it is excluded to simulate a real prediction scenario.
leakage_cols = [
    col
    for col in df_ml.columns
    if any(keyword in col for keyword in ('Site', 'Recurrence', 'Death', 'Survival'))
]

# 3. Target (y) is Sample_Type; features (X) are every remaining non-leakage column
y = df_ml['Sample_Type']
X = df_ml.drop(columns=['Sample_Type', *leakage_cols])

# 4. Stratified 80/20 split; the fixed random_state=42 reproduces the earlier split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5. Re-fit the "Realistic" model (fit() returns the fitted estimator itself)
final_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

print("Setup Complete! Model re-trained and data loaded.")
print(f"Features expected by model: {len(X_train.columns)}")

Setup Complete! Model re-trained and data loaded.
Features expected by model: 491


In [2]:
# Select a patient from the test set.
# The draw uses its own seeded generator so that Restart & Run All always
# inspects the same patient; change PATIENT_SEED to look at a different one.
PATIENT_SEED = 42
patient_rng = np.random.default_rng(PATIENT_SEED)
random_index = int(patient_rng.integers(0, len(X_test)))  # upper bound is exclusive

# Use .iloc with a list to get the row as a one-row DataFrame (keeps column names)
sample_patient = X_test.iloc[[random_index]]
actual_status = y_test.iloc[random_index]

# Make prediction
prediction = final_rf.predict(sample_patient)[0]

# predict_proba columns follow final_rf.classes_, so look up the column that
# belongs to label 1 (metastasis) instead of assuming it sits at position 1.
metastasis_col = list(final_rf.classes_).index(1)
probability = final_rf.predict_proba(sample_patient)[0][metastasis_col]

print(f"--- Patient Check (Index {random_index}) ---")
print(f"Actual Status:   {'Metastasis (1)' if actual_status == 1 else 'Primary (0)'}")
print(f"Model Predicted: {'Metastasis (1)' if prediction == 1 else 'Primary (0)'}")
print(f"Risk Probability: {probability*100:.2f}%")

# NOTE(review): the ~85% accuracy quoted below is hard-coded, not computed
# here - confirm it against the evaluation notebook.
if actual_status == prediction:
    print("Prediction matches reality.")
else:
    print("Prediction incorrect (This happens, accuracy is ~85%).")

--- Patient Check (Index 47) ---
Actual Status:   Metastasis (1)
Model Predicted: Metastasis (1)
Risk Probability: 82.97%
Prediction matches reality.


In [3]:
# Create a blank template with the same columns as training data.
# Every feature starts at 0; only the overrides below are changed.
new_patient = pd.DataFrame(0, index=[0], columns=X_train.columns)

# --- INPUT CLINICAL DATA HERE ---
# Let's simulate a high-risk patient
profile_overrides = {
    'Invasive_Carcinoma_Diagnosis_Age': 40,   # Young age is often higher risk
    'Mutation_Count': 20,                     # High mutation load
    'Overall_Primary_Tumor_Grade_High': 1,    # High grade tumor
}

# Apply each override only if the model knows the feature, and remember the
# ones that were skipped so the printed profile cannot silently misreport
# what was actually fed to the model.
missing_features = []
for feature_name, feature_value in profile_overrides.items():
    if feature_name in new_patient.columns:
        new_patient[feature_name] = feature_value
    else:
        missing_features.append(feature_name)

# Make prediction
pred_new = final_rf.predict(new_patient)[0]

# predict_proba columns follow final_rf.classes_, so look up the column that
# belongs to label 1 (metastasis) instead of assuming it sits at position 1.
metastasis_col = list(final_rf.classes_).index(1)
prob_new = final_rf.predict_proba(new_patient)[0][metastasis_col]

print("\n--- Hypothetical Patient Prediction ---")
print("Patient Profile: Age=40, Mutations=20, High Grade Tumor")
if missing_features:
    # Only shown when an override could not be applied; normal output is unchanged.
    print(f"WARNING: not in training columns, left at 0: {', '.join(missing_features)}")
print(f"Predicted Class: {pred_new} ({'Metastasis' if pred_new == 1 else 'Primary'})")
print(f"Probability of Metastasis: {prob_new*100:.2f}%")


--- Hypothetical Patient Prediction ---
Patient Profile: Age=40, Mutations=20, High Grade Tumor
Predicted Class: 0 (Primary)
Probability of Metastasis: 33.45%
