In [3]:
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.exceptions import NotFittedError

class PrognosisRecommender:
    """
    Prognosis helper that pairs a risk model with a similar-case lookup.

    A RandomForest classifier estimates the probability that ``Status`` is
    ``'Dead'``; a NearestNeighbors index retrieves the ``k_neighbors`` most
    similar patients from the fitted data together with their recorded
    outcomes (``Status`` and ``Survival Months``).
    """

    def __init__(self, k_neighbors=5):
        """
        Creates an unfitted recommender.

        k_neighbors: number of similar patients returned by get_prognosis().
        """
        self.k_neighbors = k_neighbors
        self.preprocessor = None
        self.rf_model = None
        self.knn_model = None
        self.fitted_data_X = None
        self.fitted_data_y = None
        self.feature_names = []

        # Declared up front so every attribute exists before fit() runs.
        self.target_col = 'Status'
        self.outcome_cols = ['Status', 'Survival Months']
        self.all_features = []
        self.numerical_features = []
        self.categorical_features = []

    def _load_and_clean_data(self, csv_data):
        """
        Loads the CSV, cleans headers, and records the feature columns.

        csv_data: anything ``pandas.read_csv`` accepts (a file path or a
                  file-like buffer).

        Returns the cleaned DataFrame; the outcome columns are still present
        in it, they are only excluded from ``self.all_features``.
        """
        df = pd.read_csv(csv_data)

        # A stray ",," in the header produces an auto-named "Unnamed: N"
        # column with no values. Drop such columns by content instead of by
        # position, so a file without the stray column keeps all its features.
        empty_cols = [
            col for col in df.columns
            if str(col).startswith('Unnamed:') and df[col].isna().all()
        ]
        df = df.drop(columns=empty_cols)

        # Fix the typo in the header
        df = df.rename(columns={"Reginol Node Positive": "Regional Node Positive"})

        # Define target and features
        self.target_col = 'Status'
        self.outcome_cols = ['Status', 'Survival Months']

        # 'Survival Months' is an OUTCOME, not a feature.
        # Using it to predict 'Status' is data leakage.
        self.all_features = [col for col in df.columns if col not in self.outcome_cols]

        return df

    def _build_preprocessor(self, X):
        """
        Defines the ColumnTransformer to handle mixed data types.

        X: DataFrame holding only the feature columns.

        Numeric columns are standardised; every other column is one-hot
        encoded. The result is stored in ``self.preprocessor`` (unfitted).
        """
        # Identify numerical and categorical features. Everything that is not
        # numeric is treated as categorical, so no feature is silently dropped
        # by the ColumnTransformer regardless of how pandas stores strings
        # (object dtype vs. a dedicated string dtype).
        self.numerical_features = X.select_dtypes(include=np.number).columns.tolist()
        self.categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

        # Create transformers
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Create the main preprocessor
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

    def _encoded_feature_names(self):
        """
        Returns the column names of the fitted preprocessor's output.

        The order matches the ColumnTransformer output: numeric features
        first, then the one-hot encoded categories. Returns an empty list if
        the installed encoder exposes no way to name its output columns.
        """
        if not self.categorical_features:
            # The 'cat' transformer is never fitted on an empty selection.
            return list(self.numerical_features)

        onehot = self.preprocessor.named_transformers_['cat']['onehot']
        # Newer scikit-learn uses get_feature_names_out; older releases only
        # offer get_feature_names.
        for method_name in ('get_feature_names_out', 'get_feature_names'):
            namer = getattr(onehot, method_name, None)
            if namer is not None:
                return self.numerical_features + list(namer(self.categorical_features))
        return []

    def fit(self, csv_data):
        """
        Fits the preprocessor, RandomForest, and KNN models on the data.

        csv_data: file path or buffer readable by ``pandas.read_csv``.

        Raises ValueError if the target column contains anything other than
        'Dead' or 'Alive'.
        """
        # Start from an unfitted state so a failure part-way through cannot
        # leave a new preprocessor paired with models from an earlier fit.
        self.rf_model = None
        self.knn_model = None
        self.feature_names = []

        # 1. Load and Clean
        df = self._load_and_clean_data(csv_data)

        X = df[self.all_features]
        # Encode target: Dead=1, Alive=0
        y = df[self.target_col].map({'Dead': 1, 'Alive': 0})
        unmapped = y.isna()
        if unmapped.any():
            bad_values = sorted(df.loc[unmapped, self.target_col].astype(str).unique())
            raise ValueError(
                f"Column '{self.target_col}' must contain only 'Dead' or 'Alive'; "
                f"found {bad_values}"
            )
        y = y.astype(int)

        # 2. Build and fit the preprocessor
        self._build_preprocessor(X)
        X_processed = self.preprocessor.fit_transform(X)

        # Get feature names after one-hot encoding
        self.feature_names = self._encoded_feature_names()

        # 3. Train the Prognosis Model (RandomForest)
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_processed, y)

        # 4. Train the Similarity Model (NearestNeighbors)
        knn_model = NearestNeighbors(n_neighbors=self.k_neighbors)
        knn_model.fit(X_processed)

        # Publish the models only once both have been trained successfully.
        self.rf_model = rf_model
        self.knn_model = knn_model

        # 5. Store data for lookup (row positions match X_processed)
        self.fitted_data_X = X
        self.fitted_data_y = df[self.outcome_cols]

        print("Prognosis Recommender system is trained and ready.")

    def get_prognosis(self, new_patient_data):
        """
        Provides a full prognosis report for a new patient.

        new_patient_data: A dictionary with patient features. It must contain
                          every column listed in ``self.all_features``.

        Returns a tuple ``(risk_prob, similar_patients_report)``: the
        predicted probability of 'Dead' and a DataFrame with the nearest
        fitted patients (features plus outcome columns).

        Raises NotFittedError if fit() has not completed, and KeyError if a
        required feature is missing from ``new_patient_data``.
        """
        if self.preprocessor is None or self.rf_model is None or self.knn_model is None:
            raise NotFittedError("Must call .fit() before .get_prognosis()")

        # Convert dict to DataFrame
        new_patient_df = pd.DataFrame([new_patient_data])

        missing = [col for col in self.all_features if col not in new_patient_df.columns]
        if missing:
            raise KeyError(f"new_patient_data is missing required features: {missing}")

        # Ensure column order matches
        new_patient_df = new_patient_df[self.all_features]

        # 1. Process the new patient's data
        patient_processed = self.preprocessor.transform(new_patient_df)

        # 2. Get Risk Score from RandomForest
        # predict_proba columns follow classes_; look up the 'Dead' (1) column
        # explicitly in case the training data contained a single class only.
        probabilities = self.rf_model.predict_proba(patient_processed)[0]
        classes = list(self.rf_model.classes_)
        risk_prob = float(probabilities[classes.index(1)]) if 1 in classes else 0.0

        # 3. Find Similar Patients
        distances, indices = self.knn_model.kneighbors(patient_processed)

        # Get the positions of the k nearest neighbors
        neighbor_indices = indices[0]

        # Look up these patients in the original dataset
        similar_patients_X = self.fitted_data_X.iloc[neighbor_indices]
        similar_patients_y = self.fitted_data_y.iloc[neighbor_indices]

        similar_patients_report = pd.concat([similar_patients_X, similar_patients_y], axis=1)

        return risk_prob, similar_patients_report

    def get_global_feature_importance(self):
        """
        Returns the most important features for the prognosis model.

        Returns a DataFrame with columns 'Feature' and 'Importance', sorted
        by descending importance. If the model has not been fitted or the
        feature names are unavailable, a plain message string is returned
        instead, so callers should check the type before using it.
        """
        if self.rf_model is not None and self.feature_names:
            importances = self.rf_model.feature_importances_
            feature_imp_df = pd.DataFrame(list(zip(self.feature_names, importances)),
                                          columns=['Feature', 'Importance'])
            return feature_imp_df.sort_values('Importance', ascending=False)
        return "Model not fitted or feature names not available."

# --- Main execution ---
if __name__ == "__main__":
    # Demo run: train on the SEER breast cancer CSV, score one hand-written
    # patient profile, list the most similar fitted patients and show the
    # globally most important model features.

    # 1. Initialize and train the system
    n_similar = 5
    recommender = PrognosisRecommender(k_neighbors=n_similar)
    recommender.fit(csv_data="/mnt/10EE4B76EE4B5360/College/pccoe/7th Sem/RS/RS-A4_SEER Breast Cancer Dataset .csv")

    # 2. Create a new patient profile for testing
    # (We'll use a modified version of the first row)
    new_patient = {
        'Age': 45,
        'Race': 'White',
        'Marital Status': 'Married (including common law)',
        'T Stage ': 'T2',  # Note the space in 'T Stage '
        'N Stage': 'N3',
        '6th Stage': 'IIIC',
        'Grade': 'Poorly differentiated; Grade III',
        'A Stage': 'Regional',
        'Tumor Size': 50,
        'Estrogen Status': 'Positive',
        'Progesterone Status': 'Positive',
        'Regional Node Examined': 20,
        'Regional Node Positive': 15,
    }

    # 3. Get the prognosis
    risk_score, similar_patients = recommender.get_prognosis(new_patient)

    # 4. Print the report
    print("\n--- Prognosis Report for New Patient ---")
    print(f"Patient Profile: Age={new_patient['Age']}, T-Stage={new_patient['T Stage ']}, N-Stage={new_patient['N Stage']}, Tumor Size={new_patient['Tumor Size']}")
    print("-" * 40)

    print("\n[Prognosis Model (Random Forest)]")
    print(f"Predicted Risk of Mortality: {risk_score * 100:.2f}%")

    print("\n[Similarity Model (k-Nearest Neighbors)]")
    # Report the number of rows actually returned instead of a fixed "5",
    # so the message stays correct if k_neighbors is changed.
    print(f"Found {len(similar_patients)} similar patients in the database:")

    # Define columns to show for similarity
    display_cols = ['Age', 'T Stage ', 'N Stage', 'Tumor Size', 'Grade', 'Status', 'Survival Months']
    print(similar_patients[display_cols].to_string(index=False))

    # 5. Show Global Feature Importance
    print("\n" + "-" * 40)
    print("\nGlobal Feature Importance (Top 5):")
    print("(These are the most important features *for the model* across all patients)")
    importance = recommender.get_global_feature_importance()
    if isinstance(importance, pd.DataFrame):
        print(importance.head(5).to_string(index=False))
    else:
        # The recommender returns a plain message when importances are
        # unavailable; print it rather than calling DataFrame methods on it.
        print(importance)

Prognosis Recommender system is trained and ready.

--- Prognosis Report for New Patient ---
Patient Profile: Age=45, T-Stage=T2, N-Stage=N3, Tumor Size=50
----------------------------------------

[Prognosis Model (Random Forest)]
Predicted Risk of Mortality: 24.00%

[Similarity Model (k-Nearest Neighbors)]
Found 5 similar patients in the database:
 Age T Stage  N Stage  Tumor Size                            Grade Status  Survival Months
  43       T2      N3          35 Poorly differentiated; Grade III  Alive               65
  41       T2      N3          42 Poorly differentiated; Grade III  Alive               81
  53       T2      N3          30 Poorly differentiated; Grade III  Alive              101
  55       T2      N3          43 Poorly differentiated; Grade III   Dead               13
  48       T2      N3          21 Poorly differentiated; Grade III  Alive               33

----------------------------------------

Global Feature Importance (Top 5):
(These are the most impo