In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression  # <-- CHANGED
from sklearn.metrics import mean_squared_error

print("--- Smurf Heart Failure Project: Part 1 (Simple Linear Regression) ---")

# Directory (relative to the current working directory) holding the labeled CSV splits.
DATA_DIR = "data/data_labeled"

# Name given to the single column of the header-less y_*.csv files.
TARGET_COLUMN = 'heart_failure_risk'

# Replace spaces with underscores in these column names. Defined once so the
# training and test sets are always renamed identically.
# NOTE(review): 'smurfin_dots' looks like a typo for 'smurfin_donuts'. It is kept
# as-is because the preprocessor saved below is fitted on a frame carrying this
# column name, so consumers of 'smurf_preprocessor.joblib' presumably rely on the
# same spelling — confirm before renaming.
COLUMN_RENAMES = {
    'smurfberry liquor': 'smurfberry_liquor',
    'smurfin donuts': 'smurfin_dots',
}


def load_split(split, data_dir=DATA_DIR):
    """Read the feature and target CSVs of one data split.

    Args:
        split: Split name used in the file names, e.g. ``"train"`` reads
            ``X_train.csv`` and ``y_train.csv``.
        data_dir: Directory containing the CSV files.

    Returns:
        A ``(df_X, df_y)`` tuple of DataFrames. ``df_y`` is read without a
        header row and has a single column named ``TARGET_COLUMN``.

    Raises:
        FileNotFoundError: If either CSV file does not exist.
    """
    df_X = pd.read_csv(f"{data_dir}/X_{split}.csv")
    df_y = pd.read_csv(f"{data_dir}/y_{split}.csv", header=None, names=[TARGET_COLUMN])
    return df_X, df_y


def missing_file_message(err, data_dir=DATA_DIR):
    """Build the user-facing hint printed when an input CSV is missing.

    Args:
        err: The caught ``FileNotFoundError``.
        data_dir: Directory the CSV files are expected in.

    Returns:
        A two-line message naming the missing path and the expected directory.
    """
    # Fall back to the exception text if the OS error carries no filename.
    missing = err.filename or err
    return (
        f"\nERROR: File not found: {missing}\n"
        "Please make sure X_train.csv, y_train.csv, X_test.csv, and y_test.csv "
        f"are in '{data_dir}/' (relative to the current working directory)."
    )


try:
    # === 1. LOAD TRAINING DATA ===
    print("Loading training data (X_train.csv, y_train.csv)...")
    df_X_train, df_y_train = load_split("train")

    # Flatten y_train to the (n_samples,) shape sklearn expects
    y_train = df_y_train[TARGET_COLUMN].values

    # Clean column names by replacing spaces with underscores
    df_X_train_renamed = df_X_train.rename(columns=COLUMN_RENAMES)

    # === 2. DEFINE PREPROCESSING PIPELINE ===
    print("Building preprocessing pipeline...")

    # Define which columns go into which transformer
    numerical_features = [
        'age', 'blood pressure', 'calcium', 'cholesterol', 'hemoglobin',
        'height', 'potassium', 'vitamin D', 'weight'
    ]

    ordinal_features = ['sarsaparilla', 'smurfberry_liquor', 'smurfin_dots']

    nominal_features = ['profession']

    # Lowest-to-highest order shared by every ordinal feature
    ordinal_categories = [
        'Very low', 'Low', 'Moderate', 'High', 'Very high'
    ]
    # OrdinalEncoder takes one category list per encoded column
    categories_list = [ordinal_categories] * len(ordinal_features)

    # Create the individual transformers.
    # NOTE(review): no imputation step — assumes the CSVs contain no missing
    # values; confirm against the data.
    numeric_transformer = StandardScaler()
    ordinal_transformer = OrdinalEncoder(categories=categories_list)
    nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # Build the master ColumnTransformer.
    # remainder='drop' discards every column not listed above (presumably
    # 'img_filename' — verify against the CSV header).
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('ord', ordinal_transformer, ordinal_features),
            ('nom', nominal_transformer, nominal_features)
        ],
        remainder='drop'
    )

    # === 3. FIT PREPROCESSOR AND TRAIN MODEL ===
    print("Fitting preprocessor on training data...")
    X_train_processed = preprocessor.fit_transform(df_X_train_renamed)

    print("Training Linear Model (LinearRegression)...")

    # Use a standard LinearRegression model
    model = LinearRegression()

    model.fit(X_train_processed, y_train)

    print("...Model training complete.")

    # === 4. LOAD AND PROCESS TEST DATA ===
    print("Loading test data (X_test.csv, y_test.csv)...")
    df_X_test, df_y_test = load_split("test")
    y_true = df_y_test[TARGET_COLUMN].values

    # Apply the SAME column renaming to the test set
    df_X_test_renamed = df_X_test.rename(columns=COLUMN_RENAMES)

    print("Applying fitted preprocessor to test data...")
    # CRITICAL: Use .transform() only. DO NOT .fit() on test data.
    X_test_processed = preprocessor.transform(df_X_test_renamed)

    # === 5. MAKE PREDICTIONS AND EVALUATE ===
    print("Making predictions on training set...")
    y_pred_train = model.predict(X_train_processed)

    print("Making predictions on test set...")
    y_pred_test = model.predict(X_test_processed)

    # Calculate RMSE for both training and test sets
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_true, y_pred_test))

    print("\n" + "="*50)
    print("     PART 1 BASELINE MODEL EVALUATION (LinearRegression)")
    print(f"Training Set RMSE: {rmse_train:.6f}")
    print(f"Test Set RMSE: {rmse_test:.6f}")
    print("="*50 + "\n")

    # === 6. SAVE THE FINAL PIPELINE AND MODEL ===
    # These files are now ready for Part 2
    joblib.dump(preprocessor, 'smurf_preprocessor.joblib')
    joblib.dump(model, 'linear_model_simple.joblib')
    print("Saved final preprocessor to 'smurf_preprocessor.joblib'")
    print("Saved final linear model to 'linear_model_simple.joblib'")

except FileNotFoundError as e:
    # Name the missing path and the directory the files are actually read from.
    print(missing_file_message(e))
except Exception as e:
    # Top-level boundary of the notebook cell: report the problem instead of
    # dumping a traceback.
    print(f"\nAn error occurred: {e}")
    print("This might be due to unexpected values in the CSV files (e.g., a new 'profession' or a different ordinal value).")

--- Smurf Heart Failure Project: Part 1 (Simple Linear Regression) ---
Loading training data (X_train.csv, y_train.csv)...
Building preprocessing pipeline...
Fitting preprocessor on training data...
Training Linear Model (LinearRegression)...
...Model training complete.
Loading test data (X_test.csv, y_test.csv)...
Applying fitted preprocessor to test data...
Making predictions on training set...
Making predictions on test set...

     PART 1 BASELINE MODEL EVALUATION (LinearRegression)
Training Set RMSE: 0.054224
Test Set RMSE: 0.055820

Saved final preprocessor to 'smurf_preprocessor.joblib'
Saved final linear model to 'linear_model_simple.joblib'
