In [1]:
# --- Limit native thread pools BEFORE importing the numerical libraries ---
# OMP_NUM_THREADS / OPENBLAS_NUM_THREADS are normally read when the native
# OpenMP/BLAS libraries are loaded, i.e. when NumPy/SciPy/scikit-learn are first
# imported. Setting them after those imports usually has no effect, so `os` is
# imported and the variables are set first (a deliberate exception to the
# "all imports at the top" convention).
# This often resolves 'NoneType' object has no attribute 'split' errors
# by forcing single-threaded operation for underlying numerical libraries.
# NOTE(review): in a notebook the libraries may already be loaded by an earlier
# cell; restart the kernel for these limits to apply.
import os  # Import os module to set environment variables

os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# You can also try setting MKL_NUM_THREADS if you suspect MKL is involved
# os.environ['MKL_NUM_THREADS'] = '1'

# Import necessary libraries
import joblib # For saving and loading models
import pandas as pd
from imblearn.over_sampling import SMOTE # Import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# --- 1. Load the dataset ---
# The dataset 'parkinsons.csv' is assumed to be available in the environment.
try:
    # Only the read lives inside the try block, so a FileNotFoundError raised
    # by anything else is never misreported as a missing dataset.
    df = pd.read_csv('/content/parkinsons.csv')
except FileNotFoundError:
    print("Error: 'parkinsons.csv' not found. Please ensure the file is uploaded correctly.")
    # Raise SystemExit rather than calling exit(): exit() is a helper meant for
    # the interactive interpreter and is not guaranteed to exist or to halt
    # execution in every environment, which would let the code below run with
    # `df` undefined.
    raise SystemExit(1) from None

print("Dataset loaded successfully.")
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset information:")
df.info()

# --- 2. Data Preprocessing ---

# Separate the label from the inputs.
# 'status' is used as the target (typically 1 for Parkinson's, 0 for healthy);
# 'name' is treated as a row identifier and is excluded from the features.
# Every remaining column becomes a feature.
y = df['status']
X = df.drop(columns=['status', 'name'])

# Note: no missing-value handling is performed here. If your copy of
# 'parkinsons.csv' contains NaN values, add imputation before this point
# (e.g. df.fillna(df.mean()), or a more advanced strategy).

# Function to apply preprocessing steps (outlier treatment, scaling)
# This function will be used for both training and new data prediction
def _compute_iqr_bounds(frame):
    """Return {column: {'lower': ..., 'upper': ...}} using the 1.5 * IQR rule."""
    bounds = {}
    for column in frame.columns:
        q1 = frame[column].quantile(0.25)
        q3 = frame[column].quantile(0.75)
        iqr = q3 - q1
        bounds[column] = {'lower': q1 - 1.5 * iqr, 'upper': q3 + 1.5 * iqr}
    return bounds


def preprocess_data(data_df, scaler=None, is_training=True, train_iqr_bounds=None):
    """Cap outliers with IQR bounds, then standardize every column.

    Args:
        data_df: DataFrame of numeric feature columns. It is not modified;
            the function works on a copy.
        scaler: Fitted scaler exposing ``transform``. Required when
            ``is_training`` is False; ignored (a new StandardScaler is
            fitted) when ``is_training`` is True.
        is_training: When True, IQR bounds and scaler statistics are learned
            from ``data_df``. When False, the supplied ``train_iqr_bounds``
            and ``scaler`` are applied unchanged.
        train_iqr_bounds: Mapping ``{column: {'lower': x, 'upper': y}}``
            produced by a previous training call. Required when
            ``is_training`` is False.

    Returns:
        Training mode: ``(scaled_df, fitted_scaler, iqr_bounds)``.
        Inference mode: ``scaled_df`` only.
        ``scaled_df`` keeps the columns and the row index of ``data_df``.

    Raises:
        ValueError: In inference mode, if ``train_iqr_bounds`` or ``scaler``
            is not provided.
        KeyError: In inference mode, if a column of ``data_df`` has no entry
            in ``train_iqr_bounds``.
    """
    # Validate the inference-mode arguments up front so nothing is computed
    # before a configuration error is reported.
    if not is_training:
        if train_iqr_bounds is None:
            raise ValueError("train_iqr_bounds must be provided for new data preprocessing.")
        if scaler is None:
            raise ValueError("Scaler must be provided for new data preprocessing.")

    processed_df = data_df.copy()

    # Outlier Treatment (Capping using IQR method)
    # For new data, reuse the bounds calculated from the training data.
    if is_training:
        current_iqr_bounds = _compute_iqr_bounds(processed_df)
    else:
        current_iqr_bounds = train_iqr_bounds

    for column in processed_df.columns:
        limits = current_iqr_bounds[column]
        processed_df[column] = processed_df[column].clip(
            lower=limits['lower'], upper=limits['upper']
        )

    # Feature Scaling
    if is_training:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(processed_df)
    else:
        scaled_data = scaler.transform(processed_df)

    # Keep the caller's row index: dropping it would silently misalign the
    # features with their labels whenever the input is not 0..n-1 indexed
    # (for example a subset produced by train_test_split).
    processed_df_scaled = pd.DataFrame(
        scaled_data, columns=processed_df.columns, index=processed_df.index
    )

    if is_training:
        return processed_df_scaled, scaler, current_iqr_bounds
    return processed_df_scaled


# --- 3. Split the data into training and testing sets, then preprocess ---
# We'll use 80% for training and 20% for testing.
# The split happens BEFORE preprocessing: the IQR caps and the scaler
# statistics must be learned from the training rows only. Fitting them on the
# full dataset would leak information from the test rows into the model and
# make the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Give every split a clean 0..n-1 index so features and labels stay aligned
# by label as well as by position after preprocessing.
X_train_raw = X_train_raw.reset_index(drop=True)
X_test_raw = X_test_raw.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Preprocess training data (learns the IQR bounds and fits the scaler)
print("\nApplying preprocessing steps to training data...")
X_train, scaler, train_iqr_bounds = preprocess_data(X_train_raw, is_training=True)

# Apply the training-derived bounds and scaler to the held-out test rows
X_test = preprocess_data(
    X_test_raw,
    scaler=scaler,
    is_training=False,
    train_iqr_bounds=train_iqr_bounds
)

# Later code reads X_processed.columns to align new patient data, so the name
# is kept and now refers to the processed training features.
X_processed = X_train

print("Outliers capped for all features.")

print("\nFirst 5 rows of scaled features (after outlier treatment):")
print(X_processed.head())

print(f"\nOriginal Training set size: {X_train.shape[0]} samples")
print(f"Original Testing set size: {X_test.shape[0]} samples")
print(f"Original Outcome distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Original Outcome distribution in testing set:\n{y_test.value_counts(normalize=True)}")

# --- 4. Apply SMOTE to the training data ---
print("\nApplying SMOTE to balance the training data...")
# Only the training split is oversampled; the test split is left untouched.
# Threading is expected to be limited by the environment variables set at the
# top of the script.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Report the new size and class proportions of the training set.
resampled_distribution = y_train_resampled.value_counts(normalize=True)
print(f"\nResampled Training set size: {X_train_resampled.shape[0]} samples")
print(f"Resampled Outcome distribution in training set:\n{resampled_distribution}")


# --- 5. Train an SVM Classifier with Enhanced Hyperparameter Tuning on Resampled Data ---
print("\nStarting Enhanced Hyperparameter Tuning for SVM on resampled data (this might take a moment)...")

# Search space: RBF kernel only, with a wide sweep over C and gamma.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# Base estimator; probability=True is needed so predict_proba is available later.
svm_estimator = SVC(random_state=42, class_weight='balanced', probability=True)

# 5-fold grid search scored by weighted F1. n_jobs=1 keeps the search itself
# single-process, in line with the single-threaded setup at the top.
# NOTE(review): the folds are cut from data that was already oversampled, so
# synthetic points may sit in validation folds next to their source points —
# treat the cross-validation score as optimistic.
grid_search = GridSearchCV(
    estimator=svm_estimator,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=1,
    verbose=1,
)

# Run the search on the RESAMPLED training data.
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning configuration and its cross-validated score.
print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-weighted score on resampled data: {grid_search.best_score_:.4f}")

# The refitted estimator for the best parameter combination.
best_svm_model = grid_search.best_estimator_
print("\nBest SVM model trained successfully using best parameters on resampled data.")

# --- 6. Evaluate the best model's performance on the ORIGINAL Test Set ---

# Predict once on the held-out test set, then derive every metric from it.
y_pred = best_svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Overall accuracy
print(f"\nModel Accuracy on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment): {accuracy:.4f}")

# Per-class precision, recall and f1-score
print("\nClassification Report on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment):")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment):")
print(conf_matrix)

# How to read the confusion matrix (rows = actual, columns = predicted):
# [[True Negatives (TN)  False Positives (FP)]
#  [False Negatives (FN) True Positives (TP)]]
# For Parkinson's disease prediction:
# TN: healthy subject correctly predicted as healthy
# FP: healthy subject predicted as having Parkinson's disease (Type I error)
# FN: Parkinson's patient predicted as healthy (Type II error, more critical in medical diagnosis)
# TP: Parkinson's patient correctly predicted as having the disease

# --- 7. Save the trained model and preprocessing objects ---
# Inference needs all three artifacts: the model, the fitted scaler, and the
# IQR bounds used for outlier capping.
model_filename = 'parkinsons_svm_model.joblib'
scaler_filename = 'parkinsons_scaler.joblib'
train_iqr_bounds_filename = 'parkinsons_train_iqr_bounds.joblib'

# Persist each object to its file, in the same order as above.
for artifact, destination in (
    (best_svm_model, model_filename),
    (scaler, scaler_filename),
    (train_iqr_bounds, train_iqr_bounds_filename),
):
    joblib.dump(artifact, destination)

print(f"\nModel saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")
print(f"Training IQR bounds for outlier treatment saved to {train_iqr_bounds_filename}")


# --- 8. Example: Load model and predict on new data ---
print("\n--- Example: Loading saved model and predicting on new data ---")

# Load the saved model, scaler, and IQR bounds
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)
loaded_train_iqr_bounds = joblib.load(train_iqr_bounds_filename)

print("Model, scaler, and IQR bounds loaded successfully.")

# Feature order used at training time. The bounds dict was filled by iterating
# over the training columns, so its key order is the training column order.
# Reading it from the loaded artifact keeps this section usable without the
# training variables in memory.
training_features = list(loaded_train_iqr_bounds)

# Per-feature mean learned by the scaler, used to impute omitted features.
training_feature_means = pd.Series(loaded_scaler.mean_, index=training_features)


def _align_to_training_features(patient_df):
    """Return patient_df with exactly the training columns, in training order.

    Features missing from ``patient_df`` are filled with the training mean
    (which scales to 0, i.e. a neutral value) and reported with a warning.
    Filling them with a raw 0 instead would invent an implausible measurement
    that is then clipped to the lower IQR bound. Columns that are not training
    features are dropped.
    """
    missing = [name for name in training_features if name not in patient_df.columns]
    aligned = patient_df.reindex(columns=training_features)
    if missing:
        print(f"Warning: features missing from input, imputed with training mean: {missing}")
        for name in missing:
            aligned[name] = training_feature_means[name]
    return aligned


# Create some hypothetical new data for prediction
# The examples below supply these columns: MDVP:Fo(Hz), MDVP:Fhi(Hz), MDVP:Flo(Hz),
# MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP, MDVP:Shimmer,
# MDVP:Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, MDVP:APQ, Shimmer:DDA, DFA,
# spread1, spread2, D2, PPE
# Any training feature not supplied is imputed by _align_to_training_features.
new_patient_data = pd.DataFrame({
    'MDVP:Fo(Hz)': [119.992], 'MDVP:Fhi(Hz)': [157.302], 'MDVP:Flo(Hz)': [74.997],
    'MDVP:Jitter(%)': [0.00784], 'MDVP:Jitter(Abs)': [0.00007], 'MDVP:RAP': [0.00370],
    'MDVP:PPQ': [0.00494], 'Jitter:DDP': [0.01109], 'MDVP:Shimmer': [0.04374],
    'MDVP:Shimmer(dB)': [0.426], 'Shimmer:APQ3': [0.02182], 'Shimmer:APQ5': [0.03130],
    'MDVP:APQ': [0.02971], 'Shimmer:DDA': [0.06545], 'DFA': [0.73789],
    'spread1': [-5.454504], 'spread2': [0.282866], 'D2': [2.000000], 'PPE': [0.100000]
}, index=[0])

print("\nNew patient data for prediction:")
print(new_patient_data)

# Align columns of new_patient_data with the training feature set
new_patient_data_aligned = _align_to_training_features(new_patient_data)

# Preprocess the new data using the loaded scaler and IQR bounds
# Note: is_training=False, and provide the loaded scaler and IQR bounds
new_patient_processed = preprocess_data(
    new_patient_data_aligned,
    scaler=loaded_scaler,
    is_training=False,
    train_iqr_bounds=loaded_train_iqr_bounds
)

# Make prediction
new_prediction = loaded_model.predict(new_patient_processed)
prediction_proba = loaded_model.predict_proba(new_patient_processed) # Get probabilities

print(f"\nPredicted Outcome for new patient: {'Parkinsons Disease' if new_prediction[0] == 1 else 'No Parkinsons Disease'}")
print(f"Prediction Probability (No Parkinsons Disease, Parkinsons Disease): {prediction_proba[0]}")

# Another example: a patient likely without Parkinson's disease (hypothetical normal values)
new_patient_data_2 = pd.DataFrame({
    'MDVP:Fo(Hz)': [150.0], 'MDVP:Fhi(Hz)': [200.0], 'MDVP:Flo(Hz)': [100.0],
    'MDVP:Jitter(%)': [0.003], 'MDVP:Jitter(Abs)': [0.00002], 'MDVP:RAP': [0.00150],
    'MDVP:PPQ': [0.00200], 'Jitter:DDP': [0.00450], 'MDVP:Shimmer': [0.01500],
    'MDVP:Shimmer(dB)': [0.150], 'Shimmer:APQ3': [0.00700], 'Shimmer:APQ5': [0.00900],
    'MDVP:APQ': [0.00800], 'Shimmer:DDA': [0.02100], 'DFA': [0.70000],
    'spread1': [-4.000000], 'spread2': [0.200000], 'D2': [1.800000], 'PPE': [0.050000]
}, index=[0])

print("\nNew patient data 2 for prediction:")
print(new_patient_data_2)

# Align columns of new_patient_data_2 with the training feature set
new_patient_data_aligned_2 = _align_to_training_features(new_patient_data_2)

new_patient_processed_2 = preprocess_data(
    new_patient_data_aligned_2,
    scaler=loaded_scaler,
    is_training=False,
    train_iqr_bounds=loaded_train_iqr_bounds
)

new_prediction_2 = loaded_model.predict(new_patient_processed_2)
prediction_proba_2 = loaded_model.predict_proba(new_patient_processed_2)

print(f"\nPredicted Outcome for new patient 2: {'Parkinsons Disease' if new_prediction_2[0] == 1 else 'No Parkinsons Disease'}")
print(f"Prediction Probability (No Parkinsons Disease, Parkinsons Disease): {prediction_proba_2[0]}")

Dataset loaded successfully.
First 5 rows of the dataset:
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     