In [2]:
import sys
sys.path.append('../src')  # or './src' if running from project root
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from data_utils import load_migraine_data, check_missing_values, remove_duplicates, detect_outliers

# Example usage:
df = load_migraine_data('C://Users//nyolc//Downloads//migraine_data.csv')
check_missing_values(df)
df_clean = remove_duplicates(df)
outliers, lb, ub = detect_outliers(df_clean, 'Age')

Dataset shape: (400, 24)
No missing values found in the dataset
Number of duplicate rows: 6
Duplicate rows:
     Age  Duration  Frequency  Location  Character  Intensity  Nausea  Vomit  \
94    28         1          5         1          1          2       1      0   
118   28         1          5         1          1          2       1      0   
169   31         1          1         1          1          2       1      1   
200   50         1          1         1          1          3       1      0   
280   22         1          1         1          1          2       1      0   
281   35         1          1         1          1          3       1      0   

     Phonophobia  Photophobia  ...  Vertigo  Tinnitus  Hypoacusis  Diplopia  \
94             1            1  ...        0         0           0         0   
118            1            1  ...        0         0           0         0   
169            1            1  ...        0         0           0         0   
200            

In [3]:
# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                           roc_auc_score, confusion_matrix, classification_report)

# Model imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [4]:
# Visualization setup
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# ## 2. Data Loading and Initial Exploration


In [5]:
def load_and_explore_data(filepath):
    """Load and perform comprehensive data exploration"""
    print("📊 Loading and exploring migraine dataset...")
    
    # Load data
    df = pd.read_csv(filepath)
    
    print(f"\n Dataset Overview:")
    print(f"   • Shape: {df.shape}")
    print(f"   • Columns: {list(df.columns)}")
    
    # Basic info
    print(f"\n Data Types:")
    print(df.dtypes)
    
    print(f"\n Missing Values:")
    missing = df.isnull().sum()
    print(missing[missing > 0])
    
    print(f"\n Summary Statistics:")
    print(df.describe())
    
    return df

# Load the data
df = load_and_explore_data('C://Users//nyolc//Downloads//migraine_data.csv')

📊 Loading and exploring migraine dataset...

 Dataset Overview:
   • Shape: (400, 24)
   • Columns: ['Age', 'Duration', 'Frequency', 'Location', 'Character', 'Intensity', 'Nausea', 'Vomit', 'Phonophobia', 'Photophobia', 'Visual', 'Sensory', 'Dysphasia', 'Dysarthria', 'Vertigo', 'Tinnitus', 'Hypoacusis', 'Diplopia', 'Defect', 'Ataxia', 'Conscience', 'Paresthesia', 'DPF', 'Type']

 Data Types:
Age             int64
Duration        int64
Frequency       int64
Location        int64
Character       int64
Intensity       int64
Nausea          int64
Vomit           int64
Phonophobia     int64
Photophobia     int64
Visual          int64
Sensory         int64
Dysphasia       int64
Dysarthria      int64
Vertigo         int64
Tinnitus        int64
Hypoacusis      int64
Diplopia        int64
Defect          int64
Ataxia          int64
Conscience      int64
Paresthesia     int64
DPF             int64
Type           object
dtype: object

 Missing Values:
Series([], dtype: int64)

 Summary Statistics

# ## 3. Data Preprocessing and Cleaning


In [6]:
# --- 3.1 Handle Outliers (remove outliers in 'Age') ---
# Remove outliers detected earlier (using bounds from detect_outliers)
df_clean = df_clean[(df_clean['Age'] >= lb) & (df_clean['Age'] <= ub)]

# --- 3.2 Encode Categorical Variables ---
# Encode 'Type' (target) as numbers
le = LabelEncoder()
df_clean['Type_encoded'] = le.fit_transform(df_clean['Type'])

# --- 3.3 Feature/Target Split ---
X = df_clean.drop(['Type', 'Type_encoded'], axis=1)
y = df_clean['Type_encoded']

# --- 3.4 Feature Scaling ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3.5 Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (312, 23), Test shape: (78, 23)


## 4. Model Training and Evaluation

In [7]:
# --- 4.1 Define Models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Ridge Classifier": RidgeClassifier(),
    "MLP (Neural Net)": MLPClassifier(max_iter=1000)
}

# --- 4.2 Cross-Validation and Training ---
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    # Cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1_weighted')
    print(f"CV F1 (weighted): {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    # Fit and evaluate on test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Test Accuracy: {acc:.3f}")
    print(f"Test F1 (weighted): {f1:.3f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
    results[name] = {'cv_f1': cv_scores.mean(), 'test_acc': acc, 'test_f1': f1}

# --- 4.3 Compare Results ---
import pandas as pd
results_df = pd.DataFrame(results).T
display(results_df.sort_values('test_f1', ascending=False))


Training Logistic Regression...
CV F1 (weighted): 0.904 ± 0.027
Test Accuracy: 0.923
Test F1 (weighted): 0.919
Classification Report:
                                precision    recall  f1-score   support

            Basilar-type aura       0.67      0.67      0.67         3
 Familial hemiplegic migraine       0.75      0.60      0.67         5
        Migraine without aura       0.92      1.00      0.96        12
                        Other       1.00      0.67      0.80         3
 Sporadic hemiplegic migraine       1.00      0.67      0.80         3
   Typical aura with migraine       0.94      0.98      0.96        48
Typical aura without migraine       1.00      1.00      1.00         4

                     accuracy                           0.92        78
                    macro avg       0.90      0.80      0.84        78
                 weighted avg       0.92      0.92      0.92        78


Training Random Forest...
CV F1 (weighted): 0.879 ± 0.031
Test Accuracy: 0.936


Unnamed: 0,cv_f1,test_acc,test_f1
Random Forest,0.878629,0.935897,0.933788
Logistic Regression,0.904368,0.923077,0.919156
Naive Bayes,0.930572,0.923077,0.919156
MLP (Neural Net),0.897132,0.910256,0.911885
SVM,0.875297,0.910256,0.908279
Gradient Boosting,0.887671,0.897436,0.898042
Ridge Classifier,0.85515,0.897436,0.883459
KNN,0.841491,0.858974,0.853796
Decision Tree,0.816107,0.846154,0.838843
AdaBoost,0.648214,0.717949,0.6357


## 5. Model Results Visualization and Comparison

In [None]:
def visualize_model_results(results, y_test, le):
    """Create comprehensive visualizations of model results"""
    print("Creating model comparison visualizations...")
    
    # Prepare data for visualization
    metrics_df = pd.DataFrame({
        name: {
            'Accuracy': data['test_acc'],
            'F1 Score': data['test_f1'],
            'CV F1': data['cv_f1']
        }
        for name, data in results.items()
    }).T
    
    # Create visualization figure
    fig = plt.figure(figsize=(20, 15))
    
    # 1. Model comparison bar plot
    plt.subplot(3, 3, 1)
    metrics_df[['Accuracy', 'F1 Score']].plot(kind='bar')
    plt.title('Model Performance Comparison')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.ylabel('Score')
    
    # 2. Cross-validation scores
    plt.subplot(3, 3, 2)
    cv_scores = [data['cv_f1'] for data in results.values()]
    model_names = list(results.keys())
    
    plt.bar(range(len(model_names)), cv_scores)
    plt.xticks(range(len(model_names)), model_names, rotation=45)
    plt.title('Cross-Validation F1 Scores')
    plt.ylabel('F1 Score')
    
    # 3. Accuracy vs F1 Score scatter
    plt.subplot(3, 3, 3)
    plt.scatter([data['test_acc'] for data in results.values()], 
                [data['test_f1'] for data in results.values()])
    for i, name in enumerate(results.keys()):
        plt.annotate(name, (list(results.values())[i]['test_acc'], 
                           list(results.values())[i]['test_f1']))
    plt.xlabel('Test Accuracy')
    plt.ylabel('Test F1 Score')
    plt.title('Accuracy vs F1 Score')
    
    # 4-6. Confusion matrices for top 3 models
    top_models = sorted(results.items(), key=lambda x: x[1]['test_f1'], reverse=True)[:3]
    
    for i, (name, data) in enumerate(top_models, 4):
        plt.subplot(3, 3, i)
        # Get the model and make predictions
        model = models[name]
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=le.classes_, yticklabels=le.classes_)
        plt.title(f'Confusion Matrix - {name}')
    
    # 7. Feature importance (for Random Forest)
    plt.subplot(3, 3, 7)
    rf_model = models['Random Forest']
    if hasattr(rf_model, 'feature_importances_'):
        feature_names = [col for col in df_clean.columns if col not in ['Type', 'Type_encoded']]
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=True).tail(10)
        
        plt.barh(importance_df['feature'], importance_df['importance'])
        plt.title('Top 10 Feature Importances (Random Forest)')
    
    # 8. Model ranking
    plt.subplot(3, 3, 8)
    ranking_df = metrics_df.rank(ascending=False).mean(axis=1).sort_values()
    plt.barh(ranking_df.index, ranking_df.values)
    plt.title('Overall Model Ranking (Lower is Better)')
    
    # 9. Performance distribution
    plt.subplot(3, 3, 9)
    metrics_df.boxplot()
    plt.title('Performance Metrics Distribution')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    return metrics_df

# Create visualizations
metrics_summary = visualize_model_results(results, y_test, le)

## 6. Model Performance Summary

In [None]:
def print_performance_summary(results):
    """Print comprehensive performance summary"""
    print("MODEL PERFORMANCE SUMMARY")
    print("=" * 60)
    
    # Sort models by F1 score
    sorted_results = sorted(results.items(), key=lambda x: x[1]['test_f1'], reverse=True)
    
    for rank, (name, data) in enumerate(sorted_results, 1):
        print(f"\n{rank}. {name}")
        print(f"   Accuracy:     {data['test_acc']:.4f}")
        print(f"   F1 Score:     {data['test_f1']:.4f}")
        print(f"   CV F1 Score:  {data['cv_f1']:.4f}")
    
    # Best model analysis
    best_model_name = sorted_results[0][0]
    best_model_data = sorted_results[0][1]
    
    print(f"\nBEST PERFORMING MODEL: {best_model_name}")
    print("=" * 40)
    print(f"This model achieved the highest F1 score of {best_model_data['test_f1']:.4f}")
    print(f"with a cross-validation F1 score of {best_model_data['cv_f1']:.4f}")
    
    return best_model_name, best_model_data

best_model_name, best_model_info = print_performance_summary(results)

## 7. Advanced Model Analysis

In [None]:
def perform_advanced_analysis(best_model_name, results, X_test, y_test, le):
    """Perform advanced analysis on the best model"""
    print(f"Advanced Analysis for {best_model_name}")
    print("=" * 50)
    
    best_model = models[best_model_name]
    y_pred = best_model.predict(X_test)
    
    # Detailed classification report
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    # Feature importance analysis (if available)
    if hasattr(best_model, 'feature_importances_'):
        print(f"\nTop 10 Most Important Features:")
        feature_names = [col for col in df_clean.columns if col not in ['Type', 'Type_encoded']]
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        for i, (_, row) in enumerate(importance_df.head(10).iterrows()):
            print(f"   {i+1}. {row['feature']}: {row['importance']:.4f}")
    
    # Error analysis
    errors = X_test[y_test != y_pred]
    if len(errors) > 0:
        print(f"\nError Analysis:")
        print(f"   Total misclassifications: {len(errors)}")
        print(f"   Error rate: {len(errors)/len(y_test)*100:.2f}%")
    
    # Class-wise performance
    print(f"\nClass-wise Performance:")
    for i, class_name in enumerate(le.classes_):
        class_mask = y_test == i
        if class_mask.sum() > 0:
            class_acc = (y_pred[class_mask] == y_test[class_mask]).mean()
            print(f"   {class_name}: {class_acc:.3f}")

perform_advanced_analysis(best_model_name, results, X_test, y_test, le)

## 8. Hyperparameter Optimization for Best Model

In [None]:
def optimize_best_model(best_model_name, X_train, y_train, X_test, y_test):
    """Perform hyperparameter optimization for the best model"""
    print(f"Hyperparameter Optimization for {best_model_name}")
    print("=" * 50)
    
    if best_model_name == "Random Forest":
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        base_model = RandomForestClassifier(random_state=42)
    
    elif best_model_name == "Logistic Regression":
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }
        base_model = LogisticRegression(max_iter=1000, random_state=42)
    
    elif best_model_name == "SVM":
        param_grid = {
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto', 0.1, 0.01]
        }
        base_model = SVC(probability=True, random_state=42)
    
    else:
        print("Hyperparameter optimization not implemented for this model type")
        return None
    
    # Perform grid search
    grid_search = GridSearchCV(
        base_model, 
        param_grid, 
        cv=5, 
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")
    
    # Evaluate optimized model
    y_pred_opt = grid_search.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_opt)
    test_f1 = f1_score(y_test, y_pred_opt, average='weighted')
    
    print(f"Optimized model test accuracy: {test_acc:.4f}")
    print(f"Optimized model test F1: {test_f1:.4f}")
    
    return grid_search.best_estimator_

optimized_model = optimize_best_model(best_model_name, X_train, y_train, X_test, y_test)

## 9. Feature Selection Analysis

In [None]:
def perform_feature_selection(X_train, y_train, X_test, y_test, feature_names):
    """Perform feature selection analysis"""
    print("Feature Selection Analysis")
    print("=" * 30)
    
    # Recursive Feature Elimination with Random Forest
    from sklearn.feature_selection import RFE
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(estimator=rf, n_features_to_select=10)
    
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    
    # Train model with selected features
    rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_selected.fit(X_train_rfe, y_train)
    
    y_pred_rfe = rf_selected.predict(X_test_rfe)
    acc_rfe = accuracy_score(y_test, y_pred_rfe)
    f1_rfe = f1_score(y_test, y_pred_rfe, average='weighted')
    
    print(f"Performance with top 10 features:")
    print(f"   Accuracy: {acc_rfe:.4f}")
    print(f"   F1 Score: {f1_rfe:.4f}")
    
    # Show selected features
    selected_features = [feature_names[i] for i in range(len(feature_names)) if rfe.support_[i]]
    print(f"\nTop 10 selected features:")
    for i, feature in enumerate(selected_features, 1):
        print(f"   {i}. {feature}")
    
    return selected_features, acc_rfe, f1_rfe

selected_features, acc_rfe, f1_rfe = perform_feature_selection(X_train, y_train, X_test, y_test, 
                                                             [col for col in df_clean.columns if col not in ['Type', 'Type_encoded']])

## 10. Model Interpretability Analysis

In [None]:
def analyze_model_interpretability(best_model_name, models, X_test, y_test, le):
    """Analyze model interpretability"""
    print("Model Interpretability Analysis")
    print("=" * 35)
    
    best_model = models[best_model_name]
    
    # Feature importance plot
    if hasattr(best_model, 'feature_importances_'):
        feature_names = [col for col in df_clean.columns if col not in ['Type', 'Type_encoded']]
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(12, 8))
        plt.barh(importance_df['feature'], importance_df['importance'])
        plt.title(f'Feature Importance - {best_model_name}')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.show()
    
    # Partial dependence plots for top features
    if hasattr(best_model, 'feature_importances_'):
        from sklearn.inspection import partial_dependence
        
        top_features = [col for col in df_clean.columns if col not in ['Type', 'Type_encoded']][:3]
        
        plt.figure(figsize=(15, 5))
        for i, feature in enumerate(top_features, 1):
            plt.subplot(1, 3, i)
            try:
                partial_dependence.plot_partial_dependence(
                    best_model, X_test, [feature], 
                    feature_names=[col for col in df_clean.columns if col not in ['Type', 'Type_encoded']]
                )
                plt.title(f'Partial Dependence - {feature}')
            except:
                plt.text(0.5, 0.5, f'PDP not available\nfor {feature}', 
                        ha='center', va='center', transform=plt.gca().transAxes)
                plt.title(f'Partial Dependence - {feature}')
        
        plt.tight_layout()
        plt.show()

analyze_model_interpretability(best_model_name, models, X_test, y_test, le)

## 11. Recommendations and Next Steps

In [None]:
def provide_recommendations(metrics_summary, best_model_name, results):
    """Provide actionable recommendations based on results"""
    print("RECOMMENDATIONS AND NEXT STEPS")
    print("=" * 40)
    
    print(f"Best Model: {best_model_name}")
    print(f"   This model showed the best overall performance")
    print(f"   Consider using this for production deployment")
    
    print(f"\nModel Improvement Suggestions:")
    print(f"   Hyperparameter tuning using GridSearchCV")
    print(f"   Feature selection to reduce overfitting")
    print(f"   Ensemble methods combining top 3 models")
    print(f"   Cross-validation with different strategies")
    
    print(f"\nData Collection Recommendations:")
    print(f"   Collect more data to improve model robustness")
    print(f"   Include additional relevant features")
    print(f"   Address class imbalance if present")
    
    print(f"\nDeployment Considerations:")
    print(f"   Implement model monitoring")
    print(f"   Set up automated retraining pipeline")
    print(f"   Create prediction confidence intervals")
    
    print(f"\nBusiness Impact:")
    avg_accuracy = metrics_summary['Accuracy'].mean()
    print(f"   Average model accuracy: {avg_accuracy:.1%}")
    print(f"   Potential for automated migraine prediction")
    print(f"   Can assist in preventive healthcare decisions")
    
    # Specific recommendations based on results
    print(f"\nSpecific Recommendations:")
    if best_model_name == "Random Forest":
        print(f"   Random Forest shows good performance and interpretability")
        print(f"   Consider feature importance for clinical insights")
    elif best_model_name == "Logistic Regression":
        print(f"   Logistic Regression provides good baseline and interpretability")
        print(f"   Consider regularization for better generalization")
    elif best_model_name == "SVM":
        print(f"   SVM shows good performance but may be slower for large datasets")
        print(f"   Consider kernel selection for better performance")

provide_recommendations(metrics_summary, best_model_name, results)

## 12. Conclusion

In [None]:
print("ANALYSIS COMPLETE!")
print("=" * 30)
print("This comprehensive analysis successfully:")
print("Preprocessed and cleaned the migraine dataset")
print("Trained and evaluated 10 different ML models")
print("Identified the best performing model")
print("Provided actionable insights and recommendations")
print("\nThe models are ready for deployment and further optimization!")

# Save the best model
import joblib
joblib.dump(models[best_model_name], f'best_model_{best_model_name.replace(" ", "_").lower()}.pkl')
print(f"\nBest model saved as: best_model_{best_model_name.replace(' ', '_').lower()}.pkl")

In [None]:
## Hyperparameter Tuning

In [8]:
# Example: Grid Search for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10]
}
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='f1_weighted')
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

Best params: {'max_depth': None, 'n_estimators': 200}
Best CV F1: 0.8660142093857431
