In [4]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.utils import resample
warnings.filterwarnings("ignore")

# Load the data
# Reads the dataset from a CSV file in the current working directory and
# reports its shape and the per-class row counts of the 'Asthma' target column.
df = pd.read_csv("asthma_detection.csv")
print(f"Original dataset shape: {df.shape}")
print(f"Class distribution:\n{df['Asthma'].value_counts()}")

# 1. ADVANCED DATA PREPROCESSING

def advanced_outlier_removal(df, method='iqr', factor=1.5):
    """Drop rows that are outliers in any numeric feature column.

    Columns are processed one after another, and each column's statistics
    are computed on the rows that survived the previous columns. The target
    column 'Asthma' is never used as a filter. The input frame is not
    modified.

    Args:
        df: Input DataFrame.
        method: 'iqr' keeps rows within [Q1 - factor*IQR, Q3 + factor*IQR];
            'zscore' keeps rows whose absolute z-score is below ``factor``.
        factor: IQR multiplier or z-score threshold, depending on ``method``.

    Returns:
        A filtered copy of ``df`` (original index labels are preserved).

    Raises:
        ValueError: If ``method`` is neither 'iqr' nor 'zscore'.
    """
    if method not in ('iqr', 'zscore'):
        # Previously an unknown method silently returned the data unfiltered.
        raise ValueError(
            f"Unknown outlier method {method!r}; expected 'iqr' or 'zscore'"
        )

    df_clean = df.copy()
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'Asthma'
    ]

    for col in numeric_cols:
        values = df_clean[col]
        if method == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            # between() is inclusive on both ends, matching >= / <=.
            keep = values.between(q1 - factor * iqr, q3 + factor * iqr)
        else:
            std = values.std()
            # A constant (or single-row) column has std 0 / NaN; dividing by
            # it yields NaN z-scores, which would discard every row. Such a
            # column cannot contain outliers, so leave the rows untouched.
            if not std > 0:
                continue
            keep = ((values - values.mean()) / std).abs() < factor
        df_clean = df_clean[keep]

    return df_clean

def create_feature_interactions(df):
    """Return a copy of ``df`` extended with derived symptom features.

    Added columns, in order: Respiratory_Score, Nasal_Score,
    Total_Symptom_Score, Avg_Symptom_Score, Age_Symptom_Interaction
    (only when an 'Age_60+' column is present), Cough_Breathing_Combo and
    Nasal_Throat_Combo. The input frame is left unchanged.
    """
    out = df.copy()

    cough = out['Dry-Cough']
    breathing = out['Difficulty-in-Breathing']
    throat = out['Sore-Throat']
    congestion = out['Nasal-Congestion']

    # Group averages for the respiratory and nasal symptom clusters
    out['Respiratory_Score'] = (cough + breathing + throat) / 3
    out['Nasal_Score'] = (congestion + out['Runny-Nose']) / 2

    # Row-wise aggregate over all seven symptom columns
    symptoms = out[['Tiredness', 'Dry-Cough', 'Difficulty-in-Breathing',
                    'Sore-Throat', 'Pains', 'Nasal-Congestion', 'Runny-Nose']]
    out['Total_Symptom_Score'] = symptoms.sum(axis=1)
    out['Avg_Symptom_Score'] = symptoms.mean(axis=1)

    # Age flag scaled by overall symptom load (skipped if the flag is absent)
    if 'Age_60+' in out:
        out['Age_Symptom_Interaction'] = out['Age_60+'] * out['Total_Symptom_Score']

    # Pairwise products of related symptoms
    out['Cough_Breathing_Combo'] = cough * breathing
    out['Nasal_Throat_Combo'] = congestion * throat

    return out

# Run the two preprocessing stages: IQR-based row filtering first, then
# derived-feature construction on the filtered frame.
df_clean = advanced_outlier_removal(
    df,
    method='iqr',
    factor=1.5,
)
df_enhanced = create_feature_interactions(df=df_clean)

# Report how many rows/columns remain after each stage
print(f"After outlier removal: {df_clean.shape}")
print(f"After feature engineering: {df_enhanced.shape}")

# 2. FEATURE SELECTION AND SCALING

def select_best_features(X, y, k=15):
    """Keep the ``k`` highest-scoring columns of ``X`` under ``f_classif``.

    Returns a 3-tuple: the reduced feature matrix, the labels of the
    retained columns (taken from ``X.columns``), and the fitted
    ``SelectKBest`` object so the same selection can be re-applied later.
    """
    kbest = SelectKBest(score_func=f_classif, k=k)
    kbest.fit(X, y)
    reduced = kbest.transform(X)
    kept_mask = kbest.get_support()
    return reduced, X.columns[kept_mask], kbest

# Separate the target column from the predictors
y = df_enhanced['Asthma']
X = df_enhanced.drop(columns='Asthma')

# Keep at most 15 features (fewer if the frame has fewer columns).
# NOTE: the selector is fitted on the full dataset, i.e. before the
# train/test split performed further down.
X_selected, selected_features, feature_selector = select_best_features(
    X, y, k=min(15, len(X.columns))
)
print(f"Selected features: {list(selected_features)}")

# 3. TRAIN-TEST SPLIT WITH STRATIFICATION
# The split happens BEFORE any resampling so that the held-out set contains
# only real, untouched samples. Oversampling first would let synthetic points
# interpolated from future test rows end up in the training data (and
# synthetic rows end up in the test data), inflating every reported score.
print(f"Class distribution before balancing:\n{pd.Series(y).value_counts()}")

X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# 4. HANDLE CLASS IMBALANCE (training data only)
# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_raw, y_train_raw)
print(f"Class distribution after SMOTE:\n{pd.Series(y_balanced).value_counts()}")

# The balanced set is what the models are trained on.
X_train, y_train = X_balanced, y_balanced

# 5. SCALING
# The scaler is fitted on the training data only and then applied to the
# test data, so no test-set statistics influence the transformation.
scaler = RobustScaler()  # More robust to outliers than StandardScaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. HYPERPARAMETER TUNING

def tune_random_forest(X_train, y_train):
    """Grid-search a RandomForestClassifier and return the winner.

    The search is scored by accuracy over a shuffled 5-fold stratified
    split (seed 42) and runs with ``n_jobs=-1``.

    Returns:
        Tuple of (best fitted estimator, dict of its hyperparameters).
    """
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2', None],
        bootstrap=[True, False],
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    searcher = GridSearchCV(
        RandomForestClassifier(random_state=42),
        search_space,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    searcher.fit(X_train, y_train)

    return searcher.best_estimator_, searcher.best_params_

def tune_gradient_boosting(X_train, y_train):
    """Grid-search a GradientBoostingClassifier and return the winner.

    The search is scored by accuracy over a shuffled 5-fold stratified
    split (seed 42) and runs with ``n_jobs=-1``.

    Returns:
        Tuple of (best fitted estimator, dict of its hyperparameters).
    """
    search_space = dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1, 0.2],
        max_depth=[3, 5, 7],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    searcher = GridSearchCV(
        GradientBoostingClassifier(random_state=42),
        search_space,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    searcher.fit(X_train, y_train)

    return searcher.best_estimator_, searcher.best_params_

def tune_svm(X_train, y_train):
    """Grid-search an SVC over C, gamma and kernel and return the winner.

    The search is scored by accuracy over a shuffled 5-fold stratified
    split (seed 42) and runs with ``n_jobs=-1``.

    Returns:
        Tuple of (best fitted estimator, dict of its hyperparameters).
    """
    search_space = dict(
        C=[0.1, 1, 10, 100],
        gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
        kernel=['rbf', 'poly', 'sigmoid'],
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    searcher = GridSearchCV(
        SVC(random_state=42),
        search_space,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    searcher.fit(X_train, y_train)

    return searcher.best_estimator_, searcher.best_params_

# Tune each model family on the scaled training data, announcing each search
# before it starts and echoing the winning hyperparameters afterwards.
print("Tuning Random Forest...")
best_rf, best_rf_params = tune_random_forest(
    X_train=X_train_scaled, y_train=y_train
)
print(f"Best RF params: {best_rf_params}")

print("Tuning Gradient Boosting...")
best_gb, best_gb_params = tune_gradient_boosting(
    X_train=X_train_scaled, y_train=y_train
)
print(f"Best GB params: {best_gb_params}")

print("Tuning SVM...")
best_svm, best_svm_params = tune_svm(
    X_train=X_train_scaled, y_train=y_train
)
print(f"Best SVM params: {best_svm_params}")

# 7. ENSEMBLE METHODS

# Untuned logistic regression, fitted on its own so it can also be scored
# individually alongside the tuned models.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)

# Combine the three tuned models and the logistic regression with
# voting='hard', then fit the ensemble on the same training data.
ensemble_members = [
    ('rf', best_rf),
    ('gb', best_gb),
    ('svm', best_svm),
    ('lr', lr),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

voting_clf.fit(X_train_scaled, y_train)

# 8. MODEL EVALUATION

def evaluate_model(model, X_test, y_test, model_name):
    """Score ``model`` on a held-out set and print a short report.

    Precision, recall and F1 use ``average='weighted'``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, y_pred)`` where ``y_pred``
        is the output of ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)

    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, **weighted)
    recall = recall_score(y_test, y_pred, **weighted)
    f1 = f1_score(y_test, y_pred, **weighted)

    report_lines = (
        f"\n{model_name} Results:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1-score: {f1:.4f}",
    )
    for line in report_lines:
        print(line)

    return accuracy, precision, recall, f1, y_pred

# Score every fitted model on the scaled test set, keeping both the metric
# summary and the raw predictions keyed by display name.
models = {
    'Tuned Random Forest': best_rf,
    'Tuned Gradient Boosting': best_gb,
    'Tuned SVM': best_svm,
    'Logistic Regression': lr,
    'Voting Ensemble': voting_clf
}

results = {}
predictions = {}

metric_names = ('accuracy', 'precision', 'recall', 'f1')
for name, model in models.items():
    *scores, y_pred = evaluate_model(model, X_test_scaled, y_test, name)
    results[name] = dict(zip(metric_names, scores))
    predictions[name] = y_pred

# 9. CROSS-VALIDATION FOR BEST MODEL
# The "best" model is the one with the highest accuracy in `results`
# (ties resolve to the first entry in insertion order).
best_model_name = max(results, key=lambda model_key: results[model_key]['accuracy'])
best_model = models[best_model_name]

print(f"\nBest model: {best_model_name}")
print("Performing cross-validation...")

# 5-fold accuracy on the training data; the printed spread is two standard
# deviations of the fold scores.
cv_scores = cross_val_score(
    best_model,
    X_train_scaled,
    y_train,
    scoring='accuracy',
    cv=5,
)
print(f"CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")

# 10. FEATURE IMPORTANCE
# Only models exposing `feature_importances_` get this section; the others
# are skipped silently.
importances = getattr(best_model, 'feature_importances_', None)
if importances is not None:
    feature_importance = pd.DataFrame(
        {'feature': selected_features, 'importance': importances}
    ).sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title(f'Top 10 Feature Importances - {best_model_name}')
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(top_features)

# 11. CONFUSION MATRIX FOR BEST MODEL
# Rows are actual classes, columns are predicted classes; tick labels are
# the distinct values found in y_test.
class_labels = np.unique(y_test)
cm = confusion_matrix(y_test, predictions[best_model_name])

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# 12. MODEL COMPARISON VISUALIZATION
# One bar chart per metric on a 2x2 grid, with one bar per model.
results_df = pd.DataFrame(results).T
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['accuracy', 'precision', 'recall', 'f1']
# axes.flat walks the grid row by row, so panels fill left-to-right,
# top-to-bottom in the order the metrics are listed.
for ax, metric in zip(axes.flat, metrics):
    label = metric.capitalize()
    results_df[metric].plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'Model Comparison - {label}')
    ax.set_ylabel(label)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# 13. SAVE THE BEST MODEL
# Persist the winning model together with the fitted scaler and feature
# selector, each to its own file in the current working directory.
from joblib import dump

artifacts = [
    (best_model, f'best_asthma_model_{best_model_name.lower().replace(" ", "_")}.pkl'),
    (scaler, 'asthma_scaler.pkl'),
    (feature_selector, 'asthma_feature_selector.pkl'),
]
for artifact, target_path in artifacts:
    dump(artifact, target_path)

print(f"\nBest model saved as: best_asthma_model_{best_model_name.lower().replace(' ', '_')}.pkl")
print("Scaler and feature selector also saved.")

# 14. PREDICTION FUNCTION
def predict_asthma(symptoms_dict, model=best_model, scaler=scaler, selector=feature_selector):
    """Predict the asthma class for a single set of symptoms.

    The input goes through the same steps as the training data: feature
    engineering, feature selection, scaling, then prediction.

    Args:
        symptoms_dict: Mapping of feature name to value. It must contain
            every original feature the selector was fitted on; key order
            does not matter.
        model: Fitted classifier. Defaults to the module-level ``best_model``
            (bound when this function is defined).
        scaler: Fitted scaler applied after feature selection.
        selector: Fitted feature selector applied to the engineered frame.

    Returns:
        Tuple ``(prediction, probability)``. ``probability`` is the first
        row of ``model.predict_proba`` when the model offers it, otherwise
        ``None``.

    Raises:
        ValueError: If features the selector was fitted on are missing from
            the input.
    """
    # Create DataFrame from input
    input_df = pd.DataFrame([symptoms_dict])

    # Apply same feature engineering
    input_enhanced = create_feature_interactions(input_df)

    # Align columns with the frame the selector was fitted on. The one-row
    # frame inherits the dict's insertion order, which need not match the
    # training column order that scikit-learn validates against.
    expected_cols = getattr(selector, 'feature_names_in_', None)
    if expected_cols is not None:
        missing = [col for col in expected_cols if col not in input_enhanced.columns]
        if missing:
            raise ValueError(
                f"symptoms_dict is missing required features: {missing}"
            )
        input_enhanced = input_enhanced[list(expected_cols)]

    # Select features
    input_selected = selector.transform(input_enhanced)

    # Scale features
    input_scaled = scaler.transform(input_selected)

    # Make prediction
    prediction = model.predict(input_scaled)[0]
    probability = model.predict_proba(input_scaled)[0] if hasattr(model, 'predict_proba') else None

    return prediction, probability

# Example usage
# A single hand-written record passed through predict_asthma as a smoke test.
# NOTE(review): presumably these keys cover every non-target column of the
# training CSV — verify against the dataset, since the selector is applied to
# the engineered version of this record.
example_symptoms = {
    'Age_60+': 0,
    'Tiredness': 3,
    'Dry-Cough': 4,
    'Difficulty-in-Breathing': 5,
    'Sore-Throat': 2,
    'Pains': 1,
    'Nasal-Congestion': 3,
    'Runny-Nose': 2
}

prediction, probability = predict_asthma(example_symptoms)
print(f"\nExample prediction: {prediction}")
# probability is None when the chosen model has no predict_proba
if probability is not None:
    print(f"Prediction probabilities: {probability}")

# Summary: compares the best model's test accuracy with a hard-coded 84%
# baseline (the 0.84 in the last line is the same constant).
print(f"\nFinal Accuracy Improvement:")
print(f"Original Accuracy: 84%")
print(f"Best Model Accuracy: {results[best_model_name]['accuracy']*100:.2f}%")
print(f"Improvement: {(results[best_model_name]['accuracy'] - 0.84)*100:.2f} percentage points")

ImportError: cannot import name 'tarfile_extractall' from 'sklearn.utils.fixes' (c:\Users\GARV ANAND\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\fixes.py)

ImportError: cannot import name 'tarfile_extractall' from 'sklearn.utils.fixes' (c:\Users\GARV ANAND\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\fixes.py)