#Classification using Transfer learning

#Feature Extraction

# In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import warnings
warnings.filterwarnings('ignore')

def extract_mfcc_features(file_path, n_mfcc=13, sr=22050):
    """Summarise every MFCC coefficient of an audio file with four statistics.

    The file is loaded at sample rate ``sr`` and ``n_mfcc`` MFCC rows are
    computed. The returned flat list is ordered coefficient by coefficient
    as mean, std, min, max (52 values with the default ``n_mfcc=13``).
    """
    signal, _ = librosa.load(file_path, sr=sr)
    coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    # Applied to each coefficient row, in exactly this order.
    summaries = (np.mean, np.std, np.min, np.max)
    return [
        summarise(coefficients[row])
        for row in range(n_mfcc)
        for summarise in summaries
    ]

def extract_spectral_features(file_path, sr=22050):
    """Return mean and std of four frame-level spectral descriptors.

    The 8 values are ordered as (mean, std) pairs for spectral centroid,
    spectral roll-off, spectral bandwidth and zero-crossing rate.
    """
    signal, _ = librosa.load(file_path, sr=sr)

    # Each call returns a 2-D array; row 0 is the per-frame track.
    tracks = (
        librosa.feature.spectral_centroid(y=signal, sr=sr)[0],
        librosa.feature.spectral_rolloff(y=signal, sr=sr)[0],
        librosa.feature.spectral_bandwidth(y=signal, sr=sr)[0],
        librosa.feature.zero_crossing_rate(signal)[0],
    )

    summary = []
    for track in tracks:
        summary.append(np.mean(track))
        summary.append(np.std(track))
    return summary

def extract_pitch_features(file_path, sr=22050):
    """Return mean, std, min and max of the dominant pitch per frame.

    For every frame produced by ``librosa.piptrack`` the pitch at the bin
    with the largest magnitude is taken; frames whose dominant pitch is
    not positive are ignored. If no frame qualifies, four zeros are
    returned.
    """
    signal, _ = librosa.load(file_path, sr=sr)
    pitches, magnitudes = librosa.piptrack(y=signal, sr=sr)

    frame_count = pitches.shape[1]
    dominant = (
        pitches[magnitudes[:, frame].argmax(), frame]
        for frame in range(frame_count)
    )
    voiced = [value for value in dominant if value > 0]

    if not voiced:
        return [0, 0, 0, 0]
    return [np.mean(voiced), np.std(voiced), np.min(voiced), np.max(voiced)]

def extract_all_features(audio_directory, output_file='all_speaker_features.csv'):
    """Extract all 64 features from the WAV files of a directory and save to CSV.

    Args:
        audio_directory: Directory whose entries ending in ``.wav``
            (case-insensitive) are processed. Sub-directories are not
            descended into.
        output_file: Path of the CSV file to write.

    Returns:
        DataFrame with one row per successfully processed file: 52 MFCC,
        8 spectral and 4 pitch columns, plus a ``speaker`` column holding
        the part of the file name before the first ``-``.

    A file whose feature extraction raises is reported on stdout and
    skipped; it does not abort the run.
    """
    # os.listdir returns entries in arbitrary order. Sorting keeps the CSV
    # row order identical between runs and machines, which any seeded
    # sampling of the resulting table depends on.
    wav_files = sorted(
        os.path.join(audio_directory, f)
        for f in os.listdir(audio_directory)
        if f.lower().endswith('.wav')
    )

    # Column names follow the order in which the three extractors are
    # concatenated below (13 is the extractor's default n_mfcc).
    mfcc_names = [f'mfcc_{i}_{stat}' for i in range(13)
                  for stat in ['mean', 'std', 'min', 'max']]
    spectral_names = ['spectral_centroid_mean', 'spectral_centroid_std',
                      'spectral_rolloff_mean', 'spectral_rolloff_std',
                      'spectral_bandwidth_mean', 'spectral_bandwidth_std',
                      'zcr_mean', 'zcr_std']
    pitch_names = ['pitch_mean', 'pitch_std', 'pitch_min', 'pitch_max']
    feature_names = mfcc_names + spectral_names + pitch_names

    features_data = []
    labels = []

    for file_path in wav_files:
        try:
            all_features = (extract_mfcc_features(file_path) +
                            extract_spectral_features(file_path) +
                            extract_pitch_features(file_path))
        except Exception as e:
            # Best effort: one unreadable file must not lose the whole batch.
            print(f"Error: {file_path}: {e}")
            continue

        features_data.append(all_features)
        # The speaker label is the file-name prefix before the first '-'.
        labels.append(os.path.basename(file_path).split('-')[0])

    df = pd.DataFrame(features_data, columns=feature_names)
    df['speaker'] = labels
    df.to_csv(output_file, index=False)
    print(f"Extracted {len(df)} samples, saved to {output_file}")
    return df



#Data Analysis & preprocessing

# In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the raw feature table and report its size and class balance.
df = pd.read_csv('all_speaker_features.csv')
print(f"Samples: {len(df)}, Features: {df.shape[1] - 1}")
counts = df['speaker'].value_counts().sort_index()
print(counts)

# Speaker distribution plot: the two brothers are drawn in red, all
# other speakers in blue.
brother_names = ('murad', 'teymur')
colors = []
for speaker_name in counts.index:
    colors.append('#e74c3c' if speaker_name in brother_names else '#3498db')

plt.figure(figsize=(8, 4))
plt.bar(counts.index, counts.values, color=colors)
plt.xlabel('Speaker')
plt.ylabel('Samples')
plt.title('Speaker Distribution (Red = Brothers)')
plt.tight_layout()
plt.savefig('speaker_distribution.png', dpi=150)
plt.close()

# Scaling: standardise every feature column.
# NOTE(review): the scaler is fitted on all samples here, before the
# train/validation/test split done later in this file, so held-out rows
# contribute to the scaling statistics -- confirm this is intended.
X = df.drop(columns='speaker')
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Before/after scaling plots for a handful of representative features.
sample_features = ['mfcc_0_mean', 'mfcc_5_mean', 'spectral_centroid_mean',
                   'spectral_rolloff_mean', 'zcr_mean', 'pitch_mean']


def _save_boxplot(frame, title, filename):
    """Draw a box plot of every column of ``frame`` and save it to ``filename``."""
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=frame)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    plt.close()


_save_boxplot(X[sample_features],
              'BEFORE Scaling: Feature Ranges Differ Dramatically',
              'before_scaling_boxplot.png')
_save_boxplot(X_scaled[sample_features],
              'AFTER Scaling: All Features Centered Around 0',
              'after_scaling_boxplot.png')

# Correlation analysis across all scaled features.
corr_matrix = X_scaled.corr()

plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=150)
plt.close()

# Two hand-picked columns are dropped (presumably chosen from the heatmap
# above -- TODO confirm), then the labels are re-attached and saved.
columns_to_remove = ['spectral_rolloff_mean', 'zcr_std']
X_final = X_scaled.drop(columns=columns_to_remove).assign(
    speaker=df['speaker'].values)
X_final.to_csv('features_processed.csv', index=False)
print(f"Final features: {X_final.shape[1] - 1}")

#Train/Validation/Test Split

# In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('features_processed.csv')

# The brothers are split by hand with fixed per-speaker counts so both of
# them are adequately represented in the test and validation sets.
brother_names = ['murad', 'teymur']
is_brother = df['speaker'].isin(brother_names)
brothers = df[is_brother].copy()
diverse = df[~is_brother].copy()

brother_parts = {'test': [], 'val': [], 'train': []}

for speaker in brother_names:
    speaker_data = brothers[brothers['speaker'] == speaker]

    # 5 rows go to test, 3 of the remainder to validation, the rest to train.
    test_samples = speaker_data.sample(n=5, random_state=42)
    remaining = speaker_data[~speaker_data.index.isin(test_samples.index)]
    val_samples = remaining.sample(n=3, random_state=42)
    train_samples = remaining[~remaining.index.isin(val_samples.index)]

    brother_parts['test'].append(test_samples)
    brother_parts['val'].append(val_samples)
    brother_parts['train'].append(train_samples)

brother_test = pd.concat(brother_parts['test'])
brother_val = pd.concat(brother_parts['val'])
brother_train = pd.concat(brother_parts['train'])

# Stratified split for diverse speakers: 70% train, then the remaining
# 30% is halved into validation and test.
div_train, div_temp = train_test_split(
    diverse, test_size=0.3, random_state=42, stratify=diverse['speaker'])
div_val, div_test = train_test_split(
    div_temp, test_size=0.5, random_state=42, stratify=div_temp['speaker'])


def _combine_and_shuffle(*parts):
    """Concatenate the given frames and return them in a seeded random order."""
    combined = pd.concat(list(parts))
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)


# Combine and shuffle
train_df = _combine_and_shuffle(brother_train, div_train)
val_df = _combine_and_shuffle(brother_val, div_val)
test_df = _combine_and_shuffle(brother_test, div_test)

for split_frame, split_path in ((train_df, 'train.csv'),
                                (val_df, 'val.csv'),
                                (test_df, 'test.csv')):
    split_frame.to_csv(split_path, index=False)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

#Feature Selection - Lasso

# In [None]:
from sklearn.linear_model import LogisticRegression

train_df, val_df, test_df = (
    pd.read_csv(path) for path in ('train.csv', 'val.csv', 'test.csv')
)


def _split_xy(frame):
    """Return (feature columns, 'speaker' label column) of ``frame``."""
    return frame.drop(columns='speaker'), frame['speaker']


X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)

# Lasso feature selection with C=0.25 (L1-penalised logistic regression,
# fitted on the training split only).
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.25,
                           random_state=42, max_iter=1000)
lasso.fit(X_train, y_train)

# A feature is kept when its coefficient is non-zero in at least one row
# of the fitted coefficient matrix.
selected_mask = (lasso.coef_ != 0).any(axis=0)
selected_features = list(X_train.columns[selected_mask])
print(f"Lasso selected {len(selected_features)} features")


def _with_labels(features, labels):
    """Return the selected feature columns with the labels as last column."""
    return pd.concat([features[selected_features], labels], axis=1)


# Save selected feature datasets
train_selected = _with_labels(X_train, y_train)
val_selected = _with_labels(X_val, y_val)
test_selected = _with_labels(X_test, y_test)

train_selected.to_csv('train_selected.csv', index=False)
val_selected.to_csv('val_selected.csv', index=False)
test_selected.to_csv('test_selected.csv', index=False)

#Feature selection - correlation

# In [None]:
from sklearn.feature_selection import f_classif

# This cell reuses train_df/val_df/test_df and y_train/y_val/y_test from
# the Lasso cell above.
feature_cols = train_df.columns[train_df.columns != 'speaker'].tolist()
X_train = train_df[feature_cols]

# Compute F-scores (ANOVA F-value of every feature against the labels).
f_scores = f_classif(X_train, y_train)[0]
f_score_dict = {name: score for name, score in zip(feature_cols, f_scores)}

# Find and remove correlated features (|r| > 0.7)
corr_matrix = X_train.corr().abs()
THRESHOLD = 0.7
features_to_remove = set()

# Keep only the strict upper triangle so each pair is visited once.
upper_tri = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

for col in upper_tri.columns:
    column_corr = upper_tri[col]
    # NaN entries (lower triangle and diagonal) compare False and drop out.
    strong = column_corr[column_corr > THRESHOLD]
    for idx in strong.index:
        # A pair is only resolved while both members are still kept.
        if idx in features_to_remove or col in features_to_remove:
            continue
        # Remove feature with lower F-score (ties remove the column feature).
        loser = col if f_score_dict[idx] >= f_score_dict[col] else idx
        features_to_remove.add(loser)

selected_features_corr = [f for f in feature_cols if f not in features_to_remove]
print(f"Correlation-based: {len(selected_features_corr)} features")

# Save reduced datasets
X_val = val_df[feature_cols]
X_test = test_df[feature_cols]


def _reduced(features, labels):
    """Return the surviving feature columns with the labels as last column."""
    return pd.concat([features[selected_features_corr], labels], axis=1)


train_reduced = _reduced(X_train, y_train)
val_reduced = _reduced(X_val, y_val)
test_reduced = _reduced(X_test, y_test)

train_reduced.to_csv('train_corr_reduced.csv', index=False)
val_reduced.to_csv('val_corr_reduced.csv', index=False)
test_reduced.to_csv('test_corr_reduced.csv', index=False)

# Correlation comparison plot: absolute correlations before and after.
corr_reduced = X_train[selected_features_corr].corr().abs()

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
left_ax, right_ax = axes[0], axes[1]

sns.heatmap(corr_matrix, cmap='RdBu_r', center=0, ax=left_ax)
left_ax.set_title(f'Before: {len(feature_cols)} Features')

sns.heatmap(corr_reduced, cmap='RdBu_r', center=0, ax=right_ax)
right_ax.set_title(f'After: {len(selected_features_corr)} Features')

plt.tight_layout()
plt.savefig('correlation_reduction_heatmaps.png', dpi=150)
plt.close()

#Model 1 - Training

# In [None]:
from sklearn.metrics import precision_score, recall_score
import pickle

train_df = pd.read_csv('train_selected.csv')
val_df = pd.read_csv('val_selected.csv')

feature_cols = [name for name in train_df if name != 'speaker']
X_train, y_train = train_df[feature_cols], train_df['speaker']
X_val, y_val = val_df[feature_cols], val_df['speaker']

# Hyperparameter tuning: one L2 logistic regression per C, scored on the
# validation split with macro-averaged precision and recall.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]
results = []

for C in C_values:
    model = LogisticRegression(C=C, penalty='l2', solver='lbfgs',
                               max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    results.append({
        'C': C,
        'precision': precision_score(y_val, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_val, y_pred, average='macro', zero_division=0),
        'model': model,
    })

# The winner maximises the mean of precision and recall; idxmax returns
# the first maximum, so ties go to the smaller C.
results_df = pd.DataFrame(results)
mean_score = (results_df['precision'] + results_df['recall']) / 2
best_idx = mean_score.idxmax()
best_entry = results[best_idx]
best_model = best_entry['model']
print(f"Best C: {best_entry['C']}")

# Save model
with open('logreg_best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Tuning plot
fig, ax = plt.subplots(figsize=(10, 6))
for column, line_style, legend_label in (('precision', 'b-o', 'Precision'),
                                         ('recall', 'r-s', 'Recall')):
    ax.semilogx(results_df['C'], results_df[column], line_style, label=legend_label)
ax.axvline(x=best_entry['C'], color='green', linestyle=':',
           label=f"Best C={best_entry['C']}")
ax.set_xlabel('C')
ax.set_ylabel('Score')
ax.set_title(f'Hyperparameter Tuning (L2, {len(feature_cols)} Lasso-selected features)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('logreg_tuning.png', dpi=150)
plt.close()


#Model 2 - Training

# In [None]:
train_df = pd.read_csv('train_corr_reduced.csv')
val_df = pd.read_csv('val_corr_reduced.csv')

feature_cols = [name for name in train_df if name != 'speaker']
X_train, y_train = train_df[feature_cols], train_df['speaker']
X_val, y_val = val_df[feature_cols], val_df['speaker']

# Same tuning procedure as Model 1, on the correlation-reduced features.
# C_values is the grid defined in the Model 1 cell.
results = []
for C in C_values:
    model = LogisticRegression(C=C, penalty='l2', solver='lbfgs',
                               max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    scores = {
        'precision': precision_score(y_val, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_val, y_pred, average='macro', zero_division=0),
    }
    results.append({'C': C, **scores, 'model': model})

# Pick the C with the highest mean of precision and recall (first maximum).
results_df = pd.DataFrame(results)
combined = (results_df['precision'] + results_df['recall']) / 2
best_idx = combined.idxmax()
best_C = results[best_idx]['C']
best_model_corr = results[best_idx]['model']
print(f"Best C: {best_C}")

with open('logreg_corr_best_model.pkl', 'wb') as f:
    pickle.dump(best_model_corr, f)

# Tuning plot
fig, ax = plt.subplots(figsize=(10, 6))
c_axis = results_df['C']
ax.semilogx(c_axis, results_df['precision'], 'b-o', label='Precision')
ax.semilogx(c_axis, results_df['recall'], 'r-s', label='Recall')
ax.axvline(x=best_C, color='green', linestyle=':', label=f"Best C={best_C}")
ax.set_xlabel('C')
ax.set_ylabel('Score')
ax.set_title(f'Hyperparameter Tuning (L2, {len(feature_cols)} Correlation-reduced features)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('logreg_corr_tuning.png', dpi=150)
plt.close()

#Learning Curves

# In [None]:

# Train/validation tables for both feature sets, as written by the two
# feature-selection cells.
train_lasso, val_lasso = map(
    pd.read_csv, ('train_selected.csv', 'val_selected.csv'))
train_corr, val_corr = map(
    pd.read_csv, ('train_corr_reduced.csv', 'val_corr_reduced.csv'))

def compute_learning_curve(X_train, y_train, X_val, y_val, train_sizes, C=0.1):
    """Score an L2 logistic regression on growing prefixes of the training set.

    For each fraction in ``train_sizes`` the first ``int(fraction * n)``
    training rows are used to fit a model, which is then scored on that
    same subset and on the full validation set.

    Returns:
        DataFrame with one row per fraction and the columns
        ``train_size`` (number of rows used), ``train_prec``,
        ``train_rec``, ``val_prec`` and ``val_rec`` (macro-averaged
        precision / recall).
    """
    columns = ['train_size', 'train_prec', 'train_rec', 'val_prec', 'val_rec']
    total_rows = len(X_train)

    def _macro(metric, truth, predicted):
        """Evaluate ``metric`` macro-averaged, scoring undefined classes as 0."""
        return metric(truth, predicted, average='macro', zero_division=0)

    rows = []
    for fraction in train_sizes:
        row_count = int(fraction * total_rows)
        # Prefix subset: rows are taken in the order the caller supplies them.
        X_head = X_train.iloc[:row_count]
        y_head = y_train.iloc[:row_count]

        classifier = LogisticRegression(C=C, penalty='l2', solver='lbfgs',
                                        max_iter=1000, random_state=42)
        classifier.fit(X_head, y_head)

        head_pred = classifier.predict(X_head)
        val_pred = classifier.predict(X_val)

        rows.append((
            row_count,
            _macro(precision_score, y_head, head_pred),
            _macro(recall_score, y_head, head_pred),
            _macro(precision_score, y_val, val_pred),
            _macro(recall_score, y_val, val_pred),
        ))

    return pd.DataFrame(rows, columns=columns)

train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]

feature_cols_lasso = [c for c in train_lasso.columns if c != 'speaker']
feature_cols_corr = [c for c in train_corr.columns if c != 'speaker']

lasso_curves = compute_learning_curve(
    train_lasso[feature_cols_lasso], train_lasso['speaker'],
    val_lasso[feature_cols_lasso], val_lasso['speaker'], train_sizes)

corr_curves = compute_learning_curve(
    train_corr[feature_cols_corr], train_corr['speaker'],
    val_corr[feature_cols_corr], val_corr['speaker'], train_sizes)

# Plot: one row per model, one column per metric.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

panel_rows = [
    ('Lasso Model', lasso_curves, len(feature_cols_lasso)),
    ('Correlation Model', corr_curves, len(feature_cols_corr)),
]
panel_cols = [
    ('train_prec', 'val_prec', 'Precision'),
    ('train_rec', 'val_rec', 'Recall'),
]

for row, (model_label, curves, feature_count) in enumerate(panel_rows):
    for col, (train_key, val_key, metric_label) in enumerate(panel_cols):
        panel = axes[row, col]
        panel.plot(curves['train_size'], curves[train_key], 'b-o', label='Train')
        panel.plot(curves['train_size'], curves[val_key], 'r-s', label='Validation')
        panel.set_title(f'{model_label} ({feature_count} features) - {metric_label}')
        panel.legend()
        panel.grid(True, alpha=0.3)

plt.suptitle('Learning Curves (C=0.1, L2 Regularization)')
plt.tight_layout()
plt.savefig('learning_curves.png', dpi=150)
plt.close()

#Evaluations

# In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize
from matplotlib.patches import Circle

# Load test data and models
test_lasso = pd.read_csv('test_selected.csv')
test_corr = pd.read_csv('test_corr_reduced.csv')

feature_cols_lasso = [c for c in test_lasso.columns if c != 'speaker']
feature_cols_corr = [c for c in test_corr.columns if c != 'speaker']

X_test_lasso = test_lasso[feature_cols_lasso]
X_test_corr = test_corr[feature_cols_corr]
# Both test files are written from the same test split in the same row
# order, so a single label vector serves both models.
y_test = test_lasso['speaker']

# NOTE: pickle.load can execute arbitrary code; only load model files
# produced by the training cells of this pipeline.
with open('logreg_best_model.pkl', 'rb') as f:
    model_lasso = pickle.load(f)
with open('logreg_corr_best_model.pkl', 'rb') as f:
    model_corr = pickle.load(f)

# Predictions
y_pred_lasso = model_lasso.predict(X_test_lasso)
y_pred_corr = model_corr.predict(X_test_corr)
y_prob_lasso = model_lasso.predict_proba(X_test_lasso)
y_prob_corr = model_corr.predict_proba(X_test_corr)

classes = sorted(y_test.unique())

# Overall metrics (macro-averaged; zero_division=0 matches the setting
# used during tuning, so a never-predicted class scores 0).
print("Lasso Model:")
print(f"  Precision: {precision_score(y_test, y_pred_lasso, average='macro', zero_division=0):.4f}")
print(f"  Recall: {recall_score(y_test, y_pred_lasso, average='macro', zero_division=0):.4f}")
print("\nCorrelation Model:")
print(f"  Precision: {precision_score(y_test, y_pred_corr, average='macro', zero_division=0):.4f}")
print(f"  Recall: {recall_score(y_test, y_pred_corr, average='macro', zero_division=0):.4f}")

# Confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# The brothers' positions in `classes` are the same for both panels.
murad_idx, teymur_idx = classes.index('murad'), classes.index('teymur')

# Feature counts in the titles come from the loaded data rather than
# being hard-coded, because the number of selected features depends on
# the data the selection cells were run on.
panels = [
    (axes[0], y_pred_lasso, f"Lasso Model ({len(feature_cols_lasso)} features)"),
    (axes[1], y_pred_corr, f"Correlation Model ({len(feature_cols_corr)} features)"),
]

for ax, y_pred, title in panels:
    cm = confusion_matrix(y_test, y_pred, labels=classes)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)

    # Highlight brothers: outline each brother's full row and full column.
    for idx in [murad_idx, teymur_idx]:
        ax.add_patch(plt.Rectangle((0, idx), len(classes), 1, fill=False, edgecolor='red', linewidth=2))
        ax.add_patch(plt.Rectangle((idx, 0), 1, len(classes), fill=False, edgecolor='red', linewidth=2))

plt.suptitle('Confusion Matrices (Red boxes = Brother classes)')
plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=150)
plt.close()

# Brother analysis
def analyze_brothers(y_true, y_pred, name):
    """Print how often the two brothers were recognised or mistaken for each other.

    Args:
        y_true: True speaker labels; compared element-wise with the
            strings 'murad' and 'teymur'.
        y_pred: Predicted labels, indexable by the boolean masks built
            from ``y_true``.
        name: Heading printed above the report.
    """
    def _tally(actual, predicted):
        """Count samples of speaker ``actual`` that were predicted as ``predicted``."""
        return (y_pred[y_true == actual] == predicted).sum()

    murad_total = (y_true == 'murad').sum()
    teymur_total = (y_true == 'teymur').sum()

    murad_correct = _tally('murad', 'murad')
    murad_as_teymur = _tally('murad', 'teymur')
    teymur_correct = _tally('teymur', 'teymur')
    teymur_as_murad = _tally('teymur', 'murad')

    report = [
        f"\n{name}:",
        f"  Murad: {murad_correct}/{murad_total} correct, {murad_as_teymur} as Teymur",
        f"  Teymur: {teymur_correct}/{teymur_total} correct, {teymur_as_murad} as Murad",
        f"  Brother confusion: {murad_as_teymur + teymur_as_murad}",
    ]
    print("\n".join(report))

analyze_brothers(y_test, y_pred_lasso, "Lasso Model")
analyze_brothers(y_test, y_pred_corr, "Correlation Model")

# ROC curves (one-vs-rest, one curve per class present in the test set)
y_test_bin = label_binarize(y_test, classes=classes)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, model, y_prob, title in [(axes[0], model_lasso, y_prob_lasso, "Lasso Model"),
                                  (axes[1], model_corr, y_prob_corr, "Correlation Model")]:
    # predict_proba columns follow model.classes_, which need not match
    # the classes found in the test set position by position, so each
    # probability column is looked up by label.
    prob_column = {label: column for column, label in enumerate(model.classes_)}

    for i, class_name in enumerate(classes):
        if class_name not in prob_column:
            # The model never saw this class, so it has no probability
            # column to draw a curve from.
            continue
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, prob_column[class_name]])
        roc_auc = auc(fpr, tpr)
        is_brother = class_name in ['murad', 'teymur']
        # Brother classes are drawn thicker and starred in the legend.
        lw = 3 if is_brother else 1
        label = f'{class_name} (AUC={roc_auc:.2f})' + (' ★' if is_brother else '')
        ax.plot(fpr, tpr, linewidth=lw, label=label)

    ax.plot([0, 1], [0, 1], 'k--', label='Random')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc='lower right', fontsize=8)
    ax.grid(True, alpha=0.3)

plt.suptitle('ROC Curves (★ = Brother classes)')
plt.tight_layout()
plt.savefig('roc_curves.png', dpi=150)
plt.close()

# Error analysis: positional indices of the misclassified test samples.
lasso_errors = set(np.where(y_pred_lasso != y_test.values)[0])
corr_errors = set(np.where(y_pred_corr != y_test.values)[0])

print(f"\nLasso errors: {len(lasso_errors)}")
print(f"Correlation errors: {len(corr_errors)}")

# Sets have no defined iteration order; sort so the listing is stable
# between runs.
for idx in sorted(corr_errors):
    print(f"  Sample {idx}: {y_test.iloc[idx]} → {y_pred_corr[idx]}")

# Venn diagram: two overlapping circles drawn by hand, labelled with the
# number of errors made by only one model or by both.
only_lasso = lasso_errors - corr_errors
only_corr = corr_errors - lasso_errors
both = lasso_errors & corr_errors
correct_both = len(y_test) - len(lasso_errors | corr_errors)

fig, ax = plt.subplots(figsize=(8, 8))
circle1 = Circle((0.35, 0.5), 0.3, alpha=0.5, color='blue')
circle2 = Circle((0.65, 0.5), 0.3, alpha=0.5, color='red')
ax.add_patch(circle1)
ax.add_patch(circle2)

# Counts: left = Lasso only, centre = both, right = correlation only.
ax.text(0.2, 0.5, str(len(only_lasso)), fontsize=24, ha='center', va='center', fontweight='bold')
ax.text(0.5, 0.5, str(len(both)), fontsize=24, ha='center', va='center', fontweight='bold')
ax.text(0.8, 0.5, str(len(only_corr)), fontsize=24, ha='center', va='center', fontweight='bold')
ax.text(0.2, 0.15, 'Lasso only', fontsize=12, ha='center', color='blue')
ax.text(0.8, 0.15, 'Correlation only', fontsize=12, ha='center', color='red')
ax.text(0.5, 0.85, 'Both models', fontsize=12, ha='center')

ax.text(0.5, 0.05, f'Correctly classified by both: {correct_both}/{len(y_test)}', fontsize=12, ha='center')

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_aspect('equal')
ax.axis('off')
ax.set_title('Venn Diagram of Classification Errors')
plt.savefig('error_venn_diagram.png', dpi=150)
plt.close()

print("\nDone! All plots saved.")