# Generate Misclassified Samples for Streamlit App

This notebook identifies misclassified samples from the training data and calculates the average distance between correctly classified points. The results will be used in the Streamlit app to show users how close they are to problematic outliers.

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


## 2. Load Model and Training Data

In [2]:
# Load the trained model components
print("📂 Loading model components...")
with open('xgboost_personality_model.pkl', 'rb') as f:
    components = pickle.load(f)

model = components['model']
target_encoder = components['target_encoder']
label_encoders = components['label_encoders']
feature_columns = components['feature_columns']
categorical_columns = components['categorical_columns']
numerical_columns = components['numerical_columns']
feature_stats = components['feature_stats']
accuracy = components['accuracy']

print(f"✅ Model loaded with accuracy: {accuracy:.4f}")
print(f"Feature columns: {feature_columns}")
print(f"Categorical columns: {categorical_columns}")
print(f"Numerical columns: {numerical_columns}")

📂 Loading model components...
✅ Model loaded with accuracy: 0.9714
Feature columns: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']
Categorical columns: ['Stage_fear', 'Drained_after_socializing']
Numerical columns: ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']


In [19]:
# Load training data
print("\n📂 Loading training data...")
train_df = pd.read_csv('train_imputed_xgb.csv')
print(f"✅ Training data loaded: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")
print(f"\nPersonality distribution:")
print(train_df['Personality'].value_counts())


📂 Loading training data...
✅ Training data loaded: (18524, 25)
Columns: ['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality', 'xgb_Friends_circle_size_indicator', 'xgb_Friends_circle_size_oof', 'xgb_Social_event_attendance_indicator', 'xgb_Social_event_attendance_oof', 'xgb_Drained_after_socializing_indicator', 'xgb_Drained_after_socializing_oof', 'xgb_Time_spent_Alone_indicator', 'xgb_Time_spent_Alone_oof', 'xgb_Post_frequency_indicator', 'xgb_Post_frequency_oof', 'xgb_Going_outside_indicator', 'xgb_Going_outside_oof', 'xgb_Stage_fear_indicator', 'xgb_Stage_fear_oof', 'xgb_Personality_indicator', 'xgb_Personality_oof']

Personality distribution:
Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64


## 3. Prepare Features and Identify Misclassified Samples

In [20]:
# Prepare training features
print("🔧 Preparing features...")
X_train = train_df[feature_columns].copy()
y_train = train_df['Personality'].copy()

print(f"Original feature types:")
for col in feature_columns:
    print(f"  {col}: {X_train[col].dtype} - Sample values: {X_train[col].unique()[:5]}")

🔧 Preparing features...
Original feature types:
  Time_spent_Alone: float64 - Sample values: [0. 1. 6. 3. 2.]
  Stage_fear: object - Sample values: ['No' 'Yes']
  Social_event_attendance: float64 - Sample values: [6. 7. 1. 4. 8.]
  Going_outside: float64 - Sample values: [4.00000000e+00 3.00000000e+00 0.00000000e+00 5.00000000e+00
 4.35828697e-04]
  Drained_after_socializing: object - Sample values: ['No' 'Yes']
  Friends_circle_size: float64 - Sample values: [15. 10.  3. 11. 13.]
  Post_frequency: float64 - Sample values: [5.         8.         0.         0.02070745 3.        ]


In [21]:
# Encode categorical variables
print("\n🔧 Encoding categorical variables...")
X_train_encoded = X_train.copy()

for col in categorical_columns:
    if col in X_train_encoded.columns and col in label_encoders:
        le = label_encoders[col]
        print(f"Encoding {col}: {X_train_encoded[col].unique()} -> ", end="")
        X_train_encoded[col] = le.transform(X_train_encoded[col].astype(str))
        print(f"{X_train_encoded[col].unique()}")

print(f"\n✅ Features encoded. Shape: {X_train_encoded.shape}")


🔧 Encoding categorical variables...
Encoding Stage_fear: ['No' 'Yes'] -> [0 1]
Encoding Drained_after_socializing: ['No' 'Yes'] -> [0 1]

✅ Features encoded. Shape: (18524, 7)


In [22]:
# Encode target variable
print("\n🔧 Encoding target variable...")
y_train_encoded = target_encoder.transform(y_train)
print(f"Target classes: {target_encoder.classes_}")
print(f"Encoded target distribution: {np.bincount(y_train_encoded)}")


🔧 Encoding target variable...
Target classes: ['Extrovert' 'Introvert']
Encoded target distribution: [13699  4825]


In [23]:
# Make predictions on training data
print("\n🤖 Making predictions on training data...")
y_pred = model.predict(X_train_encoded)
y_pred_proba = model.predict_proba(X_train_encoded)

print(f"Predictions made. Shape: {y_pred.shape}")
print(f"Prediction distribution: {np.bincount(y_pred)}")


🤖 Making predictions on training data...
Predictions made. Shape: (18524,)
Prediction distribution: [13790  4734]


In [24]:
# Identify misclassified and correctly classified samples
print("\n🎯 Identifying misclassified samples...")
misclassified_mask = y_train_encoded != y_pred
correctly_classified_mask = y_train_encoded == y_pred

num_misclassified = np.sum(misclassified_mask)
num_correctly_classified = np.sum(correctly_classified_mask)
total_samples = len(y_train_encoded)

print(f"📊 Classification Results:")
print(f"  Total samples: {total_samples}")
print(f"  Correctly classified: {num_correctly_classified} ({num_correctly_classified/total_samples*100:.1f}%)")
print(f"  Misclassified: {num_misclassified} ({num_misclassified/total_samples*100:.1f}%)")
print(f"  Accuracy: {num_correctly_classified/total_samples:.4f}")


🎯 Identifying misclassified samples...
📊 Classification Results:
  Total samples: 18524
  Correctly classified: 17933 (96.8%)
  Misclassified: 591 (3.2%)
  Accuracy: 0.9681


## 4. Extract Misclassified Samples

In [25]:
# Get misclassified samples with additional information
print("\n📊 Extracting misclassified samples...")

misclassified_data = []
misclassified_indices = np.where(misclassified_mask)[0]

for idx in misclassified_indices:
    try:
        # Check if the sample has valid data
        sample_features = X_train_encoded.iloc[idx][feature_columns]
        if sample_features.isnull().any():
            print(f"⚠️ Skipping sample {idx} due to missing values")
            continue
            
        sample_info = {
            'original_index': train_df.iloc[idx]['id'] if 'id' in train_df.columns else idx,
            'true_label': y_train.iloc[idx],
            'predicted_label': target_encoder.inverse_transform([y_pred[idx]])[0],
            'confidence': np.max(y_pred_proba[idx]),
        }
        
        # Add all features
        for feature in feature_columns:
            feature_value = X_train_encoded.iloc[idx][feature]
            # Handle potential NaN values
            if pd.isna(feature_value):
                print(f"⚠️ NaN value found in feature {feature} for sample {idx}")
                feature_value = 0  # or use median/mode
            sample_info[feature] = feature_value
        
        misclassified_data.append(sample_info)
        
    except Exception as e:
        print(f"❌ Error processing sample {idx}: {e}")
        continue

misclassified_df = pd.DataFrame(misclassified_data)
print(f"✅ Misclassified samples extracted: {len(misclassified_df)} samples")

if len(misclassified_df) > 0:
    print(f"\nSample of misclassified data:")
    print(misclassified_df.head())
    
    # Check for any remaining NaN values
    nan_check = misclassified_df.isnull().sum()
    if nan_check.any():
        print(f"\n⚠️ NaN values in misclassified data:")
        print(nan_check[nan_check > 0])
    else:
        print("\n✅ No NaN values in misclassified data")
else:
    print("❌ No valid misclassified samples found!")


📊 Extracting misclassified samples...
✅ Misclassified samples extracted: 591 samples

Sample of misclassified data:
   original_index true_label predicted_label  confidence  Time_spent_Alone  \
0              63  Extrovert       Introvert    0.821004               8.0   
1             102  Extrovert       Introvert    0.901223               6.0   
2             124  Introvert       Extrovert    0.890060               4.0   
3             139  Extrovert       Introvert    0.818464              11.0   
4             149  Introvert       Extrovert    0.892519               0.0   

   Stage_fear  Social_event_attendance  Going_outside  \
0         1.0                      4.0            2.0   
1         1.0                      2.0            0.0   
2         0.0                      7.0            6.0   
3         1.0                      2.0            2.0   
4         0.0                      4.0            6.0   

   Drained_after_socializing  Friends_circle_size  Post_frequency  
0  

In [26]:
# Analyze misclassification patterns
print("\n🔍 Analyzing misclassification patterns...")

print("Misclassification by true label:")
print(misclassified_df['true_label'].value_counts())

print("\nMisclassification by predicted label:")
print(misclassified_df['predicted_label'].value_counts())

print("\nConfidence distribution of misclassified samples:")
print(f"Mean confidence: {misclassified_df['confidence'].mean():.3f}")
print(f"Median confidence: {misclassified_df['confidence'].median():.3f}")
print(f"Min confidence: {misclassified_df['confidence'].min():.3f}")
print(f"Max confidence: {misclassified_df['confidence'].max():.3f}")


🔍 Analyzing misclassification patterns...
Misclassification by true label:
true_label
Introvert    341
Extrovert    250
Name: count, dtype: int64

Misclassification by predicted label:
predicted_label
Extrovert    341
Introvert    250
Name: count, dtype: int64

Confidence distribution of misclassified samples:
Mean confidence: 0.875
Median confidence: 0.930
Min confidence: 0.502
Max confidence: 0.998


## 5. Calculate Average Distance Between Correctly Classified Points

In [27]:
# Inspect the encoded data for any issues
print("\n🔍 Inspecting encoded data quality...")
print(f"Encoded data shape: {X_train_encoded.shape}")
print(f"Data types:\n{X_train_encoded.dtypes}")
print(f"\nData summary:")
print(X_train_encoded.describe())

# Check for missing values
missing_values = X_train_encoded.isnull().sum()
if missing_values.any():
    print(f"\n⚠️ Missing values found:")
    print(missing_values[missing_values > 0])
else:
    print("\n✅ No missing values found")

# Check for infinite values
infinite_values = np.isinf(X_train_encoded.select_dtypes(include=[np.number])).sum()
if infinite_values.any():
    print(f"\n⚠️ Infinite values found:")
    print(infinite_values[infinite_values > 0])
else:
    print("\n✅ No infinite values found")

# Check data ranges
print(f"\nData ranges:")
for col in X_train_encoded.columns:
    if X_train_encoded[col].dtype in ['int64', 'float64']:
        print(f"  {col}: [{X_train_encoded[col].min():.2f}, {X_train_encoded[col].max():.2f}]")
    else:
        print(f"  {col}: {X_train_encoded[col].unique()}")


🔍 Inspecting encoded data quality...
Encoded data shape: (18524, 7)
Data types:
Time_spent_Alone             float64
Stage_fear                     int32
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing      int32
Friends_circle_size          float64
Post_frequency               float64
dtype: object

Data summary:
       Time_spent_Alone    Stage_fear  Social_event_attendance  Going_outside  \
count      18524.000000  18524.000000             18524.000000   18524.000000   
mean           2.936211      0.257018                 4.931425       3.724326   
std            3.005802      0.437001                 2.955463       2.260321   
min           -0.001171      0.000000                -0.007918      -0.000364   
25%            1.000000      0.000000                 3.000000       2.000000   
50%            2.000000      0.000000                 5.000000       4.000000   
75%            4.000000      1.000000                 7.000000  

In [28]:
# Calculate average distance between correctly classified points
print("\n📏 Calculating average distance between correctly classified points...")

correctly_classified_samples = X_train_encoded[correctly_classified_mask].values
print(f"Number of correctly classified samples: {len(correctly_classified_samples)}")

# Check for NaN or infinite values
print(f"Checking for data quality issues...")
print(f"  NaN values: {np.isnan(correctly_classified_samples).sum()}")
print(f"  Infinite values: {np.isinf(correctly_classified_samples).sum()}")

# Clean the data - remove any rows with NaN or infinite values
if np.isnan(correctly_classified_samples).any() or np.isinf(correctly_classified_samples).any():
    print("⚠️ Found NaN or infinite values, cleaning data...")
    valid_mask = ~np.isnan(correctly_classified_samples).any(axis=1) & ~np.isinf(correctly_classified_samples).any(axis=1)
    correctly_classified_samples = correctly_classified_samples[valid_mask]
    print(f"After cleaning: {len(correctly_classified_samples)} samples remaining")

if len(correctly_classified_samples) > 1:
    # Sample a subset for efficiency if too many samples
    if len(correctly_classified_samples) > 1000:
        print(f"Sampling 1000 points from {len(correctly_classified_samples)} for efficiency...")
        np.random.seed(42)  # For reproducibility
        sample_indices = np.random.choice(len(correctly_classified_samples), 1000, replace=False)
        sample_points = correctly_classified_samples[sample_indices]
    else:
        sample_points = correctly_classified_samples
    
    print(f"Calculating pairwise distances for {len(sample_points)} points...")
    
    # Double-check for any remaining NaN values
    if np.isnan(sample_points).any() or np.isinf(sample_points).any():
        print("❌ Still have NaN/infinite values after cleaning")
        print("Sample point statistics:")
        print(f"  Shape: {sample_points.shape}")
        print(f"  Min: {np.nanmin(sample_points)}")
        print(f"  Max: {np.nanmax(sample_points)}")
        print(f"  NaN count: {np.isnan(sample_points).sum()}")
        print(f"  Inf count: {np.isinf(sample_points).sum()}")
        avg_correct_distance = 0
    else:
        try:
            # Calculate pairwise distances
            distances = euclidean_distances(sample_points)
            
            # Get upper triangle (excluding diagonal) to avoid counting same pair twice
            upper_triangle = np.triu(distances, k=1)
            non_zero_distances = upper_triangle[upper_triangle > 0]
            
            avg_correct_distance = np.mean(non_zero_distances)
            median_correct_distance = np.median(non_zero_distances)
            std_correct_distance = np.std(non_zero_distances)
            
            print(f"\n📊 Distance Statistics for Correctly Classified Points:")
            print(f"  Average distance: {avg_correct_distance:.4f}")
            print(f"  Median distance: {median_correct_distance:.4f}")
            print(f"  Standard deviation: {std_correct_distance:.4f}")
            print(f"  Number of distance pairs: {len(non_zero_distances)}")
            
        except Exception as e:
            print(f"❌ Error calculating distances: {e}")
            avg_correct_distance = 0
    
else:
    avg_correct_distance = 0
    print("⚠️ Not enough correctly classified samples to calculate distances")


📏 Calculating average distance between correctly classified points...
Number of correctly classified samples: 17933
Checking for data quality issues...
  NaN values: 0
  Infinite values: 0
Sampling 1000 points from 17933 for efficiency...
Calculating pairwise distances for 1000 points...

📊 Distance Statistics for Correctly Classified Points:
  Average distance: 9.4807
  Median distance: 9.0339
  Standard deviation: 4.0260
  Number of distance pairs: 499499


## 6. Save Results

In [29]:
# Save misclassified samples to CSV
misclassified_filename = 'misclassified_samples.csv'
misclassified_df.to_csv(misclassified_filename, index=False)
print(f"✅ Misclassified samples saved to '{misclassified_filename}'")
print(f"File contains {len(misclassified_df)} rows and {len(misclassified_df.columns)} columns")
print(f"Columns: {list(misclassified_df.columns)}")

✅ Misclassified samples saved to 'misclassified_samples.csv'
File contains 591 rows and 11 columns
Columns: ['original_index', 'true_label', 'predicted_label', 'confidence', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']


In [33]:
# Save only the feature values for distance calculations (no metadata)
misclassified_features_only = misclassified_df[feature_columns].copy()

# Clean the misclassified features data
print(f"\n🧹 Cleaning misclassified features data...")
print(f"Before cleaning:")
print(f"  NaN values: {misclassified_features_only.isnull().sum().sum()}")
print(f"  Infinite values: {np.isinf(misclassified_features_only.select_dtypes(include=[np.number])).sum().sum()}")

# Handle NaN values - fill with global median/mode from the entire training set
for col in feature_columns:
    if misclassified_features_only[col].isnull().any():
        if col in numerical_columns:
            # Use median from the entire training set
            global_median = X_train_encoded[col].median()
            if pd.isna(global_median):
                global_median = 0
            misclassified_features_only[col].fillna(global_median, inplace=True)
            print(f"  Filled {misclassified_features_only[col].isnull().sum()} NaN values in {col} with global median: {global_median}")
        else:
            # Use mode from the entire training set
            global_mode = X_train_encoded[col].mode()
            if len(global_mode) > 0:
                mode_val = global_mode[0]
            else:
                mode_val = 0
            misclassified_features_only[col].fillna(mode_val, inplace=True)
            print(f"  Filled {misclassified_features_only[col].isnull().sum()} NaN values in {col} with global mode: {mode_val}")

# Handle infinite values - replace with 0
numerical_cols = misclassified_features_only.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    inf_mask = np.isinf(misclassified_features_only[col])
    if inf_mask.any():
        misclassified_features_only.loc[inf_mask, col] = 0
        print(f"  Replaced {inf_mask.sum()} infinite values in {col} with 0")

# Final check and force cleanup of any remaining NaN/inf values
print(f"\nAfter initial cleaning:")
print(f"  NaN values: {misclassified_features_only.isnull().sum().sum()}")
print(f"  Infinite values: {np.isinf(misclassified_features_only.select_dtypes(include=[np.number])).sum().sum()}")

# Force fill any remaining NaN values with 0
if misclassified_features_only.isnull().sum().sum() > 0:
    print("  Final cleanup: filling any remaining NaN values with 0")
    misclassified_features_only.fillna(0, inplace=True)

# Force replace any remaining infinite values with 0
for col in numerical_cols:
    misclassified_features_only[col] = misclassified_features_only[col].replace([np.inf, -np.inf], 0)

print(f"\nFinal verification:")
print(f"  NaN values: {misclassified_features_only.isnull().sum().sum()}")
print(f"  Infinite values: {np.isinf(misclassified_features_only.select_dtypes(include=[np.number])).sum().sum()}")

# Save the cleaned features
features_filename = 'misclassified_features.csv'
misclassified_features_only.to_csv(features_filename, index=False)
print(f"✅ Cleaned misclassified features saved to '{features_filename}'")
print(f"File contains {len(misclassified_features_only)} rows and {len(misclassified_features_only.columns)} columns")
print(f"Feature columns: {list(misclassified_features_only.columns)}")


🧹 Cleaning misclassified features data...
Before cleaning:
  NaN values: 0
  Infinite values: 0

After initial cleaning:
  NaN values: 0
  Infinite values: 0

Final verification:
  NaN values: 0
  Infinite values: 0
✅ Cleaned misclassified features saved to 'misclassified_features.csv'
File contains 591 rows and 7 columns
Feature columns: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency']


In [31]:
# Calculate average distance between correctly classified points
print(f"\n📏 Calculating average distance between correctly classified points...")

# Get correctly classified samples
correctly_classified_mask = y_pred == y_train_encoded
correctly_classified_samples = X_train_encoded[correctly_classified_mask]

print(f"Number of correctly classified samples: {len(correctly_classified_samples)}")

# Clean the correctly classified features data
print(f"\n🧹 Cleaning correctly classified features data...")
correct_features_df = pd.DataFrame(correctly_classified_samples, columns=feature_columns)

print(f"Before cleaning:")
print(f"  NaN values: {correct_features_df.isnull().sum().sum()}")
print(f"  Infinite values: {np.isinf(correct_features_df.select_dtypes(include=[np.number])).sum().sum()}")

# Handle NaN values
for col in feature_columns:
    if correct_features_df[col].isnull().any():
        if col in numerical_columns:
            median_val = correct_features_df[col].median()
            if pd.isna(median_val):
                median_val = 0
            correct_features_df[col].fillna(median_val, inplace=True)
        else:
            mode_val = correct_features_df[col].mode()
            if len(mode_val) > 0:
                correct_features_df[col].fillna(mode_val[0], inplace=True)
            else:
                correct_features_df[col].fillna(0, inplace=True)

# Handle infinite values
numerical_cols = correct_features_df.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    inf_mask = np.isinf(correct_features_df[col])
    if inf_mask.any():
        correct_features_df.loc[inf_mask, col] = 0

print(f"After cleaning:")
print(f"  NaN values: {correct_features_df.isnull().sum().sum()}")
print(f"  Infinite values: {np.isinf(correct_features_df.select_dtypes(include=[np.number])).sum().sum()}")

# Convert back to numpy array for distance calculations
correctly_classified_samples = correct_features_df.values

# Calculate pairwise distances (sample a subset if too large)
max_samples = 1000  # Limit for computational efficiency
if len(correctly_classified_samples) > max_samples:
    print(f"Sampling {max_samples} correctly classified points for distance calculation...")
    sample_indices = np.random.choice(len(correctly_classified_samples), max_samples, replace=False)
    sample_points = correctly_classified_samples[sample_indices]
else:
    sample_points = correctly_classified_samples

print(f"Calculating distances for {len(sample_points)} samples...")

# Calculate distances safely using only valid pairs
def safe_euclidean_distance_matrix(X):
    """Calculate pairwise euclidean distances with NaN/inf protection"""
    n = len(X)
    distances = []
    
    for i in range(n):
        for j in range(i+1, n):
            try:
                # Check for any remaining NaN or inf values
                point1, point2 = X[i], X[j]
                
                if np.any(np.isnan(point1)) or np.any(np.isnan(point2)) or \
                   np.any(np.isinf(point1)) or np.any(np.isinf(point2)):
                    continue  # Skip this pair
                
                dist = np.sqrt(np.sum((point1 - point2) ** 2))
                
                if np.isfinite(dist) and dist > 0:  # Only add valid, positive distances
                    distances.append(dist)
                    
            except Exception as e:
                continue  # Skip problematic pairs
    
    return np.array(distances)

distances = safe_euclidean_distance_matrix(sample_points)

if len(distances) > 0:
    avg_correct_distance = np.mean(distances)
    median_correct_distance = np.median(distances)
    std_correct_distance = np.std(distances)
    
    print(f"\n✅ BASELINE_CORRECT_DISTANCE = {avg_correct_distance:.6f}")
    print(f"📊 Additional statistics:")
    print(f"  Median distance: {median_correct_distance:.6f}")
    print(f"  Standard deviation: {std_correct_distance:.6f}")
    print(f"  Calculated from {len(distances)} valid distance pairs")
    print(f"  Distance range: [{np.min(distances):.6f}, {np.max(distances):.6f}]")
else:
    print("❌ Could not calculate baseline distance - no valid distance pairs found")
    avg_correct_distance = 1.0  # Fallback value

# Save the average distance as a constant for the Streamlit app
print(f"\n📝 Average distance between correctly classified points: {avg_correct_distance:.6f}")
print(f"\n💡 Use this value in your Streamlit app as the baseline distance.")
print(f"\n📋 Summary for Streamlit Integration:")
print(f"  - CSV file: '{features_filename}'")
print(f"  - Average baseline distance: {avg_correct_distance:.6f}")
print(f"  - Number of misclassified samples: {len(misclassified_df)}")
print(f"  - Features to use for distance calculation: {feature_columns}")


📏 Calculating average distance between correctly classified points...
Number of correctly classified samples: 17933

🧹 Cleaning correctly classified features data...
Before cleaning:
  NaN values: 0
  Infinite values: 0
After cleaning:
  NaN values: 0
  Infinite values: 0
Sampling 1000 correctly classified points for distance calculation...
Calculating distances for 1000 samples...

✅ BASELINE_CORRECT_DISTANCE = 9.341171
📊 Additional statistics:
  Median distance: 8.887026
  Standard deviation: 3.930843
  Calculated from 499496 valid distance pairs
  Distance range: [0.000110, 22.758952]

📝 Average distance between correctly classified points: 9.341171

💡 Use this value in your Streamlit app as the baseline distance.

📋 Summary for Streamlit Integration:
  - CSV file: 'misclassified_features.csv'
  - Average baseline distance: 9.341171
  - Number of misclassified samples: 591
  - Features to use for distance calculation: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'G

In [None]:
# Calculate class-specific statistics for Streamlit
print("\n📊 Calculating class-specific statistics...")

# Overall statistics
overall_stats = {
    'total_samples': total_samples,
    'correctly_classified': num_correctly_classified,
    'misclassified': num_misclassified,
    'overall_accuracy': num_correctly_classified / total_samples
}

print(f"\n📈 Overall Statistics:")
print(f"  Total samples: {overall_stats['total_samples']}")
print(f"  Correctly classified: {overall_stats['correctly_classified']} ({overall_stats['correctly_classified']/overall_stats['total_samples']*100:.1f}%)")
print(f"  Misclassified: {overall_stats['misclassified']} ({overall_stats['misclassified']/overall_stats['total_samples']*100:.1f}%)")
print(f"  Overall accuracy: {overall_stats['overall_accuracy']:.4f}")

# Class-specific statistics
class_stats = {}
for class_name in target_encoder.classes_:
    class_encoded = target_encoder.transform([class_name])[0]
    
    # Samples where true label is this class
    true_class_mask = y_train_encoded == class_encoded
    true_class_total = np.sum(true_class_mask)
    
    # Correctly classified samples of this class
    true_class_correct = np.sum(true_class_mask & correctly_classified_mask)
    
    # Misclassified samples of this class
    true_class_misclassified = np.sum(true_class_mask & misclassified_mask)
    
    # Class-specific accuracy
    class_accuracy = true_class_correct / true_class_total if true_class_total > 0 else 0
    
    class_stats[class_name] = {
        'total_samples': int(true_class_total),
        'correctly_classified': int(true_class_correct),
        'misclassified': int(true_class_misclassified),
        'class_accuracy': float(class_accuracy)
    }
    
    print(f"\n📊 {class_name} Statistics:")
    print(f"  Total {class_name} samples: {true_class_total}")
    print(f"  Correctly classified: {true_class_correct} ({true_class_correct/true_class_total*100:.1f}%)")
    print(f"  Misclassified: {true_class_misclassified} ({true_class_misclassified/true_class_total*100:.1f}%)")
    print(f"  Class accuracy: {class_accuracy:.4f}")

# Save statistics for Streamlit
import json

statistics = {
    'overall': overall_stats,
    'by_class': class_stats,
    'baseline_distance': float(avg_correct_distance)
}

stats_filename = 'model_statistics.json'
with open(stats_filename, 'w') as f:
    json.dump(statistics, f, indent=2)

print(f"\n✅ Statistics saved to '{stats_filename}'")
print(f"\n📋 Statistics for Streamlit Integration:")
print(f"  - Overall accuracy: {overall_stats['overall_accuracy']:.4f}")
print(f"  - Baseline distance: {avg_correct_distance:.6f}")
print(f"  - Class-specific stats available for: {list(class_stats.keys())}")
print(f"  - Use model_statistics.json to load all stats in Streamlit")

## 7. Test Distance Calculation Function

In [32]:
# Test the distance calculation that will be used in Streamlit
def calculate_distance_to_misclassified(user_input_array, misclassified_features):
    """Calculate the distance from user input to nearest misclassified sample"""
    if len(misclassified_features) == 0:
        return None, None
    
    # Convert user input to array if it's not already
    user_array = np.array(user_input_array).reshape(1, -1)
    
    # Check for NaN or infinite values
    if np.isnan(user_array).any() or np.isinf(user_array).any():
        print("⚠️ User input contains NaN or infinite values")
        return None, None
        
    if np.isnan(misclassified_features).any() or np.isinf(misclassified_features).any():
        print("⚠️ Misclassified features contain NaN or infinite values")
        return None, None
    
    try:
        # Calculate distances to all misclassified samples
        distances = euclidean_distances(user_array, misclassified_features)
        
        # Find minimum distance and its index
        min_distance = np.min(distances)
        min_index = np.argmin(distances)
        
        return min_distance, min_index
        
    except Exception as e:
        print(f"❌ Error calculating distances: {e}")
        return None, None

# Test with a sample from the training data
print("\n🧪 Testing distance calculation function...")

if len(misclassified_df) > 0 and 'misclassified_features_only' in locals():
    # Use a clean sample for testing
    test_sample = X_train_encoded.iloc[0][feature_columns].values
    
    # Clean the test sample
    if np.isnan(test_sample).any() or np.isinf(test_sample).any():
        print("⚠️ Test sample contains NaN/infinite values, cleaning...")
        # Replace NaN and inf with 0 for testing
        test_sample = np.nan_to_num(test_sample, nan=0.0, posinf=0.0, neginf=0.0)
    
    # Use the cleaned misclassified features
    misclassified_features_clean = misclassified_features_only.values
    
    print(f"Test sample shape: {test_sample.shape}")
    print(f"Misclassified features shape: {misclassified_features_clean.shape}")
    print(f"Test sample has NaN: {np.isnan(test_sample).any()}")
    print(f"Test sample has inf: {np.isinf(test_sample).any()}")
    print(f"Misclassified features has NaN: {np.isnan(misclassified_features_clean).any()}")
    print(f"Misclassified features has inf: {np.isinf(misclassified_features_clean).any()}")
    
    min_dist, min_idx = calculate_distance_to_misclassified(test_sample, misclassified_features_clean)
    
    if min_dist is not None and avg_correct_distance > 0:
        print(f"Test sample distance to nearest misclassified: {min_dist:.4f}")
        print(f"Nearest misclassified sample index: {min_idx}")
        print(f"Baseline average distance: {avg_correct_distance:.4f}")
        print(f"Ratio (test_distance / baseline): {min_dist / avg_correct_distance:.2f}")
        
        if min_dist < avg_correct_distance:
            print("⚠️ This test sample is closer to misclassified samples than the average distance between correct samples!")
        else:
            print("✅ This test sample is in a 'safe' region away from misclassified samples.")
    else:
        print("❌ Could not calculate test distances")
        if avg_correct_distance == 0:
            print("   - Baseline distance is 0, check distance calculation")
        if min_dist is None:
            print("   - Distance calculation returned None")
else:
    print("❌ No misclassified samples available for testing")

# Additional test: Try loading the saved CSV file
try:
    print(f"\n🧪 Testing CSV file loading...")
    loaded_features = pd.read_csv(features_filename)
    print(f"Loaded CSV shape: {loaded_features.shape}")
    print(f"Loaded CSV has NaN: {loaded_features.isnull().any().any()}")
    print(f"Loaded CSV has inf: {np.isinf(loaded_features.select_dtypes(include=[np.number])).any().any()}")
    
    if not loaded_features.isnull().any().any() and not np.isinf(loaded_features.select_dtypes(include=[np.number])).any().any():
        print("✅ CSV file is clean and ready for Streamlit integration")
    else:
        print("⚠️ CSV file still has data quality issues")
        
except Exception as e:
    print(f"❌ Error loading CSV file: {e}")


🧪 Testing distance calculation function...
Test sample shape: (7,)
Misclassified features shape: (591, 7)
Test sample has NaN: False
Test sample has inf: False
Misclassified features has NaN: False
Misclassified features has inf: False
Test sample distance to nearest misclassified: 1.4142
Nearest misclassified sample index: 532
Baseline average distance: 9.3412
Ratio (test_distance / baseline): 0.15
⚠️ This test sample is closer to misclassified samples than the average distance between correct samples!

🧪 Testing CSV file loading...
Loaded CSV shape: (591, 7)
Loaded CSV has NaN: False
Loaded CSV has inf: False
✅ CSV file is clean and ready for Streamlit integration


## 8. Summary

**Files Created:**
1. `misclassified_samples.csv` - Full information about misclassified samples including metadata
2. `misclassified_features.csv` - Only the feature values for distance calculations
3. `enhanced_misclassified_samples.csv` - **NEW**: Features + metadata for showing users specific samples they're closest to
4. `model_statistics.json` - Overall and class-specific model performance statistics

**Key Values for Streamlit:**
- Average distance between correctly classified points: Use this as baseline
- Use the `misclassified_features.csv` file to calculate distances
- Use `enhanced_misclassified_samples.csv` to show users which specific sample they're closest to
- Use `model_statistics.json` for displaying overall and class-specific performance

**Model Performance Summary:**
- Overall accuracy: 96.8% (17,933 correct out of 18,524 total samples)
- Extrovert accuracy: 98.2% (13,449 correct out of 13,699 samples)
- Introvert accuracy: 92.9% (4,484 correct out of 4,825 samples)
- Total misclassified: 591 samples

**New Enhanced Features:**
- **Closest Misclassified Sample Display**: Users can see exactly which misclassified sample they are most similar to
- **Feature-by-Feature Comparison**: Shows how the user's input compares to the closest misclassified sample
- **Similarity Analysis**: Displays which features are most/least similar to problematic samples
- **Risk Assessment**: Provides detailed insights into why a prediction might be reliable or unreliable

**Integration Notes:**
- Load `misclassified_features.csv` in Streamlit for distance calculations
- Load `enhanced_misclassified_samples.csv` for detailed sample comparison
- Load `model_statistics.json` for displaying statistics
- Use the average distance value as a hardcoded constant
- When user makes a prediction:
  1. Calculate their distance to nearest misclassified sample
  2. Show which specific sample they're closest to
  3. Display feature-by-feature comparison
  4. Compare distance to baseline to determine risk level
- Display overall and class-specific statistics to give users context about model performance

In [34]:
# Test function to verify data integrity and CSV loading
def test_distance_calculation():
    """Test the distance calculation and CSV data integrity"""
    print("🧪 Testing data integrity and distance calculations...")
    
    try:
        # Test CSV loading
        print("\n1. Testing CSV loading...")
        test_df = pd.read_csv('misclassified_features.csv')
        print(f"   ✅ Successfully loaded CSV with {len(test_df)} rows and {len(test_df.columns)} columns")
        
        # Check for data quality issues
        print("\n2. Checking data quality...")
        nan_count = test_df.isnull().sum().sum()
        inf_count = np.isinf(test_df.select_dtypes(include=[np.number])).sum().sum()
        
        print(f"   NaN values in CSV: {nan_count}")
        print(f"   Infinite values in CSV: {inf_count}")
        
        if nan_count == 0 and inf_count == 0:
            print("   ✅ CSV data is clean")
        else:
            print("   ⚠️ CSV data has quality issues")
        
        # Test distance calculation with a sample
        print("\n3. Testing distance calculation...")
        if len(test_df) >= 2:
            test_sample = test_df.iloc[0].values
            misclassified_features_clean = test_df.values
            
            # Test distance calculation
            def calculate_distance_to_misclassified(user_features, misclassified_features):
                distances = []
                for i, mis_features in enumerate(misclassified_features):
                    try:
                        if np.any(np.isnan(user_features)) or np.any(np.isnan(mis_features)) or \
                           np.any(np.isinf(user_features)) or np.any(np.isinf(mis_features)):
                            continue
                        
                        dist = np.sqrt(np.sum((user_features - mis_features) ** 2))
                        if np.isfinite(dist):
                            distances.append(dist)
                    except:
                        continue
                
                return min(distances) if distances else float('inf')
            
            min_distance = calculate_distance_to_misclassified(test_sample, misclassified_features_clean)
            print(f"   Test distance calculation: {min_distance:.6f}")
            
            if np.isfinite(min_distance):
                print("   ✅ Distance calculation working correctly")
            else:
                print("   ❌ Distance calculation failed")
        
        print(f"\n4. Summary:")
        print(f"   Total misclassified samples: {len(test_df)}")
        print(f"   Feature columns: {len(test_df.columns)}")
        print(f"   Baseline correct distance: {avg_correct_distance:.6f}")
        print(f"   Data ready for Streamlit app: {'✅' if nan_count == 0 and inf_count == 0 else '❌'}")
        
    except Exception as e:
        print(f"❌ Test failed with error: {str(e)}")
        import traceback
        traceback.print_exc()

# Run the test
test_distance_calculation()

🧪 Testing data integrity and distance calculations...

1. Testing CSV loading...
   ✅ Successfully loaded CSV with 591 rows and 7 columns

2. Checking data quality...
   NaN values in CSV: 0
   Infinite values in CSV: 0
   ✅ CSV data is clean

3. Testing distance calculation...
   Test distance calculation: 0.000000
   ✅ Distance calculation working correctly

4. Summary:
   Total misclassified samples: 591
   Feature columns: 7
   Baseline correct distance: 9.341171
   Data ready for Streamlit app: ✅


## 📊 Model Statistics and Class-Specific Analysis

In this section, we'll compute and save overall and class-specific statistics about model performance, including misclassification rates by personality type.

In [35]:
# Calculate overall model statistics
import json

print("📊 Computing overall model statistics...")

# Total samples
total_samples = len(y_train)
correctly_classified_count = np.sum(correctly_classified_mask)
misclassified_count = np.sum(misclassified_mask)

# Overall accuracy (should match what we computed before)
overall_accuracy = correctly_classified_count / total_samples

print(f"Total training samples: {total_samples}")
print(f"Correctly classified: {correctly_classified_count}")
print(f"Misclassified: {misclassified_count}")
print(f"Overall accuracy: {overall_accuracy:.4f}")
print(f"Overall misclassification rate: {1 - overall_accuracy:.4f}")

# Verify our counts are correct
assert correctly_classified_count + misclassified_count == total_samples, "Sample counts don't add up!"
print("✅ Sample counts verified")

📊 Computing overall model statistics...
Total training samples: 18524
Correctly classified: 17933
Misclassified: 591
Overall accuracy: 0.9681
Overall misclassification rate: 0.0319
✅ Sample counts verified


In [36]:
# Calculate class-specific statistics
print("\n📈 Computing class-specific statistics...")

# Get unique classes and their labels
unique_classes = np.unique(y_train_encoded)
class_labels = target_encoder.inverse_transform(unique_classes)

print(f"Number of personality types: {len(unique_classes)}")
print(f"Personality types: {list(class_labels)}")

# Initialize dictionaries to store class-specific stats
class_stats = {}

for class_idx, class_label in zip(unique_classes, class_labels):
    # Get mask for samples of this class
    class_mask = (y_train_encoded == class_idx)
    
    # Count total samples for this class
    total_class_samples = np.sum(class_mask)
    
    # Count correctly classified samples for this class
    correct_class_samples = np.sum(class_mask & correctly_classified_mask)
    
    # Count misclassified samples for this class
    misclassified_class_samples = np.sum(class_mask & misclassified_mask)
    
    # Calculate class-specific accuracy
    class_accuracy = correct_class_samples / total_class_samples if total_class_samples > 0 else 0
    class_misclassification_rate = misclassified_class_samples / total_class_samples if total_class_samples > 0 else 0
    
    # Store in dictionary
    class_stats[class_label] = {
        'total_samples': int(total_class_samples),
        'correctly_classified': int(correct_class_samples),
        'misclassified': int(misclassified_class_samples),
        'accuracy': float(class_accuracy),
        'misclassification_rate': float(class_misclassification_rate)
    }
    
    print(f"\n{class_label}:")
    print(f"  Total samples: {total_class_samples}")
    print(f"  Correctly classified: {correct_class_samples}")
    print(f"  Misclassified: {misclassified_class_samples}")
    print(f"  Accuracy: {class_accuracy:.4f}")
    print(f"  Misclassification rate: {class_misclassification_rate:.4f}")
    
    # Verify counts add up
    assert correct_class_samples + misclassified_class_samples == total_class_samples, f"Counts don't add up for {class_label}!"

print("\n✅ All class-specific statistics computed and verified")


📈 Computing class-specific statistics...
Number of personality types: 2
Personality types: ['Extrovert', 'Introvert']

Extrovert:
  Total samples: 13699
  Correctly classified: 13449
  Misclassified: 250
  Accuracy: 0.9818
  Misclassification rate: 0.0182

Introvert:
  Total samples: 4825
  Correctly classified: 4484
  Misclassified: 341
  Accuracy: 0.9293
  Misclassification rate: 0.0707

✅ All class-specific statistics computed and verified


In [37]:
# Create comprehensive statistics dictionary
print("\n💾 Preparing statistics for export...")

model_statistics = {
    'overall': {
        'total_samples': int(total_samples),
        'correctly_classified': int(correctly_classified_count),
        'misclassified': int(misclassified_count),
        'accuracy': float(overall_accuracy),
        'misclassification_rate': float(1 - overall_accuracy)
    },
    'by_class': class_stats,
    'distance_metrics': {
        'average_correct_distance': float(avg_correct_distance),
        'median_correct_distance': float(median_correct_distance),
        'std_correct_distance': float(std_correct_distance)
    },
    'metadata': {
        'num_classes': len(unique_classes),
        'class_labels': list(class_labels),
        'feature_columns': feature_columns,
        'total_misclassified_samples_saved': len(misclassified_df)
    }
}

# Save to JSON file
stats_filename = 'model_statistics.json'
with open(stats_filename, 'w') as f:
    json.dump(model_statistics, f, indent=2)

print(f"✅ Model statistics saved to {stats_filename}")
print(f"   - Overall accuracy: {overall_accuracy:.4f}")
print(f"   - Total classes: {len(unique_classes)}")
print(f"   - Baseline distance: {avg_correct_distance:.6f}")
print(f"   - File size: {len(json.dumps(model_statistics))} characters")


💾 Preparing statistics for export...
✅ Model statistics saved to model_statistics.json
   - Overall accuracy: 0.9681
   - Total classes: 2
   - Baseline distance: 9.341171
   - File size: 969 characters


## 📋 Enhanced Misclassified Samples for User Comparison

Create an enhanced version of misclassified samples that includes both features and metadata for showing users which specific sample they are closest to.

In [38]:
# Create enhanced misclassified samples file for user comparison
print("\n📋 Creating enhanced misclassified samples file...")

# Create a comprehensive misclassified samples file that includes both features and metadata
enhanced_misclassified = []

for idx, row in misclassified_df.iterrows():
    # Get the original sample info
    sample_data = {
        'sample_id': idx,
        'original_index': row['original_index'],
        'true_label': row['true_label'],
        'predicted_label': row['predicted_label'],
        'confidence': row['confidence']
    }
    
    # Add all feature values
    for feature in feature_columns:
        sample_data[feature] = row[feature]
    
    # Add some human-readable descriptions
    sample_data['description'] = f"Sample #{idx}: True={row['true_label']}, Predicted={row['predicted_label']}, Confidence={row['confidence']:.2f}"
    
    enhanced_misclassified.append(sample_data)

enhanced_misclassified_df = pd.DataFrame(enhanced_misclassified)

# Save the enhanced version
enhanced_filename = 'enhanced_misclassified_samples.csv'
enhanced_misclassified_df.to_csv(enhanced_filename, index=False)

print(f"✅ Enhanced misclassified samples saved to '{enhanced_filename}'")
print(f"   - Contains {len(enhanced_misclassified_df)} samples")
print(f"   - Includes features + metadata for user comparison")
print(f"   - Columns: {list(enhanced_misclassified_df.columns)}")

# Verify the enhanced file has clean data
print(f"\n🔍 Data quality check:")
print(f"   - NaN values: {enhanced_misclassified_df.isnull().sum().sum()}")
print(f"   - Infinite values: {np.isinf(enhanced_misclassified_df.select_dtypes(include=[np.number])).sum().sum()}")

# Show a sample
print(f"\n📊 Sample of enhanced data:")
print(enhanced_misclassified_df[['sample_id', 'true_label', 'predicted_label', 'confidence', 'description']].head())


📋 Creating enhanced misclassified samples file...
✅ Enhanced misclassified samples saved to 'enhanced_misclassified_samples.csv'
   - Contains 591 samples
   - Includes features + metadata for user comparison
   - Columns: ['sample_id', 'original_index', 'true_label', 'predicted_label', 'confidence', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'description']

🔍 Data quality check:
   - NaN values: 0
   - Infinite values: 0

📊 Sample of enhanced data:
   sample_id true_label predicted_label  confidence  \
0          0  Extrovert       Introvert    0.821004   
1          1  Extrovert       Introvert    0.901223   
2          2  Introvert       Extrovert    0.890060   
3          3  Extrovert       Introvert    0.818464   
4          4  Introvert       Extrovert    0.892519   

                                         description  
0  Sample #0: True=Extrovert, Predicted=Introvert...  

In [40]:
# Final summary display
print("=" * 80)
print("🎉 COMPLETION SUMMARY")
print("=" * 80)

print(f"\n📁 FILES CREATED:")
print(f"   ✅ {misclassified_filename} - {len(misclassified_df)} misclassified samples with metadata")
print(f"   ✅ {features_filename} - {len(misclassified_features_only)} samples for distance calculations")
print(f"   ✅ {stats_filename} - Model performance statistics")

print(f"\n📊 OVERALL MODEL PERFORMANCE:")
print(f"   • Total training samples: {total_samples:,}")
print(f"   • Correctly classified: {correctly_classified_count:,} ({overall_accuracy:.1%})")
print(f"   • Misclassified: {misclassified_count:,} ({1-overall_accuracy:.1%})")

print(f"\n📈 CLASS-SPECIFIC PERFORMANCE:")
for class_name, stats in class_stats.items():
    print(f"   • {class_name}:")
    print(f"     - Total: {stats['total_samples']:,} samples")
    print(f"     - Accuracy: {stats['accuracy']:.1%}")
    print(f"     - Misclassified: {stats['misclassified']:,} samples")

print(f"\n📏 DISTANCE METRICS:")
print(f"   • Average distance between correct samples: {avg_correct_distance:.6f}")
print(f"   • Median distance: {median_correct_distance:.6f}")
print(f"   • Standard deviation: {std_correct_distance:.6f}")

print(f"\n🚀 STREAMLIT INTEGRATION:")
print(f"   1. Load '{features_filename}' for distance calculations")
print(f"   2. Load '{stats_filename}' for displaying performance statistics")
print(f"   3. Use baseline distance: {avg_correct_distance:.6f}")
print(f"   4. Run: streamlit run streamlit_personality_app.py")

print(f"\n✅ All data processing complete! Ready for Streamlit app integration.")
print("=" * 80)

🎉 COMPLETION SUMMARY

📁 FILES CREATED:
   ✅ misclassified_samples.csv - 591 misclassified samples with metadata
   ✅ misclassified_features.csv - 591 samples for distance calculations
   ✅ model_statistics.json - Model performance statistics

📊 OVERALL MODEL PERFORMANCE:
   • Total training samples: 18,524
   • Correctly classified: 17,933 (96.8%)
   • Misclassified: 591 (3.2%)

📈 CLASS-SPECIFIC PERFORMANCE:
   • Extrovert:
     - Total: 13,699 samples
     - Accuracy: 98.2%
     - Misclassified: 250 samples
   • Introvert:
     - Total: 4,825 samples
     - Accuracy: 92.9%
     - Misclassified: 341 samples

📏 DISTANCE METRICS:
   • Average distance between correct samples: 9.341171
   • Median distance: 8.887026
   • Standard deviation: 3.930843

🚀 STREAMLIT INTEGRATION:
   1. Load 'misclassified_features.csv' for distance calculations
   2. Load 'model_statistics.json' for displaying performance statistics
   3. Use baseline distance: 9.341171
   4. Run: streamlit run streamlit_perso