# Lab Exam - Set 5

This notebook contains implementations for all questions in Set 5.

## Question 17: NumPy Arrays - Arithmetic and Statistical Operations with Universal Functions

**Concepts:**
- **NumPy Arrays**: Efficient multi-dimensional containers for numerical data
- **Universal Functions (ufuncs)**: Fast element-wise operations on arrays
- **Arithmetic operations**: Addition, subtraction, multiplication, division, power, modulo
- **Statistical operations**: Mean, median, standard deviation, variance, min, max, percentiles
- **Broadcasting**: NumPy's ability to perform operations on arrays of different shapes
- **Aggregation functions**: Sum, product, cumulative sum, cumulative product

In [None]:
import numpy as np

# Question 17 demo: element-wise arithmetic ufuncs on two small 1-D arrays,
# followed by statistical / aggregation functions on a random 5x5 matrix.

# Seed the global NumPy RNG so arr3 (and every statistic derived from it) is
# identical on each Restart & Run All, matching the seed used by later cells.
np.random.seed(42)

# Create NumPy arrays
arr1 = np.array([10, 20, 30, 40, 50])
arr2 = np.array([1, 2, 3, 4, 5])
arr3 = np.random.randint(1, 100, size=(5, 5))  # 5x5 random integers in [1, 100)

print("Array 1:", arr1)
print("Array 2:", arr2)
print("\nArray 3 (5x5 random):")
print(arr3)

# ========== ARITHMETIC OPERATIONS USING UNIVERSAL FUNCTIONS ==========
print("\n" + "="*60)
print("ARITHMETIC OPERATIONS")
print("="*60)

# Addition
add_result = np.add(arr1, arr2)
print("\nAddition (arr1 + arr2):", add_result)

# Subtraction
sub_result = np.subtract(arr1, arr2)
print("Subtraction (arr1 - arr2):", sub_result)

# Multiplication
mul_result = np.multiply(arr1, arr2)
print("Multiplication (arr1 * arr2):", mul_result)

# Division (true division: the result is a float array)
div_result = np.divide(arr1, arr2)
print("Division (arr1 / arr2):", div_result)

# Power
power_result = np.power(arr2, 2)
print("Power (arr2 ** 2):", power_result)

# Modulo
mod_result = np.mod(arr1, arr2)
print("Modulo (arr1 % arr2):", mod_result)

# Square root
sqrt_result = np.sqrt(arr1)
print("Square root of arr1:", sqrt_result)

# Exponential
exp_result = np.exp(arr2)
print("Exponential of arr2:", exp_result)

# Logarithm
log_result = np.log(arr1)
print("Natural log of arr1:", log_result)

# Absolute value
arr_negative = np.array([-5, -10, 15, -20, 25])
abs_result = np.abs(arr_negative)
print("\nAbsolute value of [-5, -10, 15, -20, 25]:", abs_result)

# ========== STATISTICAL OPERATIONS ==========
print("\n" + "="*60)
print("STATISTICAL OPERATIONS")
print("="*60)

# Using arr3 for statistical operations
print("\nStatistics on Array 3:")

# Mean
mean_all = np.mean(arr3)
mean_axis0 = np.mean(arr3, axis=0)  # Column-wise mean
mean_axis1 = np.mean(arr3, axis=1)  # Row-wise mean
print(f"Mean (overall): {mean_all:.2f}")
print(f"Mean (column-wise): {mean_axis0}")
print(f"Mean (row-wise): {mean_axis1}")

# Median
median_all = np.median(arr3)
print(f"\nMedian (overall): {median_all:.2f}")

# Standard Deviation (NumPy default ddof=0, i.e. the population formula)
std_all = np.std(arr3)
print(f"Standard Deviation: {std_all:.2f}")

# Variance (NumPy default ddof=0, i.e. the population formula)
var_all = np.var(arr3)
print(f"Variance: {var_all:.2f}")

# Minimum and Maximum
min_val = np.min(arr3)
max_val = np.max(arr3)
print(f"\nMinimum: {min_val}")
print(f"Maximum: {max_val}")

# Minimum and Maximum indices (positions in the flattened array)
min_idx = np.argmin(arr3)
max_idx = np.argmax(arr3)
print(f"Minimum index (flattened): {min_idx}")
print(f"Maximum index (flattened): {max_idx}")

# Percentiles
percentile_25 = np.percentile(arr3, 25)
percentile_50 = np.percentile(arr3, 50)  # Same as median
percentile_75 = np.percentile(arr3, 75)
print(f"\n25th Percentile: {percentile_25:.2f}")
print(f"50th Percentile (Median): {percentile_50:.2f}")
print(f"75th Percentile: {percentile_75:.2f}")

# Sum and Product
sum_all = np.sum(arr3)
prod_arr2 = np.prod(arr2)
print(f"\nSum of all elements in arr3: {sum_all}")
print(f"Product of all elements in arr2: {prod_arr2}")

# Cumulative sum and product
cumsum_arr2 = np.cumsum(arr2)
cumprod_arr2 = np.cumprod(arr2)
print(f"\nCumulative sum of arr2: {cumsum_arr2}")
print(f"Cumulative product of arr2: {cumprod_arr2}")

# ========== ADDITIONAL USEFUL OPERATIONS ==========
print("\n" + "="*60)
print("ADDITIONAL OPERATIONS")
print("="*60)

# Sorting (np.sort returns a sorted copy; arr1 itself is left untouched)
sorted_arr = np.sort(arr1)
print(f"\nSorted arr1: {sorted_arr}")

# Unique values
arr_duplicates = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5])
unique_vals = np.unique(arr_duplicates)
print(f"Unique values in [1,2,2,3,3,3,4,4,5]: {unique_vals}")

# Correlation coefficient (2x2 matrix; the off-diagonal entry is corr(arr1, arr2))
corr_coef = np.corrcoef(arr1, arr2)
print("\nCorrelation coefficient matrix:")
print(corr_coef)

# Statistical summary
print("\n" + "="*60)
print("STATISTICAL SUMMARY FOR ARRAY 3")
print("="*60)
print(f"Shape: {arr3.shape}")
print(f"Size: {arr3.size}")
print(f"Data type: {arr3.dtype}")
print(f"Mean: {np.mean(arr3):.2f}")
print(f"Median: {np.median(arr3):.2f}")
print(f"Std Dev: {np.std(arr3):.2f}")
print(f"Variance: {np.var(arr3):.2f}")
print(f"Min: {np.min(arr3)}")
print(f"Max: {np.max(arr3)}")
print(f"Range: {np.ptp(arr3)}")  # peak-to-peak: max - min

## Question 18: Pandas DataFrame - Import CSV, Drop Duplicates, and Group-wise Statistics

**Concepts:**
- **CSV Import**: Reading comma-separated values files using `pd.read_csv()`
- **Duplicate Records**: Rows with identical values in all or specific columns
- **drop_duplicates()**: Method to remove duplicate rows from DataFrame
- **GroupBy**: Splitting data into groups based on criteria
- **Aggregation**: Computing summary statistics for each group
- **Group-wise statistics**: Mean, sum, count, min, max for different categories

In [None]:
import pandas as pd
import numpy as np

# Question 18 demo: write a small employee CSV that deliberately contains
# duplicate rows, read it back, drop the duplicates, and compute group-wise
# statistics per department and per age band.

# Create a sample CSV file with duplicates and multiple categories
data = {
    'Employee_ID': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 102, 105, 111, 112],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry', 
             'Ivy', 'Jack', 'Bob', 'Eve', 'Kelly', 'Liam'],
    'Department': ['Sales', 'IT', 'Sales', 'HR', 'IT', 'Sales', 'HR', 'IT', 
                   'Sales', 'HR', 'IT', 'IT', 'Sales', 'HR'],
    'Age': [28, 35, 42, 29, 31, 38, 27, 33, 45, 30, 35, 31, 26, 34],
    'Salary': [50000, 75000, 60000, 55000, 72000, 58000, 53000, 78000, 
               62000, 54000, 75000, 72000, 51000, 56000],
    'Experience': [3, 8, 15, 5, 7, 10, 4, 9, 18, 6, 8, 7, 2, 7]
}

df_sample = pd.DataFrame(data)
df_sample.to_csv('employee_data.csv', index=False)
print("Sample CSV file 'employee_data.csv' created successfully!\n")

# ========== IMPORT CSV FILE ==========
print("="*60)
print("STEP 1: IMPORT CSV FILE")
print("="*60)

df = pd.read_csv('employee_data.csv')
print("\nOriginal DataFrame:")
print(df)
print(f"\nShape: {df.shape}")
print(f"Total records: {len(df)}")

# Display basic information.
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
print("\nDataFrame Info:")
df.info()

# ========== IDENTIFY AND DROP DUPLICATE RECORDS ==========
print("\n" + "="*60)
print("STEP 2: IDENTIFY AND DROP DUPLICATE RECORDS")
print("="*60)

# Check for duplicates (rows identical in every column)
print(f"\nTotal duplicate rows: {df.duplicated().sum()}")

# Show duplicate rows; keep=False flags every member of a duplicate group
duplicates = df[df.duplicated(keep=False)]
if not duplicates.empty:
    print("\nDuplicate records found:")
    print(duplicates.sort_values('Employee_ID'))

# Check duplicates based on specific columns (Employee_ID)
print(f"\nDuplicate Employee_IDs: {df.duplicated(subset=['Employee_ID']).sum()}")

# Show which Employee_IDs are duplicated
duplicate_ids = df[df.duplicated(subset=['Employee_ID'], keep=False)]
if not duplicate_ids.empty:
    print("\nRecords with duplicate Employee_IDs:")
    print(duplicate_ids.sort_values('Employee_ID'))

# Drop duplicates based on Employee_ID (keep first occurrence).
# .copy() makes df_cleaned an independent frame so that adding the Age_Group
# column further down does not raise SettingWithCopyWarning.
df_cleaned = df.drop_duplicates(subset=['Employee_ID'], keep='first').copy()

print(f"\n\nAfter removing duplicates:")
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")
print(f"Records removed: {len(df) - len(df_cleaned)}")

print("\nCleaned DataFrame:")
print(df_cleaned)

# ========== COMPUTE GROUP-WISE STATISTICS ==========
print("\n" + "="*60)
print("STEP 3: COMPUTE GROUP-WISE STATISTICS")
print("="*60)

# Group by Department and compute various statistics
print("\n1. GROUP BY DEPARTMENT - MEAN VALUES:")
print("="*50)
dept_mean = df_cleaned.groupby('Department')[['Age', 'Salary', 'Experience']].mean()
print(dept_mean)

print("\n2. GROUP BY DEPARTMENT - SUM VALUES:")
print("="*50)
dept_sum = df_cleaned.groupby('Department')[['Salary']].sum()
print(dept_sum)

print("\n3. GROUP BY DEPARTMENT - COUNT:")
print("="*50)
dept_count = df_cleaned.groupby('Department').size()
print(dept_count)

print("\n4. GROUP BY DEPARTMENT - MIN AND MAX SALARY:")
print("="*50)
dept_minmax = df_cleaned.groupby('Department')['Salary'].agg(['min', 'max', 'mean'])
print(dept_minmax)

print("\n5. GROUP BY DEPARTMENT - MULTIPLE AGGREGATIONS:")
print("="*50)
dept_agg = df_cleaned.groupby('Department').agg({
    'Age': ['mean', 'min', 'max'],
    'Salary': ['mean', 'sum', 'std'],
    'Experience': ['mean', 'median']
})
print(dept_agg)

print("\n6. GROUP BY DEPARTMENT - DETAILED STATISTICS:")
print("="*50)
dept_describe = df_cleaned.groupby('Department')['Salary'].describe()
print(dept_describe)

# Additional grouping examples
print("\n7. GROUP BY AGE RANGES:")
print("="*50)
# Create age groups: (0, 30] -> Young, (30, 40] -> Middle, (40, 100] -> Senior
df_cleaned['Age_Group'] = pd.cut(df_cleaned['Age'], 
                                   bins=[0, 30, 40, 100], 
                                   labels=['Young', 'Middle', 'Senior'])
# Age_Group is categorical; passing observed explicitly keeps every category in
# the result and avoids the FutureWarning about the changing default.
age_group_stats = df_cleaned.groupby('Age_Group', observed=False)[['Salary', 'Experience']].mean()
print(age_group_stats)

# Save cleaned data (the file includes the derived Age_Group column)
df_cleaned.to_csv('employee_data_cleaned.csv', index=False)
print("\n\nCleaned data saved to 'employee_data_cleaned.csv'")

# Summary
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Original records: {len(df)}")
print(f"Duplicate records removed: {len(df) - len(df_cleaned)}")
print(f"Final records: {len(df_cleaned)}")
print(f"Departments: {df_cleaned['Department'].nunique()}")
print(f"Department names: {df_cleaned['Department'].unique()}")

## Question 19: Overlapping Histograms - Compare Distributions Among Features

**Concepts:**
- **Histogram**: Graphical representation of data distribution using bins
- **Overlapping plots**: Multiple distributions on same axes for comparison
- **Distribution comparison**: Analyzing shape, center, and spread of different features
- **Alpha transparency**: Making overlapping plots visible using transparency
- **Matplotlib**: Python library for creating visualizations
- **Seaborn**: Statistical data visualization library built on matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot appearance used by every figure in this cell
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Build a reproducible synthetic dataset of subject scores
np.random.seed(42)
n_samples = 200

# (mean, standard deviation) of the normal distribution behind each subject.
# Dict insertion order fixes the order of the RNG draws.
score_params = {
    'Math_Score': (75, 10),
    'Physics_Score': (70, 12),
    'Chemistry_Score': (72, 11),
    'English_Score': (68, 15),
}

data = {
    subject_name: np.random.normal(center, spread, n_samples)
    for subject_name, (center, spread) in score_params.items()
}
data['Category'] = np.random.choice(['A', 'B', 'C'], n_samples)

df = pd.DataFrame(data)

# Keep every score inside the valid 0-100 range
score_columns = list(score_params)
df[score_columns] = df[score_columns].clip(0, 100)

print("Dataset created with multiple features:")
print(df.head(10))
print(f"\nDataset shape: {df.shape}")
print("\nBasic statistics:")
print(df.describe())

# ========== PLOT 1: OVERLAPPING HISTOGRAMS FOR ALL SUBJECTS ==========
print("\n" + "="*60)
print("OVERLAPPING HISTOGRAMS - COMPARING ALL SUBJECTS")
print("="*60)

plt.figure(figsize=(12, 6))

# (column, legend label, colour) for each overlaid distribution
hist_series = [
    ('Math_Score', 'Math', 'blue'),
    ('Physics_Score', 'Physics', 'red'),
    ('Chemistry_Score', 'Chemistry', 'green'),
    ('English_Score', 'English', 'orange'),
]

# Use ONE set of bin edges, computed from all scores pooled together.
# With bins=20 passed per call, each series would get its own edges and bin
# widths, so bar heights of different subjects would not be comparable.
pooled_scores = df[[column for column, _, _ in hist_series]].to_numpy().ravel()
shared_bin_edges = np.histogram_bin_edges(pooled_scores, bins=20)

# Plot overlapping histograms with transparency so every series stays visible
for column, label, color in hist_series:
    plt.hist(df[column], bins=shared_bin_edges, alpha=0.5, label=label,
             color=color, edgecolor='black')

plt.xlabel('Score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Overlapping Histograms - Subject Score Distributions', fontsize=14, fontweight='bold')
plt.legend(loc='upper right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# ========== PLOT 2: OVERLAPPING DENSITY PLOTS ==========
print("\nDensity Plots for better comparison...")

plt.figure(figsize=(12, 6))

# One kernel-density curve per subject, all drawn on the current axes
density_series = (
    ('Math_Score', 'Math', 'blue'),
    ('Physics_Score', 'Physics', 'red'),
    ('Chemistry_Score', 'Chemistry', 'green'),
    ('English_Score', 'English', 'orange'),
)
for score_column, legend_label, line_color in density_series:
    df[score_column].plot.density(label=legend_label, color=line_color, linewidth=2)

plt.xlabel('Score', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.title('Overlapping Density Plots - Subject Score Distributions', fontsize=14, fontweight='bold')
plt.legend(loc='upper left', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# ========== PLOT 3: COMBINED HISTOGRAM AND DENSITY ==========
# One panel per subject in a 2x2 grid; `subjects` and `colors` are reused by
# the plots that follow.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
subjects = ['Math_Score', 'Physics_Score', 'Chemistry_Score', 'English_Score']
colors = ['blue', 'red', 'green', 'orange']

# axes.flat walks the grid row by row, i.e. the same order as (idx // 2, idx % 2)
for panel, subject, color in zip(axes.flat, subjects, colors):
    # Normalised histogram so it shares the y-scale with the density curve
    panel.hist(df[subject], bins=20, alpha=0.6, color=color,
               edgecolor='black', density=True, label='Histogram')

    # Kernel density estimate drawn over the histogram
    df[subject].plot(kind='density', ax=panel, color='darkblue',
                     linewidth=2, label='Density')

    panel.set_xlabel('Score', fontsize=10)
    panel.set_ylabel('Density/Frequency', fontsize=10)
    panel_title = f'{subject.replace("_", " ")} Distribution'
    panel.set_title(panel_title, fontsize=11, fontweight='bold')
    panel.legend(fontsize=9)
    panel.grid(True, alpha=0.3)

plt.suptitle('Individual Subject Distributions - Histogram + Density', 
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# ========== PLOT 4: OVERLAPPING HISTOGRAMS BY CATEGORY ==========
print("\n" + "="*60)
print("OVERLAPPING HISTOGRAMS - COMPARING CATEGORIES")
print("="*60)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One panel per subject; axes.flat walks the 2x2 grid row by row
for panel, subject in zip(axes.flat, subjects):
    # Shared bin edges for this subject, computed over ALL categories, so the
    # three overlaid histograms use identical bins and their bar heights are
    # directly comparable (bins=15 per call would give each category its own).
    subject_bin_edges = np.histogram_bin_edges(df[subject], bins=15)

    # Plot histogram for each category
    for category in ['A', 'B', 'C']:
        data_cat = df.loc[df['Category'] == category, subject]
        panel.hist(data_cat, bins=subject_bin_edges, alpha=0.5,
                   label=f'Category {category}', edgecolor='black')

    panel.set_xlabel('Score', fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.set_title(f'{subject.replace("_", " ")} by Category', 
                    fontsize=11, fontweight='bold')
    panel.legend(fontsize=9)
    panel.grid(True, alpha=0.3)

plt.suptitle('Overlapping Histograms - Comparison by Category', 
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# ========== PLOT 5: SEABORN KDE PLOT AND BOX PLOT ==========
# Left panel: one KDE curve per subject. Right panel: box plot per subject.
print("\nAdvanced visualization using Seaborn...")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# KDE plot
for subject, color in zip(subjects, colors):
    sns.kdeplot(data=df, x=subject, label=subject.replace('_Score', ''), 
                color=color, linewidth=2, ax=axes[0])

axes[0].set_xlabel('Score', fontsize=11)
axes[0].set_ylabel('Density', fontsize=11)
axes[0].set_title('KDE Plot - All Subjects', fontsize=12, fontweight='bold')
axes[0].legend(fontsize=10)
axes[0].grid(True, alpha=0.3)

# Box plot for comparison.
# Put the short subject names into the long-format data itself so the tick
# labels come from the plotted categories. Overriding them afterwards with
# set_xticklabels() assigns labels purely by position and warns when the axis
# has no fixed tick locations.
df_melted = df.melt(value_vars=subjects, var_name='Subject', value_name='Score')
df_melted['Subject'] = df_melted['Subject'].str.replace('_Score', '', regex=False)
# NOTE(review): recent seaborn releases warn about `palette` without `hue`;
# left unchanged here so the call works the same on older releases.
sns.boxplot(data=df_melted, x='Subject', y='Score', ax=axes[1], palette='Set2')
axes[1].set_xlabel('Subject', fontsize=11)
axes[1].set_ylabel('Score', fontsize=11)
axes[1].set_title('Box Plot - Score Comparison', fontsize=12, fontweight='bold')
axes[1].tick_params(axis='x', labelrotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# ========== STATISTICAL COMPARISON ==========
print("\n" + "="*60)
print("STATISTICAL COMPARISON OF DISTRIBUTIONS")
print("="*60)

# One row per subject with its summary statistics
comparison_stats = pd.DataFrame({
    'Subject': subjects,
    'Mean': [df[s].mean() for s in subjects],
    'Median': [df[s].median() for s in subjects],
    'Std Dev': [df[s].std() for s in subjects],
    'Min': [df[s].min() for s in subjects],
    'Max': [df[s].max() for s in subjects]
})

print("\n", comparison_stats.to_string(index=False))

# Derive the subjects named in the interpretation from the table above instead
# of hard-coding them, so the text stays correct if the data or seed changes.
stats_by_subject = comparison_stats.set_index('Subject')
highest_mean = stats_by_subject['Mean'].idxmax().replace('_Score', '')
lowest_mean = stats_by_subject['Mean'].idxmin().replace('_Score', '')
highest_spread = stats_by_subject['Std Dev'].idxmax().replace('_Score', '')
lowest_spread = stats_by_subject['Std Dev'].idxmin().replace('_Score', '')

# Interpretation (points 1 and 4 are qualitative readings of the plots)
print("\n" + "="*60)
print("INTERPRETATION")
print("="*60)
print("\n1. Distribution Shape: All subjects show approximately normal distribution")
print(f"2. Central Tendency: {highest_mean} has highest mean, {lowest_mean} has lowest")
print(f"3. Spread: {highest_spread} has highest variability (std dev), {lowest_spread} has lowest")
print("4. Overlap: Significant overlap between distributions indicates similar performance")

## Question 20: Logistic Regression for Binary Classification - Confusion Matrix and ROC Curve

**Concepts:**
- **Logistic Regression**: Supervised learning algorithm for binary classification
- **Binary Classification**: Predicting one of two possible outcomes (0 or 1, Yes or No)
- **Confusion Matrix**: Table showing True Positives, True Negatives, False Positives, False Negatives
- **ROC Curve**: Receiver Operating Characteristic curve showing TPR vs FPR
- **AUC**: Area Under the ROC Curve - summarizes model performance in a single number between 0 and 1 (0.5 = random guessing, 1.0 = perfect separation)
- **Accuracy**: (TP + TN) / Total predictions
- **Precision**: TP / (TP + FP)
- **Recall**: TP / (TP + FN)
- **F1-Score**: Harmonic mean of precision and recall

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

# Fix the global NumPy seed so the run is reproducible
np.random.seed(42)

# ========== CREATE BINARY CLASSIFICATION DATASET ==========
print("="*60)
print("CREATING BINARY CLASSIFICATION DATASET")
print("="*60)

# Synthetic two-class problem: 4 features (3 informative, 1 redundant)
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    random_state=42,
    class_sep=1.0,
)

# Wrap the arrays in a DataFrame so they are easier to inspect
feature_names = ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4']
df = pd.DataFrame(X, columns=feature_names)
df['Target'] = y

print("\nDataset created successfully!")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())

print("\nClass distribution:")
print(df['Target'].value_counts())
class_0_count = (y == 0).sum()
class_1_count = (y == 1).sum()
print(f"\nClass 0: {class_0_count} samples")
print(f"Class 1: {class_1_count} samples")

# ========== DATA PREPROCESSING ==========
print("\n" + "="*60)
print("DATA PREPROCESSING")
print("="*60)

# From here on X / y are the DataFrame / Series views of features and target
X = df[feature_names]
y = df['Target']

# 80-20 train/test split, stratified so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Standardise features: fit the scaler on the training data only, then apply
# the same transformation to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling completed!")

# ========== TRAIN LOGISTIC REGRESSION MODEL ==========
print("\n" + "="*60)
print("TRAINING LOGISTIC REGRESSION MODEL")
print("="*60)

# Fit a logistic regression classifier on the scaled training features
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

print("\nModel trained successfully!")
coefficients = log_reg.coef_[0]
intercept = log_reg.intercept_[0]
print(f"\nModel coefficients: {coefficients}")
print(f"Model intercept: {intercept}")

# ========== MAKE PREDICTIONS ==========
print("\n" + "="*60)
print("MAKING PREDICTIONS")
print("="*60)

# Hard class labels and the predicted probability of class 1 for the test set
y_pred = log_reg.predict(X_test_scaled)
class_probabilities = log_reg.predict_proba(X_test_scaled)
y_pred_proba = class_probabilities[:, 1]  # column 1 = probability of class 1

print("\nFirst 10 predictions vs actual:")
preview_rows = slice(0, 10)
comparison_df = pd.DataFrame({
    'Actual': y_test.values[preview_rows],
    'Predicted': y_pred[preview_rows],
    'Probability_Class_1': y_pred_proba[preview_rows],
})
print(comparison_df)

# ========== MODEL EVALUATION - ACCURACY ==========
print("\n" + "="*60)
print("MODEL EVALUATION")
print("="*60)

# Accuracy on the data the model was fitted on vs. on held-out data
train_accuracy = log_reg.score(X_train_scaled, y_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"\nTraining Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

# ========== CONFUSION MATRIX ==========
print("\n" + "="*60)
print("CONFUSION MATRIX")
print("="*60)

# Calculate confusion matrix.
# labels=[0, 1] pins the layout to 2x2 with rows = actual class and
# columns = predicted class, so the four-way unpacking below is always valid.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

print("\nConfusion Matrix:")
print(cm)

# Extract values from confusion matrix (row-major order: TN, FP, FN, TP)
tn, fp, fn, tp = cm.ravel()

print(f"\nTrue Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")

# Visualize confusion matrix.
# The TN/FP/FN/TP captions are written into the cells themselves through the
# `annot` array. Placing them with plt.text(..., transform=transAxes) at
# x = 1.5 would draw them outside the axes, because axes coordinates only
# span 0..1.
cell_labels = np.array([[f'TN = {tn}', f'FP = {fp}'],
                        [f'FN = {fn}', f'TP = {tp}']])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=cell_labels, fmt='', cmap='Blues', 
            xticklabels=['Class 0', 'Class 1'],
            yticklabels=['Class 0', 'Class 1'],
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Logistic Regression', fontsize=14, fontweight='bold')
plt.ylabel('Actual Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)

plt.tight_layout()
plt.show()

# ========== CLASSIFICATION REPORT ==========
print("\n" + "="*60)
print("CLASSIFICATION REPORT")
print("="*60)

# Per-class precision / recall / F1 as a text table
report_text = classification_report(y_test, y_pred,
                                    target_names=['Class 0', 'Class 1'])
print("\n", report_text)

# ========== ROC CURVE ==========
print("\n" + "="*60)
print("ROC CURVE AND AUC SCORE")
print("="*60)

# ROC points and AUC are computed from the class-1 probabilities,
# not from the hard class predictions
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"\nAUC (Area Under Curve) Score: {roc_auc:.4f}")

# Draw the ROC curve together with the diagonal of a random classifier
plt.figure(figsize=(10, 8))
plt.plot(
    fpr, tpr,
    color='darkorange', lw=2,
    label=f'ROC curve (AUC = {roc_auc:.4f})',
)
plt.plot(
    [0, 1], [0, 1],
    color='navy', lw=2, linestyle='--',
    label='Random Classifier (AUC = 0.5)',
)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)', fontsize=12)
plt.ylabel('True Positive Rate (TPR)', fontsize=12)
plt.title('Receiver Operating Characteristic (ROC) Curve',
          fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# ========== COMBINED VISUALIZATION ==========
# Confusion matrix and ROC curve side by side in a single figure
print("\nCreating combined visualization...")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cm_panel = axes[0]
roc_panel = axes[1]

# Left panel: confusion matrix heatmap
class_names = ['Negative', 'Positive']
sns.heatmap(cm, annot=True, fmt='d', cmap='YlOrRd', ax=cm_panel,
            xticklabels=class_names,
            yticklabels=class_names,
            cbar_kws={'label': 'Count'})
cm_panel.set_title('Confusion Matrix', fontsize=13, fontweight='bold')
cm_panel.set_ylabel('True Label', fontsize=11)
cm_panel.set_xlabel('Predicted Label', fontsize=11)

# Right panel: ROC curve, chance diagonal, and shaded area under the curve
roc_panel.plot(fpr, tpr, color='darkorange', lw=2.5,
               label=f'Logistic Regression (AUC = {roc_auc:.4f})')
roc_panel.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
               label='Random Guess (AUC = 0.5)')
roc_panel.fill_between(fpr, tpr, alpha=0.2, color='orange')
roc_panel.set_xlim([0.0, 1.0])
roc_panel.set_ylim([0.0, 1.05])
roc_panel.set_xlabel('False Positive Rate', fontsize=11)
roc_panel.set_ylabel('True Positive Rate', fontsize=11)
roc_panel.set_title('ROC Curve', fontsize=13, fontweight='bold')
roc_panel.legend(loc='lower right', fontsize=10)
roc_panel.grid(True, alpha=0.3)

plt.suptitle('Logistic Regression Performance Metrics',
             fontsize=15, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# ========== ADDITIONAL METRICS ==========
print("\n" + "="*60)
print("ADDITIONAL PERFORMANCE METRICS")
print("="*60)

# Metrics derived by hand from the confusion-matrix counts; each falls back
# to 0 when its denominator is zero
predicted_positive = tp + fp
actual_positive = tp + fn
actual_negative = tn + fp

precision = tp / predicted_positive if predicted_positive > 0 else 0
recall = tp / actual_positive if actual_positive > 0 else 0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0
specificity = tn / actual_negative if actual_negative > 0 else 0

print(f"\nAccuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# Summary
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"\nTotal samples: {len(y)}")
print(f"Training samples: {len(y_train)}")
print(f"Testing samples: {len(y_test)}")
print(f"\nModel Accuracy: {test_accuracy * 100:.2f}%")
print(f"AUC Score: {roc_auc:.4f}")
print("\nInterpretation:")

# AUC bands checked from best to worst; the first band whose lower bound is
# exceeded wins, and anything at or below the last bound counts as poor
auc_bands = (
    (0.9, "- Excellent model performance (AUC > 0.9)"),
    (0.8, "- Good model performance (AUC > 0.8)"),
    (0.7, "- Fair model performance (AUC > 0.7)"),
)
for lower_bound, verdict in auc_bands:
    if roc_auc > lower_bound:
        print(verdict)
        break
else:
    print("- Poor model performance (AUC < 0.7)")

correct_predictions = tp + tn
wrong_predictions = fp + fn
print(f"\n- The model correctly classified {correct_predictions} out of {len(y_test)} samples")
print(f"- False predictions: {wrong_predictions}")