# Lab Exam - Set 8

This notebook contains implementations for all questions in Set 8.

## Question 29: Pandas DataFrame - Advanced String Manipulations

**Concepts:**
- **String operations**: Methods to manipulate text data in DataFrames
- **Case conversion**: Converting text to upper/lower case using `.str.upper()`, `.str.lower()`
- **String splitting**: Breaking strings into parts using `.str.split()`
- **String replacement**: Replacing substrings using `.str.replace()`
- **Pattern matching**: Finding patterns using `.str.contains()`, `.str.extract()`
- **String slicing**: Extracting portions of strings using `.str[]` or `.str.slice()`

In [None]:
import pandas as pd
import numpy as np

# Build a small DataFrame of free-text columns (names, e-mails, phone numbers,
# addresses, product codes) and derive cleaned/extracted columns from it using
# pandas `.str` accessors. Each numbered step adds one or more columns to `df`
# and prints the before/after view.
data = {
    'Name': ['John Doe', 'jane smith', 'ALICE JOHNSON', 'Bob Brown', 'charlie DAVIS'],
    'Email': ['john.doe@email.com', 'jane.smith@gmail.com', 'alice@yahoo.com',
              'bob.brown@email.com', 'charlie@hotmail.com'],
    'Phone': ['+1-555-1234', '+1-555-5678', '+1-555-9012', '+1-555-3456', '+1-555-7890'],
    'Address': ['123 Main St, New York, NY', '456 Oak Ave, Los Angeles, CA',
                '789 Pine Rd, Chicago, IL', '321 Elm St, Houston, TX',
                '654 Maple Dr, Phoenix, AZ'],
    'Product_Code': ['PROD-001-A', 'PROD-002-B', 'PROD-003-C', 'PROD-004-A', 'PROD-005-B']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n" + "="*80 + "\n")

# 1. Case conversion - Standardize name format to Title Case
df['Name_Standardized'] = df['Name'].str.title()
print("1. Standardized Names (Title Case):")
print(df[['Name', 'Name_Standardized']])
print("\n" + "="*80 + "\n")

# 2. Extract first and last names by splitting on the first space only
#    (n=1 keeps any further words together in Last_Name).
df[['First_Name', 'Last_Name']] = df['Name_Standardized'].str.split(' ', n=1, expand=True)
print("2. Split Names into First and Last:")
print(df[['Name_Standardized', 'First_Name', 'Last_Name']])
print("\n" + "="*80 + "\n")

# 3. Extract domain from email addresses.
#    The character class includes '-' so hyphenated domains are not truncated;
#    expand=False returns a Series for the single capture group.
df['Email_Domain'] = df['Email'].str.extract(r'@([\w.-]+)', expand=False)
print("3. Extract Email Domain:")
print(df[['Email', 'Email_Domain']])
print("\n" + "="*80 + "\n")

# 4. Clean phone numbers - remove every non-digit character
df['Phone_Cleaned'] = df['Phone'].str.replace(r'[^0-9]', '', regex=True)
print("4. Cleaned Phone Numbers (digits only):")
print(df[['Phone', 'Phone_Cleaned']])
print("\n" + "="*80 + "\n")

# Split each address on commas once; steps 5 and 6 both index into the parts.
address_parts = df['Address'].str.split(',')

# 5. Extract city from address (second element after splitting by comma)
df['City'] = address_parts.str[1].str.strip()
print("5. Extract City from Address:")
print(df[['Address', 'City']])
print("\n" + "="*80 + "\n")

# 6. Extract state abbreviation from address (last part)
df['State'] = address_parts.str[-1].str.strip()
print("6. Extract State from Address:")
print(df[['Address', 'State']])
print("\n" + "="*80 + "\n")

# 7. Check if email belongs to a specific domain.
#    Anchoring on '@gmail.' tests the domain itself, so an address whose local
#    part merely contains "gmail" (e.g. gmailfan@yahoo.com) is not flagged.
df['Is_Gmail'] = df['Email'].str.contains(r'@gmail\.', case=False, regex=True)
print("7. Check if Email is Gmail:")
print(df[['Email', 'Is_Gmail']])
print("\n" + "="*80 + "\n")

# 8. Extract product category (the segment after the last hyphen)
df['Product_Category'] = df['Product_Code'].str.split('-').str[-1]
print("8. Extract Product Category:")
print(df[['Product_Code', 'Product_Category']])
print("\n" + "="*80 + "\n")

# 9. Create initials from the first letter of each name part
df['Initials'] = df['First_Name'].str[0] + df['Last_Name'].str[0]
print("9. Create Initials:")
print(df[['Name_Standardized', 'Initials']])
print("\n" + "="*80 + "\n")

# 10. String length calculation
df['Name_Length'] = df['Name_Standardized'].str.len()
print("10. Calculate Name Length:")
print(df[['Name_Standardized', 'Name_Length']])
print("\n" + "="*80 + "\n")

# Display final DataFrame with selected columns
print("Final DataFrame with String Manipulations:")
display_cols = ['Name_Standardized', 'First_Name', 'Last_Name', 'Email_Domain',
                'Phone_Cleaned', 'City', 'State', 'Product_Category']
print(df[display_cols])

## Question 30: Missing Data Imputation and Feature Scaling

**Concepts:**
- **Missing data**: Incomplete values in dataset (NaN or None)
- **Imputation**: Filling missing values with substitutes (mean, median, mode)
- **Feature scaling**: Transforming features to similar ranges
- **Standardization**: Scaling data to mean=0 and std=1 using StandardScaler
- **Normalization**: Scaling data to range [0,1] using MinMaxScaler
- **SimpleImputer**: Scikit-learn tool for filling missing values

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Build a 12-row numeric dataset with deliberately missing entries, then report
# how many values are missing per column and overall.
# The global NumPy seed is set here; no random draws occur in this section.
np.random.seed(42)

NA = np.nan  # short alias to keep the literal columns readable
data = {
    'Age': [25, 30, NA, 28, 35, NA, 22, 29, 31, 27, NA, 33],
    'Salary': [50000, 60000, 55000, NA, 70000, 48000, NA, 62000, 58000, 51000, 65000, NA],
    'Experience': [2, 5, 3, NA, 8, 1, 2, NA, 6, 3, 7, 9],
    'Score': [85, 90, NA, 88, 92, 78, 83, NA, 89, 86, NA, 91],
    'Rating': [4.2, NA, 3.8, 4.5, 4.8, 3.5, NA, 4.3, 4.6, 4.0, 4.7, NA],
}
df = pd.DataFrame(data)

section_break = "\n" + "=" * 80 + "\n"

print("Original DataFrame with Missing Values:")
print(df)
print(section_break)

# Per-column and overall missing-value tallies.
missing_per_column = df.isnull().sum()
missing_total = missing_per_column.sum()
n_rows, n_cols = df.shape

print("Missing Values Count:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_total}")
print(f"Missing percentage: {(missing_total / (n_rows * n_cols) * 100):.2f}%")
print(section_break)

# Draw the null mask as a heatmap: rows are records, columns are features.
plt.figure(figsize=(10, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing Data Pattern (Yellow = Missing)')
plt.tight_layout()
plt.show()

# ==================== IMPUTATION ====================
# Fill every missing value in `df`, using a different strategy per column group
# to demonstrate the available options.

# Method 1: Mean imputation for Age and Experience
mean_imputer = SimpleImputer(strategy='mean')
df[['Age', 'Experience']] = mean_imputer.fit_transform(df[['Age', 'Experience']])

# Method 2: Median imputation for Salary
median_imputer = SimpleImputer(strategy='median')
df[['Salary']] = median_imputer.fit_transform(df[['Salary']])

# Method 3: Most frequent (mode) imputation for Score
mode_imputer = SimpleImputer(strategy='most_frequent')
df[['Score']] = mode_imputer.fit_transform(df[['Score']])

# Method 4: pandas-native imputation for Rating (fill with the column median).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which emits a warning on
# pandas 2.x and silently leaves `df` unchanged under Copy-on-Write.
rating_median = df['Rating'].median()
df['Rating'] = df['Rating'].fillna(rating_median)

print("DataFrame After Imputation:")
print(df)
print(f"\nMissing values remaining: {df.isnull().sum().sum()}")
print("\n" + "="*80 + "\n")

# ==================== FEATURE SCALING ====================
# Produce two scaled copies of the imputed data: z-score standardized and
# min-max normalized. `df` itself is left untouched.

# Snapshot of the unscaled data, used later for side-by-side comparison.
df_original = df.copy()

section_break = "\n" + "=" * 80 + "\n"

# 1. Standardization (Z-score normalization)
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(df)
df_standardized = pd.DataFrame(standardized_values, columns=df.columns, index=df.index)

print("Standardized Data (Mean=0, Std=1):")
print(df_standardized.head())
print("\nStatistics after Standardization:")
print(df_standardized.describe())
print(section_break)

# 2. Min-Max Normalization (scaling to [0,1])
scaler_minmax = MinMaxScaler()
normalized_values = scaler_minmax.fit_transform(df)
df_normalized = pd.DataFrame(normalized_values, columns=df.columns, index=df.index)

print("Normalized Data (Range [0,1]):")
print(df_normalized.head())
print("\nStatistics after Normalization:")
print(df_normalized.describe())
print(section_break)

# Visualize the effects of scaling: one histogram per (dataset variant, feature).
# Rows are the variants (original / standardized / normalized); columns are the
# features of `df`.
columns = df.columns

# Size the grid from the data instead of hard-coding 5 columns, so the plot
# keeps working if features are added or removed. squeeze=False guarantees a
# 2-D axes array even when there is only one feature.
variants = [
    (df_original, 'Original', 'skyblue'),
    (df_standardized, 'Standardized', 'lightcoral'),
    (df_normalized, 'Normalized', 'lightgreen'),
]
fig, axes = plt.subplots(len(variants), len(columns), figsize=(18, 10), squeeze=False)
fig.suptitle('Comparison: Original vs Standardized vs Normalized Data', fontsize=16)

last_row = len(variants) - 1
for row, (frame, label, color) in enumerate(variants):
    for idx, col in enumerate(columns):
        ax = axes[row, idx]
        ax.hist(frame[col], bins=10, color=color, edgecolor='black')
        ax.set_title(f'{col} ({label})')
        ax.set_ylabel('Frequency')
        # Only the bottom row carries the shared x-axis caption.
        if row == last_row:
            ax.set_xlabel('Value')

plt.tight_layout()
plt.show()

# Print a compact statistics table for each variant of the data.
print("\nComparison Summary:")

summary_tables = [
    ("\nOriginal Data - Mean and Std:",
     {'Mean': df_original.mean(), 'Std': df_original.std()}),
    ("\nStandardized Data - Mean and Std:",
     {'Mean': df_standardized.mean(), 'Std': df_standardized.std()}),
    ("\nNormalized Data - Min and Max:",
     {'Min': df_normalized.min(), 'Max': df_normalized.max()}),
]
for heading, stat_columns in summary_tables:
    print(heading)
    print(pd.DataFrame(stat_columns))

## Question 31: Scatter Plots and Line Charts for Feature Visualization

**Concepts:**
- **Scatter plot**: Shows relationship between two continuous variables as points
- **Line chart**: Displays data points connected by lines, good for trends over time
- **Correlation**: Measure of how two variables are related (-1 to +1)
- **Trend analysis**: Identifying patterns in data over time or across variables
- **Matplotlib/Seaborn**: Python libraries for data visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Generate a seeded synthetic dataset of 100 samples in which Test_Score and
# Income are built from other columns plus Gaussian noise, then summarize it.
np.random.seed(42)
n_samples = 100

# The random draws below are made in a fixed order; reordering them would
# change every generated value for the same seed.
hours_studied = np.random.uniform(1, 10, n_samples)
age = np.random.randint(18, 30, n_samples)
sleep_hours = np.random.uniform(4, 9, n_samples)
# This draw is discarded (Income is recomputed below) but it still advances
# the RNG state, so it is kept.
income_placeholder = np.random.uniform(30000, 100000, n_samples)
experience_years = np.random.uniform(0, 10, n_samples)
month = np.tile(range(1, 13), n_samples // 12 + 1)[:n_samples]

# Test_Score depends on Hours_Studied and Sleep_Hours, clipped to [0, 100].
raw_test_score = (hours_studied * 5 +
                  sleep_hours * 3 +
                  np.random.normal(0, 5, n_samples) + 40)
test_score = np.clip(raw_test_score, 0, 100)

# Income depends on Experience_Years and Age.
income = (experience_years * 5000 +
          age * 1000 +
          np.random.normal(0, 5000, n_samples) + 30000)

# Key order fixes the DataFrame's column order.
data = {
    'Hours_Studied': hours_studied,
    'Test_Score': test_score,
    'Age': age,
    'Sleep_Hours': sleep_hours,
    'Income': income,
    'Experience_Years': experience_years,
    'Month': month,
}
df = pd.DataFrame(data)

section_break = "\n" + "=" * 80 + "\n"

print("Dataset Summary:")
print(df.describe())
print(section_break)

# Pairwise correlations of the continuous features (Month is excluded).
numeric_features = ['Hours_Studied', 'Test_Score', 'Age', 'Sleep_Hours',
                    'Income', 'Experience_Years']
correlation_matrix = df[numeric_features].corr()
print("Correlation Matrix:")
print(correlation_matrix)
print(section_break)

# ==================== SCATTER PLOTS ====================

# 2x3 grid of scatter plots; the first four panels also get a fitted
# straight-line trend.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
fig.suptitle('Scatter Plots - Relationship Between Numerical Features', fontsize=16)

# One entry per simple panel:
# (grid position, x column, y column, x label, y label, title, colour, draw trend line?)
scatter_specs = [
    ((0, 0), 'Hours_Studied', 'Test_Score', 'Hours Studied', 'Test Score',
     'Hours Studied vs Test Score', 'blue', True),
    ((0, 1), 'Sleep_Hours', 'Test_Score', 'Sleep Hours', 'Test Score',
     'Sleep Hours vs Test Score', 'green', True),
    ((0, 2), 'Experience_Years', 'Income', 'Experience (Years)', 'Income ($)',
     'Experience vs Income', 'orange', True),
    ((1, 0), 'Age', 'Income', 'Age', 'Income ($)',
     'Age vs Income', 'purple', True),
    ((1, 1), 'Age', 'Test_Score', 'Age', 'Test Score',
     'Age vs Test Score', 'red', False),
]

for position, x_col, y_col, x_label, y_label, title, color, with_trend in scatter_specs:
    ax = axes[position]
    ax.scatter(df[x_col], df[y_col], alpha=0.6, color=color)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    if with_trend:
        # Degree-1 least-squares fit, evaluated at the observed x values.
        z = np.polyfit(df[x_col], df[y_col], 1)
        p = np.poly1d(z)
        ax.plot(df[x_col], p(df[x_col]), "r--", alpha=0.8, label='Trend')
        ax.legend()

# Final panel: Hours Studied vs Sleep Hours, with point colour encoding Test Score.
color_ax = axes[1, 2]
scatter = color_ax.scatter(df['Hours_Studied'], df['Sleep_Hours'],
                           c=df['Test_Score'], cmap='viridis', alpha=0.6)
color_ax.set_xlabel('Hours Studied')
color_ax.set_ylabel('Sleep Hours')
color_ax.set_title('Hours Studied vs Sleep Hours (Color = Test Score)')
color_ax.grid(True, alpha=0.3)
plt.colorbar(scatter, ax=color_ax, label='Test Score')

plt.tight_layout()
plt.show()

# ==================== LINE CHARTS ====================

# Collapse the samples to one row per Month value (mean of each metric).
tracked_metrics = ['Test_Score', 'Hours_Studied', 'Sleep_Hours', 'Income']
monthly_data = df.groupby('Month')[tracked_metrics].mean().reset_index()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Line Charts - Trends Over Time (Monthly Averages)', fontsize=16)

months = monthly_data['Month']
month_ticks = range(1, 13)

# Single-metric charts: (grid position, column, marker, colour, y label, title).
single_metric_charts = [
    ((0, 0), 'Test_Score', 'o', 'blue',
     'Average Test Score', 'Monthly Average Test Score Trend'),
    ((0, 1), 'Hours_Studied', 's', 'green',
     'Average Hours Studied', 'Monthly Average Study Hours Trend'),
    ((1, 0), 'Sleep_Hours', '^', 'orange',
     'Average Sleep Hours', 'Monthly Average Sleep Hours Trend'),
]
for position, column, marker, color, y_label, title in single_metric_charts:
    ax = axes[position]
    ax.plot(months, monthly_data[column],
            marker=marker, linewidth=2, markersize=8, color=color)
    ax.set_xlabel('Month')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(month_ticks)

# Combined chart: the hour-based metrics are multiplied by 10 so they share a
# visible range with the test score.
combined_ax = axes[1, 1]
combined_ax.plot(months, monthly_data['Test_Score'],
                 marker='o', linewidth=2, label='Test Score', color='blue')
scaled_series = [
    ('Hours_Studied', 's', 'Hours Studied (×10)', 'green'),
    ('Sleep_Hours', '^', 'Sleep Hours (×10)', 'orange'),
]
for column, marker, label, color in scaled_series:
    combined_ax.plot(months, monthly_data[column] * 10,
                     marker=marker, linewidth=2, label=label, color=color)
combined_ax.set_xlabel('Month')
combined_ax.set_ylabel('Value')
combined_ax.set_title('Multi-Metric Comparison (Scaled)')
combined_ax.grid(True, alpha=0.3)
combined_ax.legend()
combined_ax.set_xticks(month_ticks)

plt.tight_layout()
plt.show()

# ==================== CORRELATION HEATMAP ====================

# Annotated heatmap of the correlation matrix, colour scale centred on zero.
plt.figure(figsize=(10, 8))
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0,
                       square=True, linewidths=1, cbar_kws={"shrink": 0.8})
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap - Feature Relationships', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

# Print the coefficients for the feature pairs that were constructed to be related.
print("\nKey Correlations:")
key_pairs = [
    ("Hours Studied vs Test Score", 'Hours_Studied', 'Test_Score'),
    ("Sleep Hours vs Test Score", 'Sleep_Hours', 'Test_Score'),
    ("Experience vs Income", 'Experience_Years', 'Income'),
    ("Age vs Income", 'Age', 'Income'),
]
for description, first, second in key_pairs:
    print(f"{description}: {correlation_matrix.loc[first, second]:.3f}")

## Question 32: Support Vector Machine (SVM) for Classification

**Concepts:**
- **SVM**: Supervised learning algorithm that finds the maximum-margin hyperplane separating the classes
- **Hyperplane**: Decision boundary that separates different classes in feature space
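- **Kernel**: Function that measures similarity between samples as if they had been mapped into a higher-dimensional space, without computing that mapping explicitly (linear, RBF, polynomial)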
- **Support Vectors**: Data points closest to the decision boundary
- **RBF Kernel**: Radial Basis Function, useful for non-linear classification
- **Decision boundary**: The line/surface that separates different classes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, precision_score, recall_score, f1_score)
from sklearn.datasets import make_classification

# ==================== LOAD AND PREPARE DATA ====================

# Load the Iris dataset and keep the feature matrix / label vector separately.
iris = datasets.load_iris()
X, y = iris.data, iris.target

section_break = "\n" + "=" * 80 + "\n"
n_rows, n_features = X.shape

print("Dataset Information:")
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_features}")
print(f"Feature names: {iris.feature_names}")
print(f"Target classes: {iris.target_names}")
print(f"Class distribution: {np.bincount(y)}")
print(section_break)

# Tabular view of the same data, with a readable species name next to the
# numeric label.
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y
df['species'] = df['target'].map(species_by_code)

print("First few rows of dataset:")
print(df.head(10))
print(section_break)

# ==================== DATA PREPROCESSING ====================

# Stratified 70/30 train/test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

section_break = "\n" + "=" * 80 + "\n"

print("Data Split:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")
print(section_break)

# Standardize features. The scaler learns its statistics from the training
# split only and is then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

train_means = X_train_scaled.mean(axis=0)
train_stds = X_train_scaled.std(axis=0)
print("Feature Scaling Applied (Standardization)")
print(f"Training set mean: {train_means}")
print(f"Training set std: {train_stds}")
print(section_break)

# ==================== TRAIN SVM MODELS ====================

# Fit one SVC per kernel on the scaled training data and report test accuracy.
kernels = ['linear', 'rbf', 'poly']
svm_models = {}

print("Training SVM Models with Different Kernels:\n")
for kernel in kernels:
    svm = SVC(kernel=kernel, C=1.0, random_state=42)
    svm_models[kernel] = svm.fit(X_train_scaled, y_train)  # fit() returns the estimator

    # Score this kernel on the held-out test split.
    y_pred = svm.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    kernel_label = kernel.upper()

    print(f"{kernel_label} Kernel - Accuracy: {accuracy*100:.2f}%")

print("\n" + "="*80 + "\n")

# The RBF model is the one examined in detail below; this choice is fixed,
# not derived from the accuracies printed above.
best_kernel = 'rbf'
svm_rbf = svm_models[best_kernel]
y_pred = svm_rbf.predict(X_test_scaled)

# ==================== MODEL EVALUATION ====================
# Report test-set metrics, the confusion matrix, a per-class report and a
# 5-fold cross-validation estimate for the selected SVM.
from sklearn.pipeline import make_pipeline

print(f"Detailed Evaluation for SVM with {best_kernel.upper()} Kernel:\n")

# Calculate metrics on the held-out test split ('weighted' averages the
# per-class scores by class support).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1-Score: {f1*100:.2f}%")
print(f"\nNumber of Support Vectors: {svm_rbf.n_support_}")
print("\n" + "="*80 + "\n")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print("\n" + "="*80 + "\n")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print("\n" + "="*80 + "\n")

# Cross-validation score.
# Cross-validate on the *unscaled* training data with the scaler inside a
# pipeline, so the scaling statistics are re-estimated from each fold's
# training portion. Scoring on X_train_scaled would let every validation fold
# influence the scaler (data leakage). cross_val_score clones the estimator
# for each fold, so the already-fitted svm_rbf is not modified.
cv_pipeline = make_pipeline(StandardScaler(), svm_rbf)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
print(f"Cross-Validation Scores (5-fold): {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean()*100:.2f}% (+/- {cv_scores.std()*100:.2f}%)")
print("\n" + "="*80 + "\n")

# ==================== VISUALIZATION ====================

# 1. Confusion Matrix Heatmap (rows = actual class, columns = predicted class)
class_labels = iris.target_names
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix - SVM ({best_kernel.upper()} Kernel)', fontsize=14, pad=20)
plt.ylabel('Actual Class')
plt.xlabel('Predicted Class')
plt.tight_layout()
plt.show()

# 2. Model Comparison: test accuracy of every trained kernel as a bar chart
kernel_accuracies = {
    kernel: accuracy_score(y_test, model.predict(X_test_scaled))
    for kernel, model in svm_models.items()
}
accuracy_percentages = [acc * 100 for acc in kernel_accuracies.values()]

plt.figure(figsize=(10, 6))
bars = plt.bar(list(kernel_accuracies), accuracy_percentages,
               color=['skyblue', 'lightcoral', 'lightgreen'])
plt.xlabel('Kernel Type', fontsize=12)
plt.ylabel('Accuracy (%)', fontsize=12)
plt.title('SVM Performance Comparison - Different Kernels', fontsize=14, pad=20)
plt.ylim([0, 105])
plt.grid(axis='y', alpha=0.3)

# Write each bar's value just above its top edge.
for bar in bars:
    height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2.
    plt.text(bar_center, height, f'{height:.2f}%',
             ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.show()

# 3. Decision Boundary Visualization
# A separate RBF SVM is trained on the first two features only (sepal length
# and width) so its decision regions can be drawn in a plane.
X_2d = X[:, :2]
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d, y, test_size=0.3, random_state=42, stratify=y)

# Scale the 2D data (statistics come from the training split only)
scaler_2d = StandardScaler()
X_train_2d_scaled = scaler_2d.fit_transform(X_train_2d)
X_test_2d_scaled = scaler_2d.transform(X_test_2d)

# Train SVM on 2D data
svm_2d = SVC(kernel='rbf', C=1.0, random_state=42)
svm_2d.fit(X_train_2d_scaled, y_train_2d)

# Regular grid covering the scaled training data plus a margin of 1 on each side.
h = 0.02  # step size in mesh
x_min, y_min = X_train_2d_scaled.min(axis=0) - 1
x_max, y_max = X_train_2d_scaled.max(axis=0) + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predicted class for every grid point, reshaped back to the grid layout.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = svm_2d.predict(grid_points).reshape(xx.shape)

# Two side-by-side panels sharing the same decision regions.
plt.figure(figsize=(12, 5))

# (subplot index, points, true labels, title, highlight misclassified points?)
boundary_panels = [
    (1, X_train_2d_scaled, y_train_2d, 'SVM Decision Boundary - Training Data', False),
    (2, X_test_2d_scaled, y_test_2d, 'SVM Decision Boundary - Test Data', True),
]
for panel_index, points, labels, panel_title, mark_errors in boundary_panels:
    plt.subplot(1, 2, panel_index)
    plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)
    scatter = plt.scatter(points[:, 0], points[:, 1],
                          c=labels, cmap=plt.cm.RdYlBu, edgecolors='black', s=50)
    if mark_errors:
        # Mark misclassified points with red circles
        y_pred_2d = svm_2d.predict(points)
        misclassified = points[labels != y_pred_2d]
        if len(misclassified) > 0:
            plt.scatter(misclassified[:, 0], misclassified[:, 1],
                        s=200, facecolors='none', edgecolors='red', linewidths=2)
    plt.xlabel(f'{iris.feature_names[0]} (scaled)')
    plt.ylabel(f'{iris.feature_names[1]} (scaled)')
    plt.title(panel_title)
    plt.colorbar(scatter, label='Class', ticks=[0, 1, 2])

plt.tight_layout()
plt.show()

# 4. Feature Importance Visualization (using all features)
# The mean of the standardized training data is ~0 for every feature by
# construction, so plotting its magnitude only shows floating-point noise.
# Instead, use the linear-kernel SVM: its coef_ holds one weight vector per
# pair of classes, and because the inputs are standardized the absolute
# weights are comparable across features. (RBF models expose no coef_.)
feature_names = iris.feature_names
linear_coefficients = svm_models['linear'].coef_
feature_importance = np.abs(linear_coefficients).mean(axis=0)

plt.figure(figsize=(10, 6))
bars = plt.barh(feature_names, feature_importance, color='teal')
plt.xlabel('Mean |Coefficient| Across Class Pairs', fontsize=12)
plt.title('Feature Importance - Linear SVM Coefficients', fontsize=14, pad=20)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# 5. Support Vectors Visualization
# Overlay the 2-D model's support vectors on its decision regions and
# training points.
support_vectors = svm_2d.support_vectors_
support_vector_indices = svm_2d.support_
n_support_vectors = len(support_vectors)

plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)

# Training points, coloured by their true class.
train_x, train_y = X_train_2d_scaled[:, 0], X_train_2d_scaled[:, 1]
plt.scatter(train_x, train_y,
            c=y_train_2d, cmap=plt.cm.RdYlBu, edgecolors='black', s=50, alpha=0.7)

# Hollow green rings around the support vectors.
plt.scatter(support_vectors[:, 0], support_vectors[:, 1],
            s=200, facecolors='none', edgecolors='green', linewidths=3,
            label=f'Support Vectors (n={n_support_vectors})')

plt.xlabel(f'{iris.feature_names[0]} (scaled)')
plt.ylabel(f'{iris.feature_names[1]} (scaled)')
plt.title('SVM Support Vectors Highlighted', fontsize=14, pad=20)
plt.legend(loc='best')
plt.tight_layout()
plt.show()

print("\nVisualization Complete!")
print(f"Total Support Vectors: {n_support_vectors}")
print(f"Support Vectors per class: {svm_2d.n_support_}")