# Lab Exam - Set 2

This notebook contains implementations for all questions in Set 2.

## Question 5: Pandas DataFrame - Ranking, Sorting, and Aggregation

**Concepts:**
- **DataFrame from dictionary**: Creating structured data from key-value pairs
- **Ranking**: Assigning rank positions based on values
- **Sorting**: Ordering data by specific columns
- **Aggregation**: Computing summary statistics (sum, mean, max, etc.)
- **GroupBy**: Grouping data by categories for analysis

In [None]:
import pandas as pd
import numpy as np

# Question 5: build a small employee table, then rank, sort and aggregate it.
data = dict(
    Employee=['John', 'Emma', 'Michael', 'Sophia', 'William', 'Olivia', 'James', 'Ava'],
    Department=['IT', 'HR', 'IT', 'Finance', 'HR', 'IT', 'Finance', 'HR'],
    Salary=[75000, 65000, 82000, 70000, 60000, 78000, 72000, 68000],
    Experience=[5, 3, 8, 6, 2, 7, 4, 5],
)

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Ranking: rank 1 goes to the largest value. Ranks are floats because pandas'
# default tie handling averages the positions of tied rows (two employees
# share Experience == 5 here).
for source_col in ('Salary', 'Experience'):
    df[f'{source_col}_Rank'] = df[source_col].rank(ascending=False)
print("\n\nDataFrame with Rankings:")
print(df)

# Sorting: single key, then a compound key with a per-key direction.
by_salary = df.sort_values(by='Salary', ascending=False)
print("\n\nSorted by Salary (descending):")
print(by_salary)

by_dept_then_salary = df.sort_values(by=['Department', 'Salary'], ascending=[True, False])
print("\n\nSorted by Department and then Salary:")
print(by_dept_then_salary)

# Whole-table aggregations.
salary = df['Salary']
experience = df['Experience']
print("\n\nBasic Aggregations:")
print(f"Total Salary: ${salary.sum():,}")
print(f"Average Salary: ${salary.mean():,.2f}")
print(f"Max Experience: {experience.max()} years")
print(f"Min Salary: ${salary.min():,}")

# Per-department aggregations; one groupby object serves both summaries.
by_department = df.groupby('Department')

print("\n\nDepartment-wise Statistics:")
dept_stats = by_department.agg({
    'Salary': ['mean', 'sum', 'count'],
    'Experience': ['mean', 'max'],
})
print(dept_stats)

# Named aggregation produces the friendly column names directly.
print("\n\nDepartment Summary:")
summary = by_department['Salary'].agg(
    Employees='count',
    Avg_Salary='mean',
    Min_Salary='min',
    Max_Salary='max',
)
print(summary)

## Question 6: Skewness Detection and Min-Max Normalization

**Concepts:**
- **Skewness**: Measure of asymmetry in data distribution (positive/negative/zero)
- **Skew values**: roughly -0.5 to 0.5 = approximately symmetric; values outside that range indicate a skewed distribution (rule of thumb)
- **Min-Max Scaling**: Normalizes data to range [0, 1] using formula: (x - min) / (max - min)
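- **Why normalize**: Puts features on a common scale so they are comparable and improves the performance of scale-sensitive ML models; note that Min-Max scaling is a linear rescaling, so it does not change a distribution's skewness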

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Question 6: measure the skewness of several synthetic columns, rescale every
# column to [0, 1] with Min-Max scaling, and compare histograms before/after.

# Fixed seed so the synthetic samples (and the reported skew values) are
# reproducible from run to run.
np.random.seed(42)
data = {
    'Normal_Data': np.random.normal(50, 10, 100),
    'Positive_Skew': np.random.exponential(2, 100),  # Right-skewed
    'Negative_Skew': 100 - np.random.exponential(2, 100),  # Left-skewed
    'Income': np.random.lognormal(10, 1, 100)  # Real-world skewed example
}

df = pd.DataFrame(data)

# Detect skewness (one value per column) and classify it with the
# +/-0.5 rule of thumb.
print("Skewness Values:")
skewness = df.skew()
print(skewness)
print("\nInterpretation:")
for col, skew_val in skewness.items():
    if -0.5 <= skew_val <= 0.5:
        status = "Normal (Symmetric)"
    elif skew_val > 0.5:
        status = "Positive Skew (Right-tailed)"
    else:
        status = "Negative Skew (Left-tailed)"
    print(f"{col}: {skew_val:.2f} - {status}")

# One subplot column per DataFrame column: top row = original data,
# bottom row = normalized data. The grid is sized from the data instead of a
# hard-coded 4, and squeeze=False keeps `axes` two-dimensional even if the
# frame ever has a single column.
n_cols = len(df.columns)
fig, axes = plt.subplots(2, n_cols, figsize=(4 * n_cols, 8), squeeze=False)
fig.suptitle('Before and After Normalization', fontsize=16)

for idx, col in enumerate(df.columns):
    axes[0, idx].hist(df[col], bins=20, color='coral', edgecolor='black')
    axes[0, idx].set_title(f'{col}\nSkew: {df[col].skew():.2f}')
    axes[0, idx].set_ylabel('Frequency')

# Normalize with Min-Max scaling: (x - min) / (max - min) per column.
# NOTE: this is a linear rescale, so it maps each column onto [0, 1] but does
# NOT reduce skewness -- the histogram shapes in the bottom row match the top
# row. Reducing skew would need a non-linear transform (e.g. log or power)
# applied before scaling.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,  # keep row labels aligned with the original frame
)

print("\n\nOriginal Data Statistics:")
print(df.describe())

print("\n\nNormalized Data Statistics (0-1 range):")
print(df_normalized.describe())

# Visualize normalized data
for idx, col in enumerate(df_normalized.columns):
    axes[1, idx].hist(df_normalized[col], bins=20, color='skyblue', edgecolor='black')
    axes[1, idx].set_title(f'Normalized {col}')
    axes[1, idx].set_xlabel('Value (0-1)')
    axes[1, idx].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Question 7: Scatter Plots and Correlation Heatmap

**Concepts:**
- **Scatter plot**: Shows relationship between two continuous variables
- **Correlation**: Measure of linear relationship (-1 to +1)
- **Positive correlation**: Both variables increase together
- **Negative correlation**: As one variable increases, the other decreases
- **Heatmap**: Color-coded matrix showing correlation strengths

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Question 7: synthesize features with known relationships, show them as
# scatter plots, then summarize all pairwise correlations in a heatmap.

np.random.seed(42)
n = 100

# Columns are generated in this exact order so the seeded random stream
# yields the same samples on every run.
data = {'Study_Hours': np.random.uniform(1, 10, n)}
data['Test_Score'] = 40 + 5 * data['Study_Hours'] + np.random.normal(0, 5, n)  # rises with study hours
data['Sleep_Hours'] = np.random.uniform(4, 10, n)
data['Stress_Level'] = 100 - 5 * data['Sleep_Hours'] + np.random.normal(0, 8, n)  # falls with sleep hours
data['Random_Variable'] = np.random.uniform(0, 100, n)  # independent of everything else

df = pd.DataFrame(data)

# 2x2 grid of scatter plots, described as (axis, x column, y column, colour).
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Scatter Plots - Feature Relationships', fontsize=16)

panels = [
    (axes[0, 0], 'Study_Hours', 'Test_Score', 'blue'),         # positive correlation
    (axes[0, 1], 'Sleep_Hours', 'Stress_Level', 'red'),        # negative correlation
    (axes[1, 0], 'Study_Hours', 'Random_Variable', 'green'),   # no correlation
    (axes[1, 1], 'Test_Score', 'Stress_Level', 'purple'),
]
for ax, x_col, y_col, colour in panels:
    # Human-readable labels are the column names with underscores as spaces.
    x_label = x_col.replace('_', ' ')
    y_label = y_col.replace('_', ' ')
    ax.scatter(df[x_col], df[y_col], alpha=0.6, color=colour)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(f'{x_label} vs {y_label}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Pairwise correlation of every column with every other column.
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Heatmap of the matrix; the diverging colormap is centred on zero so the
# sign of each coefficient is visible at a glance.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    fmt='.2f',
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

print("\n\nInterpretation:")
for line in (
    "Values close to +1: Strong positive correlation",
    "Values close to -1: Strong negative correlation",
    "Values close to 0: No correlation",
):
    print(line)

## Question 8: Support Vector Machine (SVM) Classifier with Decision Boundary

**Concepts:**
- **SVM**: Supervised learning algorithm that finds the optimal (maximum-margin) hyperplane to separate classes
- **Linear kernel**: Creates straight decision boundary
- **Decision boundary**: Line/surface separating different classes
- **Support vectors**: Data points closest to decision boundary
- **Margin**: Distance between decision boundary and nearest points

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Question 8 setup: reduce iris to a two-feature, two-class problem, train a
# linear SVM on it, and report accuracy plus a confusion matrix.

iris = datasets.load_iris()

# Keep only the first two features (sepal length and width) so the data can be
# drawn in 2-D, and drop every sample labelled 2 to make the task binary.
keep = iris.target != 2
X = iris.data[keep, :2]
y = iris.target[keep]

print("Dataset shape:", X.shape)
print("Classes: Setosa (0) and Versicolor (1)")

# 70/30 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the linear-kernel SVM and predict the held-out samples.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"\n\nAccuracy: {accuracy * 100:.2f}%")

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
class_labels = ['Setosa', 'Versicolor']
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix - SVM Classifier')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Visualize decision boundary
def plot_decision_boundary(X, y, model, title,
                           class_names=('Setosa', 'Versicolor'),
                           xlabel='Sepal Length (cm)',
                           ylabel='Sepal Width (cm)',
                           step=0.02):
    """Plot a fitted classifier's decision regions over two features.

    The model is evaluated on a dense grid spanning the data (padded by 1 unit
    on each side); the predicted regions are drawn as filled contours, the
    samples are overlaid coloured by class, and the model's support vectors
    are circled.

    Args:
        X: 2-D array; column 0 is drawn on the x-axis, column 1 on the y-axis.
        y: Class label per row of ``X``; used to colour the points.
        model: Fitted estimator exposing ``predict`` and ``support_vectors_``.
        title: Figure title.
        class_names: Legend labels for the classes, in ascending label order.
            If the count does not match the number of classes found in ``y``,
            the numeric labels are shown instead.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        step: Grid spacing used when sampling the decision regions.
    """
    # Dense grid covering the data range plus a 1-unit border.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Classify every grid point, then fold the flat predictions back into the
    # grid's shape for contouring.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 7))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    points = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu',
                         edgecolors='black', s=100)

    # Hollow green rings mark the support vectors.
    support = plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
                          s=200, linewidth=2, facecolors='none', edgecolors='green',
                          label='Support Vectors')

    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.title(title, fontsize=14)

    # Build the legend from explicit handles. A bare list of labels would be
    # matched positionally to whatever artists exist on the axes (including
    # the contour fill), and a single scatter call cannot supply one entry per
    # class -- so take one handle per class colour from the scatter itself.
    class_handles, numeric_labels = points.legend_elements()
    if len(class_names) == len(class_handles):
        class_labels = list(class_names)
    else:
        class_labels = list(numeric_labels)
    plt.legend(list(class_handles) + [support], class_labels + ['Support Vectors'])

    plt.grid(True, alpha=0.3)
    plt.show()

# Draw the decision regions over the full two-class dataset (train and test).
plot_decision_boundary(X, y, svm, title='SVM Decision Boundary (Linear Kernel)')

# Per-class precision / recall / F1 on the held-out test split.
print("\n\nClassification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=['Setosa', 'Versicolor'],
)
print(report)