# Lab Exam - Set 3

This notebook contains implementations for all questions in Set 3.

## Question 9: NumPy Arrays - Indexing, Slicing, and Universal Functions

**Concepts:**
- **Creating from lists**: Convert Python lists to NumPy arrays
- **Indexing**: Accessing single elements using position
- **Slicing**: Extracting subarrays using start:stop:step
- **Universal functions (ufuncs)**: Fast element-wise operations (sqrt, exp, sin, etc.)
- **Broadcasting**: Operating on arrays of different shapes

In [None]:
import numpy as np

# Question 9 walkthrough: build arrays from lists, then demonstrate indexing,
# slicing, universal functions, aggregations, boolean masks and broadcasting.

# Create NumPy arrays from lists
list1 = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
list2 = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]

arr1 = np.array(list1)  # 1-D array, shape (10,)
arr2 = np.array(list2)  # 2-D array, shape (3, 4)

print("Array 1 (1D):", arr1)
print("\nArray 2 (2D):\n", arr2)

# Indexing operations (indices are 0-based; negative indices count from the end)
print("\n\n=== INDEXING ===")
print(f"First element of arr1: {arr1[0]}")
print(f"Last element of arr1: {arr1[-1]}")
print(f"Element at position 5: {arr1[5]}")
print(f"\nElement at row 1, col 2 in arr2: {arr2[1, 2]}")
print(f"First row of arr2: {arr2[0]}")
print(f"Last column of arr2: {arr2[:, -1]}")

# Slicing operations (start:stop:step, stop is exclusive)
print("\n\n=== SLICING ===")
print(f"First 5 elements: {arr1[:5]}")
print(f"Elements from index 3 to 7: {arr1[3:8]}")
print(f"Every 2nd element: {arr1[::2]}")
print(f"Last 3 elements: {arr1[-3:]}")
print(f"Reverse array: {arr1[::-1]}")

print(f"\nFirst 2 rows of arr2:\n{arr2[:2]}")
print(f"\nFirst 2 columns:\n{arr2[:, :2]}")
# arr2 has 3 rows, so rows 0-1 are the top two rows, not a vertical centre;
# the label states exactly which rows/columns are taken.
print(f"\nTop-middle 2x2 subarray (rows 0-1, cols 1-2):\n{arr2[0:2, 1:3]}")

# Universal functions (ufuncs)
print("\n\n=== UNIVERSAL FUNCTIONS ===")
arr3 = np.array([1, 4, 9, 16, 25])
print(f"Original array: {arr3}")

# Mathematical ufuncs
print(f"Square root: {np.sqrt(arr3)}")
print(f"Square: {np.square(arr3)}")
# exp is applied to a separate small input (not arr3), so say so in the output.
exp_input = np.array([1, 2, 3])
print(f"Exponential of {exp_input}: {np.exp(exp_input)}")
print(f"Log (natural): {np.log(arr3)}")
print(f"Log10: {np.log10(arr3)}")

# Trigonometric ufuncs (NumPy trig functions expect radians)
angles = np.array([0, 30, 45, 60, 90])
radians = np.deg2rad(angles)
print(f"\nAngles: {angles}°")
print(f"Sin values: {np.sin(radians)}")
# cos(90°) prints as a tiny non-zero number because of floating-point rounding.
print(f"Cos values: {np.cos(radians)}")

# Statistical aggregations: these reduce the whole array to one value
# (they are reductions, not element-wise ufuncs).
print(f"\nArray 1: {arr1}")
print(f"Sum: {np.sum(arr1)}")
print(f"Mean: {np.mean(arr1)}")
print(f"Median: {np.median(arr1)}")
print(f"Std deviation: {np.std(arr1)}")
print(f"Min: {np.min(arr1)}")
print(f"Max: {np.max(arr1)}")

# Comparison and logical ufuncs
print(f"\nElements > 50: {arr1[arr1 > 50]}")
print(f"Boolean mask (>50): {arr1 > 50}")
print(f"Count of elements > 50: {np.sum(arr1 > 50)}")

# Broadcasting: operands of different shapes are stretched to a common shape
print("\n\n=== BROADCASTING ===")
arr2_scaled = arr2 * 10  # scalar is applied to every element of the (3, 4) array
print(f"arr2 * 10 (scalar with 2D array):\n{arr2_scaled}")

col_offsets = np.array([100, 200, 300, 400])  # shape (4,): one offset per column
arr2_col_shifted = arr2 + col_offsets
print(f"\narr2 + {col_offsets} (shape (3, 4) + shape (4,)):\n{arr2_col_shifted}")

row_offsets = np.array([[1000], [2000], [3000]])  # shape (3, 1): one offset per row
arr2_row_shifted = arr2 + row_offsets
print(f"\narr2 + column vector (shape (3, 4) + shape (3, 1)):\n{arr2_row_shifted}")

## Question 10: Label Encoding and Missing Value Imputation

**Concepts:**
- **Categorical data**: Non-numeric data (colors, categories, names)
- **Label encoding**: Converting categories to numbers (0, 1, 2, ...)
- **Imputation**: Filling missing values with statistical measures
- **SimpleImputer**: sklearn tool for handling missing values
- **Strategies**: mean, median, most_frequent, constant

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Build a small employee table mixing text and numeric columns.
# Every column except 'Name' deliberately contains at least one gap (NaN).
NA = np.nan  # short alias that keeps the rows below readable

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry'],
    'Gender': ['Female', 'Male', 'Male', NA, 'Female', 'Male', 'Female', 'Male'],
    'City': ['Mumbai', 'Delhi', NA, 'Mumbai', 'Bangalore', 'Delhi', 'Mumbai', NA],
    'Education': ['Graduate', 'Post-Graduate', 'Graduate', 'High School', NA,
                  'Graduate', 'Post-Graduate', 'Graduate'],
    'Age': [25, NA, 30, 28, 35, NA, 27, 29],
    'Salary': [50000, 75000, NA, 60000, NA, 55000, 70000, 65000],
}
df = pd.DataFrame(data)

print("Original Dataset:")
print(df)

# Per-column tally of the gaps that the later steps have to deal with
missing_per_column = df.isna().sum()
print("\n\nMissing values count:")
print(missing_per_column)

# Label Encoding for categorical columns
print("\n\n=== LABEL ENCODING ===")


def encode_non_missing(frame, column):
    """Label-encode the non-missing entries of ``frame[column]`` in place.

    The codes are written to a new ``<column>_Encoded`` column. Rows where
    ``column`` is NaN are skipped, so they stay NaN in the new column and can
    be imputed afterwards.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to modify in place.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    LabelEncoder
        The encoder fitted on the non-missing values of ``column``.
    """
    encoder = LabelEncoder()
    present = frame[column].notna()  # LabelEncoder cannot be fed NaN, so mask it out
    frame.loc[present, f'{column}_Encoded'] = encoder.fit_transform(frame.loc[present, column])
    return encoder


def encoding_map(encoder):
    """Return ``{category: code}`` for a fitted encoder using plain Python types.

    Converting to ``str``/``int`` keeps the printed mapping readable; NumPy
    scalars would otherwise show up as e.g. ``np.int64(0)`` on NumPy 2.
    """
    codes = encoder.transform(encoder.classes_)
    return {str(label): int(code) for label, code in zip(encoder.classes_, codes)}


# Create a copy for encoding so the original DataFrame stays untouched
df_encoded = df.copy()

# Encode Gender
le_gender = encode_non_missing(df_encoded, 'Gender')
print(f"\nGender Encoding: {encoding_map(le_gender)}")

# Encode City
le_city = encode_non_missing(df_encoded, 'City')
print(f"City Encoding: {encoding_map(le_city)}")

# Encode Education
le_education = encode_non_missing(df_encoded, 'Education')
print(f"Education Encoding: {encoding_map(le_education)}")

print("\n\nDataFrame with Encoded Columns:")
print(df_encoded)

# Handle missing values using imputation
print("\n\n=== MISSING VALUE IMPUTATION ===")

numeric_cols = ['Age', 'Salary']
encoded_cols = ['Gender_Encoded', 'City_Encoded', 'Education_Encoded']
# Columns that make up the final, model-ready table. The raw text columns
# (Gender, City, Education) are not imputed and are left out of this list.
final_cols = ['Name'] + numeric_cols + encoded_cols

# For numerical columns - use mean imputation
imputer_mean = SimpleImputer(strategy='mean')
df_encoded[numeric_cols] = imputer_mean.fit_transform(df_encoded[numeric_cols])

# statistics_ holds the fitted fill value per column, in the order of numeric_cols
print(f"\nImputed Age values (using mean={imputer_mean.statistics_[0]:.2f}):")
print(df_encoded['Age'])

print(f"\nImputed Salary values (using mean={imputer_mean.statistics_[1]:.2f}):")
print(df_encoded['Salary'])

# For categorical encoded columns - use most frequent
imputer_freq = SimpleImputer(strategy='most_frequent')
df_encoded[encoded_cols] = imputer_freq.fit_transform(df_encoded[encoded_cols])
# The codes were stored as floats only because NaN was present; once the gaps
# are filled they are whole numbers again, so restore an integer dtype.
df_encoded = df_encoded.astype({col: int for col in encoded_cols})

print("\n\nFinal Dataset (After Encoding and Imputation):")
print(df_encoded[final_cols])

# Check only the final columns: the raw text columns still hold their original
# NaNs and would otherwise be reported as "missing after imputation".
print("\n\nMissing values after imputation:")
print(df_encoded[final_cols].isnull().sum())

print("\n\nSummary Statistics:")
print(df_encoded[numeric_cols + encoded_cols].describe())

## Question 11: Scatter Plots with Regression Lines and Bar Plots with Error Bars

**Concepts:**
- **Regression line**: Best-fit line showing trend between variables
- **Linear regression**: Finding the straight line that minimizes the sum of squared errors (least squares)
- **Error bars**: Show variability/uncertainty in data
- **Standard error**: Standard deviation of the sample mean (sample standard deviation / √n)
- **Confidence intervals**: Range where true value likely lies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Create dataset
# The global seed is fixed so the synthetic data below is reproducible.
# The order of the random draws matters: changing it changes every value.
np.random.seed(42)
n = 50

# Feature relationships with noise
# Sales: linear in advertising spend plus Gaussian noise
advertising_spend = np.random.uniform(low=1000, high=10000, size=n)
sales_noise = np.random.normal(loc=0, scale=1000, size=n)
sales = 5000 + 0.8 * advertising_spend + sales_noise

# Exam score: linear in study hours plus Gaussian noise
study_hours = np.random.uniform(low=1, high=10, size=n)
score_noise = np.random.normal(loc=0, scale=5, size=n)
exam_score = 30 + 6 * study_hours + score_noise

# Create DataFrame (one column per generated series)
column_names = ['Advertising', 'Sales', 'Study_Hours', 'Exam_Score']
column_values = [advertising_spend, sales, study_hours, exam_score]
df = pd.DataFrame(dict(zip(column_names, column_values)))

def plot_scatter_with_regression(ax, x, y, *, color, xlabel, ylabel, title):
    """Draw a scatter plot of ``y`` against ``x`` with its least-squares line.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    x, y : array-like
        Paired observations, passed straight to ``stats.linregress``.
    color : str
        Colour of the scatter markers.
    xlabel, ylabel : str
        Axis labels.
    title : str
        First line of the axes title; the fitted R² is appended on a second line.

    Returns
    -------
    The ``stats.linregress`` result, which unpacks to
    ``(slope, intercept, r_value, p_value, std_err)``.
    """
    ax.scatter(x, y, alpha=0.6, s=50, color=color, label='Data points')

    fit = stats.linregress(x, y)
    # A straight line is fully determined by two points, so draw it between the
    # smallest and largest x instead of through every (unsorted) sample.
    x_line = np.array([np.min(x), np.max(x)])
    # The '+' format flag prints the intercept's own sign, so a negative
    # intercept renders as "x-5" rather than "x+-5".
    ax.plot(x_line, fit.slope * x_line + fit.intercept, 'r-', linewidth=2,
            label=f'Regression line\ny={fit.slope:.2f}x{fit.intercept:+.0f}')

    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(f'{title}\nR² = {fit.rvalue**2:.3f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    return fit


# Scatter plots with regression lines
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Scatter Plots with Regression Lines', fontsize=16)

# Plot 1: Advertising vs Sales
slope1, intercept1, r_value1, p_value1, std_err1 = plot_scatter_with_regression(
    axes[0], df['Advertising'], df['Sales'],
    color='blue',
    xlabel='Advertising Spend ($)',
    ylabel='Sales ($)',
    title='Advertising vs Sales',
)

# Plot 2: Study Hours vs Exam Score
slope2, intercept2, r_value2, p_value2, std_err2 = plot_scatter_with_regression(
    axes[1], df['Study_Hours'], df['Exam_Score'],
    color='green',
    xlabel='Study Hours',
    ylabel='Exam Score',
    title='Study Hours vs Exam Score',
)

plt.tight_layout()
plt.show()

# Bar plots with error bars
# Create categorical data: one array of simulated scores per product
categories = ['Product A', 'Product B', 'Product C', 'Product D']
samples_per_category = 20

data_cat = {
    'Product A': np.random.normal(75, 10, samples_per_category),
    'Product B': np.random.normal(85, 8, samples_per_category),
    'Product C': np.random.normal(70, 12, samples_per_category),
    'Product D': np.random.normal(90, 7, samples_per_category)
}

# Calculate means, standard errors and standard deviations per category.
# stats.sem uses the sample standard deviation (ddof=1) by default, whereas
# np.std defaults to the population formula (ddof=0). Passing ddof=1 keeps the
# two consistent, so that SE == SD / sqrt(n) holds for the values shown.
means = [np.mean(data_cat[cat]) for cat in categories]
std_errors = [stats.sem(data_cat[cat]) for cat in categories]  # Standard error of mean
std_devs = [np.std(data_cat[cat], ddof=1) for cat in categories]  # Sample standard deviation

# Create bar plots: two side-by-side panels showing the same means, once with
# standard-error whiskers and once with standard-deviation whiskers.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Bar Plots with Error Bars', fontsize=16)

bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'plum']

# One entry per panel: which spread to draw and how to decorate that panel.
panel_specs = [
    dict(spread=std_errors, ecolor='black', legend='Standard Error',
         title='Product Performance with Standard Error Bars',
         y_top=100, label_gap=2),
    dict(spread=std_devs, ecolor='red', legend='Standard Deviation',
         title='Product Performance with Standard Deviation Bars',
         y_top=110, label_gap=3),
]

for ax, spec in zip(axes, panel_specs):
    ax.bar(categories, means, color=bar_colors, alpha=0.7, edgecolor='black')
    # fmt='none' draws only the whiskers, no marker or connecting line
    ax.errorbar(categories, means, yerr=spec['spread'], fmt='none',
                ecolor=spec['ecolor'], capsize=5, capthick=2, label=spec['legend'])
    ax.set_xlabel('Product Category', fontsize=11)
    ax.set_ylabel('Average Performance Score', fontsize=11)
    ax.set_title(spec['title'])
    ax.set_ylim(0, spec['y_top'])
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Add value labels ("mean±spread") just above each whisker cap
    for x_pos, (bar_mean, bar_spread) in enumerate(zip(means, spec['spread'])):
        ax.text(x_pos, bar_mean + bar_spread + spec['label_gap'],
                f'{bar_mean:.1f}±{bar_spread:.1f}',
                ha='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

# Print the per-product statistics. The values are taken from the lists that
# were used to draw the bar plots rather than recomputed from data_cat, so the
# printed summary cannot disagree with the plotted bars and labels.
print("Statistical Summary:")
print("\nProduct Performance Statistics:")
for cat, mean_val, se_val, sd_val in zip(categories, means, std_errors, std_devs):
    print(f"{cat}: Mean={mean_val:.2f}, SE={se_val:.2f}, SD={sd_val:.2f}")

## Question 12: Decision Tree Classifier using Gini Index

**Concepts:**
- **Decision Tree**: Tree-like model making decisions based on feature values
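- **Gini Index**: Measure of node impurity, $1 - \sum_k p_k^2$ (0 = pure; the maximum is $1 - 1/K$ for $K$ classes, i.e. 0.5 for two classes and about 0.667 for the three Iris classes)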
- **Splitting**: Dividing data based on feature thresholds
- **Leaf nodes**: Final classification decisions
- **Tree depth**: Length of the longest path from the root node to a leaf

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load dataset
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # integer class labels

print("Dataset Information:")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {iris.feature_names}")
print(f"Classes: {iris.target_names}")

# Split dataset (70% train / 30% test).
# stratify=y keeps the class proportions the same in both parts, so the
# accuracy and per-class metrics below are not skewed by an uneven test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

# Create Decision Tree using Gini Index
# Hyper-parameters are collected in one place so they are easy to tweak.
tree_settings = dict(
    criterion='gini',      # Use Gini Index for splitting
    max_depth=4,           # Limit tree depth
    random_state=42,
    min_samples_split=2,
    min_samples_leaf=1,
)
dt_classifier = DecisionTreeClassifier(**tree_settings)

# Train the model on the training split
dt_classifier.fit(X_train, y_train)

# Make predictions for the held-out test split
y_pred = dt_classifier.predict(X_test)

# Evaluate model: fraction of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"\n\nModel Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix, computed from (y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Visualize confusion matrix as an annotated heatmap on an explicit Axes
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
    ax=ax_cm,
)
ax_cm.set_title('Confusion Matrix - Decision Tree Classifier (Gini Index)', fontsize=14)
ax_cm.set_ylabel('Actual', fontsize=12)
ax_cm.set_xlabel('Predicted', fontsize=12)
fig_cm.tight_layout()
plt.show()

# Classification Report for the test split
report_text = classification_report(y_test, y_pred, target_names=iris.target_names)
print("\n\nClassification Report:")
print(report_text)

# Feature Importance as reported by the fitted tree, one score per feature
feature_importance = dt_classifier.feature_importances_
print("\nFeature Importance:")
for feat_name, feat_score in zip(iris.feature_names, feature_importance):
    print(f"{feat_name}: {feat_score:.4f}")

# Plot feature importance
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
ax_imp.bar(iris.feature_names, feature_importance, color='skyblue', edgecolor='black')
ax_imp.set_xlabel('Features', fontsize=12)
ax_imp.set_ylabel('Importance Score', fontsize=12)
ax_imp.set_title('Feature Importance in Decision Tree', fontsize=14)
# Tilt the feature names on the x axis
for tick_label in ax_imp.get_xticklabels():
    tick_label.set_rotation(45)
ax_imp.grid(True, alpha=0.3, axis='y')
fig_imp.tight_layout()
plt.show()

# Visualize the Decision Tree on an explicit, wide Axes
fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
plot_tree(
    dt_classifier,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax_tree,
)
ax_tree.set_title('Decision Tree Visualization (Gini Index)', fontsize=16, pad=20)
fig_tree.tight_layout()
plt.show()

# Tree statistics read back from the fitted estimator
tree_depth = dt_classifier.get_depth()
leaf_count = dt_classifier.get_n_leaves()
node_count = dt_classifier.tree_.node_count
print(f"\n\nTree Statistics:")
print(f"Tree depth: {tree_depth}")
print(f"Number of leaves: {leaf_count}")
print(f"Total number of nodes: {node_count}")