<a href="https://colab.research.google.com/github/HiveCase/MachineLearningPractice/blob/main/Week8/MLP_GA8_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# from google.colab import files

# uploaded = files.upload()
import zipfile

# Extract the uploaded archive.zip file
with zipfile.ZipFile("archive.zip", 'r') as zip_ref:
    zip_ref.extractall("mask_dataset")

In [15]:
import os

for root, dirs, files in os.walk("mask_dataset"):
    print(root)
    for d in dirs:
        print("  📁", d)
    for f in files[:5]:  # Print first 5 files only
        print("  📄", f)
    print()


mask_dataset
  📁 data

mask_dataset/data
  📁 without_mask
  📁 with_mask

mask_dataset/data/without_mask
  📄 without_mask_816.jpg
  📄 without_mask_555.jpg
  📄 without_mask_1209.jpg
  📄 without_mask_1074.jpg
  📄 without_mask_1106.jpg

mask_dataset/data/with_mask
  📄 with_mask_1687.jpg
  📄 with_mask_1113.jpg
  📄 with_mask_401.jpg
  📄 with_mask_1000.jpg
  📄 with_mask_1032.jpg



In [16]:
import os
import cv2
import numpy as np

images = []
labels = []

base_path = "mask_dataset/data"
class_folders = os.listdir(base_path)

for label_folder in class_folders:
    folder_path = os.path.join(base_path, label_folder)

    if os.path.isdir(folder_path):
        for img_file in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_file)

            # Read image in grayscale
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is not None:
                # Resize and flatten
                img_resized = cv2.resize(img, (100, 100))
                images.append(img_resized.flatten() / 255.0)
                labels.append(label_folder)

images = np.array(images)
labels = np.array(labels)


In [17]:
without_mask_count = np.sum(labels == 'without_mask')
print("Number of 'without_mask' images:", without_mask_count)

Number of 'without_mask' images: 3828


In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Ensure correct order: without_mask → 1, with_mask → 0
le.fit(['with_mask', 'without_mask'])

y_encoded = le.transform(labels)  # 0 for with_mask, 1 for without_mask
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    images, y_encoded, test_size=0.2, random_state=0
)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    random_state=0,
    max_iter=500,
    tol=0.001,
    C=10
)

model.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

# Confusion matrix: [[TN, FP], [FN, TP]]
cm = confusion_matrix(y_test, y_pred)

false_positives = cm[0][1]
print("False positives (predicted without_mask, actually with_mask):", false_positives)


False positives (predicted without_mask, actually with_mask): 274


In [19]:
import numpy as np
from scipy.ndimage import rotate

def augment_image(images, labels, angles, augmentation_factor):
    np.random.seed(0)  # Reproducibility

    num_images = len(images)
    img_size = int(np.sqrt(images.shape[1]))  # assuming square images e.g., 100x100

    # Reshape flat images to 2D (100, 100)
    images_reshaped = images.reshape((-1, img_size, img_size))

    augmented_images_list = []
    augmented_labels_list = []

    for i in range(num_images):
        # Add original image and label
        augmented_images_list.append(images_reshaped[i])
        augmented_labels_list.append(labels[i])

        for j in range(augmentation_factor):
            angle_index = i * augmentation_factor + j
            angle = angles[angle_index]

            rotated_img = rotate(images_reshaped[i], angle, reshape=False, mode='nearest')
            augmented_images_list.append(rotated_img)
            augmented_labels_list.append(labels[i])

    # Convert to numpy arrays
    augmented_images = np.array(augmented_images_list)
    augmented_labels = np.array(augmented_labels_list)

    # Reshape images back to flattened form (optional, if your model expects flat input)
    augmented_images = augmented_images.reshape((augmented_images.shape[0], -1))

    return augmented_images, augmented_labels

# Set augmentation factor
augmentation_factor = 2

# Generate angles for rotation
np.random.seed(0)
num_train_images = len(X_train)
angle_of_rotation = np.random.uniform(-180, 180, size=(augmentation_factor * num_train_images,))

# Call augment_image function
augmented_images, augmented_labels = augment_image(
    X_train, y_train, angle_of_rotation, augmentation_factor
)

# Compute the sum of elements in augmented_labels[7000:8000]
sum_labels = np.sum(augmented_labels[7000:8000])
print("Sum of augmented_labels from index 7000 to 7999:", sum_labels)


Sum of augmented_labels from index 7000 to 7999: 485


In [20]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Reshape augmented images to 2D (samples, features)
X_aug = augmented_images.reshape((augmented_images.shape[0], -1))
y_aug = augmented_labels

# Step 2: Train RandomForest on all features
rf = RandomForestClassifier(random_state=0)
rf.fit(X_aug, y_aug)

# Step 3: Get top 100 important features
importances = rf.feature_importances_
top_100_indices = np.argsort(importances)[-100:]

# Step 4: Select top 100 features from augmented training and test sets
X_aug_top100 = X_aug[:, top_100_indices]
X_test_reshaped = X_test.reshape((X_test.shape[0], -1))
X_test_top100 = X_test_reshaped[:, top_100_indices]

# Step 5: Train on selected features
rf_top100 = RandomForestClassifier(random_state=0)
rf_top100.fit(X_aug_top100, y_aug)

# Step 6: Predict and calculate misclassifications
y_pred = rf_top100.predict(X_test_top100)
misclassified_count = np.sum(y_pred != y_test)

print("Number of misclassified test images:", misclassified_count)


Number of misclassified test images: 282


In [21]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Flatten images
X_aug_flat = augmented_images.reshape((augmented_images.shape[0], -1))
X_test_flat = X_test.reshape((X_test.shape[0], -1))

# Step 2: Apply PCA
pca = PCA(n_components=100, random_state=0)
X_aug_pca = pca.fit_transform(X_aug_flat)
X_test_pca = pca.transform(X_test_flat)

# Step 3: Train RandomForest on PCA-transformed data
rf_pca = RandomForestClassifier(random_state=0)
rf_pca.fit(X_aug_pca, augmented_labels)

# Step 4: Predict and compute accuracy
y_pred = rf_pca.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test data after PCA + RandomForest:", accuracy)


Accuracy on test data after PCA + RandomForest: 0.786896095301125


In [13]:
# Step 1: Import libraries
import zipfile
import os
import cv2
import numpy as np
from tqdm import tqdm

# Step 2: Unzip the file (assuming it's in the root `/content` folder)
with zipfile.ZipFile("/content/archive.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

# Step 3: Prepare lists for images and labels
image_list = []
label_list = []

# Step 4: Traverse the dataset directory
dataset_path = "/content/dataset"
for label_name in os.listdir(dataset_path):
    label_folder = os.path.join(dataset_path, label_name)
    if os.path.isdir(label_folder):
        for image_file in tqdm(os.listdir(label_folder), desc=f"Processing {label_name}"):
            image_path = os.path.join(label_folder, image_file)
            try:
                # Read image in grayscale
                img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    continue  # skip unreadable files

                # Resize to 100x100
                img_resized = cv2.resize(img, (100, 100))

                # Normalize pixel values
                img_normalized = img_resized / 255.0

                # Flatten and append
                image_list.append(img_normalized.flatten())
                label_list.append(label_name)
            except Exception as e:
                print(f"Error processing {image_file}: {e}")

# Step 5: Convert to numpy arrays
images = np.array(image_list)
labels = np.array(label_list)

# Step 6: Count images with label "without_mask"
without_mask_count = np.sum(labels == "without_mask")
print("Number of images with label 'without_mask':", without_mask_count)


Processing data: 100%|██████████| 2/2 [00:00<00:00, 3833.92it/s]

Number of images with label 'without_mask': 0





In [None]:
# Step 1: Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

# Step 2: Encode labels (without_mask → 1, with_mask → 0)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Ensure correct mapping
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", mapping)
# Expected Output: {'with_mask': 0, 'without_mask': 1}

# Step 3: Split data
X_train, X_test, y_train, y_test = train_test_split(
    images, labels_encoded, test_size=0.2, random_state=0
)

# Step 4: Train Logistic Regression
model = LogisticRegression(
    random_state=0,
    max_iter=500,
    tol=0.001,
    C=10
)
model.fit(X_train, y_train)

# Step 5: Predict and calculate confusion matrix
y_pred = model.predict(X_test)

# Confusion matrix layout:
# [[TN FP]
#  [FN TP]]
cm = confusion_matrix(y_test, y_pred)

false_positives = cm[0][1]
print("Number of false positives (with_mask → predicted without_mask):", false_positives)


In [None]:
import numpy as np
import cv2

# Define the augmentation function
def augment_image(images, labels, angles, augmentation_factor):
    augmented_images = []
    augmented_labels = []

    idx = 0
    for i in range(len(images)):
        original_img = images[i].reshape(100, 100)  # Reshape flattened image
        original_label = labels[i]

        # Append the original image and label
        augmented_images.append(original_img)
        augmented_labels.append(original_label)

        # Generate augmented images by rotation
        for _ in range(augmentation_factor):
            angle = angles[idx]
            M = cv2.getRotationMatrix2D((50, 50), angle, 1.0)  # center=(50,50) for 100x100
            rotated = cv2.warpAffine(original_img, M, (100, 100), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
            augmented_images.append(rotated)
            augmented_labels.append(original_label)
            idx += 1

    # Convert lists to numpy arrays
    augmented_images = np.array(augmented_images).reshape(-1, 100 * 100).astype(np.float32)
    augmented_labels = np.array(augmented_labels)

    return augmented_images, augmented_labels

# Set random seed and generate angles
np.random.seed(0)
augmentation_factor = 2
angle_of_rotation = np.random.uniform(-180, 180, size=augmentation_factor * len(X_train))

# Call the function
aug_images, aug_labels = augment_image(X_train, y_train, angle_of_rotation, augmentation_factor)

# Compute the sum of elements in augmented_labels[7000:8000]
label_sum = np.sum(aug_labels[7000:8000])
print("Sum of augmented_labels[7000:8000]:", label_sum)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Fit RandomForest on augmented training data
rf = RandomForestClassifier(random_state=0)
rf.fit(aug_images, aug_labels)

# Step 2: Get top 100 feature indices based on feature_importances_
importances = rf.feature_importances_
top_100_indices = np.argsort(importances)[-100:]

# Step 3: Select only top 100 features from training and test data
aug_images_top100 = aug_images[:, top_100_indices]
X_test_top100 = X_test[:, top_100_indices]

# Step 4: Train new RandomForest on selected features
rf_top100 = RandomForestClassifier(random_state=0)
rf_top100.fit(aug_images_top100, aug_labels)

# Step 5: Predict and count misclassified points on test data
y_pred_test = rf_top100.predict(X_test_top100)
misclassified = np.sum(y_pred_test != y_test)

print("Number of misclassified test images using top 100 features:", misclassified)


In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Fit PCA on augmented training images
pca = PCA(n_components=100, random_state=0)
aug_images_pca = pca.fit_transform(aug_images)

# Step 2: Transform test images using the same PCA
X_test_pca = pca.transform(X_test)

# Step 3: Train RandomForestClassifier
rf_pca = RandomForestClassifier(random_state=0)
rf_pca.fit(aug_images_pca, aug_labels)

# Step 4: Predict and compute accuracy
y_pred_pca = rf_pca.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred_pca)

print("Accuracy on test data after PCA + RandomForest:", round(accuracy, 4))
