<h3>Import Libraries</h3>

In [25]:
import os
import shutil
import random
import pandas as pd
import torch
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

<h3>Check Directories.</h3>

In [26]:
Train_dir = "Museum_Training/Training/"
# Test_dir = "Museum_Test/Museum_Validation/"

classification_targets = ['museum-indoor', 'museum-outdoor']

def check_directory(directory):
    if os.path.exists(directory):
        print(f"Directory '{directory}' exists.")
    else:
        print(f"Directory '{directory}' does NOT exist.")

check_directory(Train_dir)
# check_directory(Test_dir)

Directory 'Museum_Training/Training/' exists.


In [27]:
def is_image_file(filename):
    valid_extensions = ('.jpg', '.jpeg', '.png')
    return filename.lower().endswith(valid_extensions)

<h3>Pre-process images - greyscale.</h3>

In [28]:
def preprocess_images_grayscale(target_dir, classes, image_size=(256, 256)):
    """
    Preprocess images: Convert to grayscale, resize, and flatten.

    Parameters:
    - target_dir: Path to the target directory containing class folders.
    - classes: List of class names.
    - image_size: Target image size (width, height).

    Returns:
    - X_gray: Flattened grayscale image data.
    - y_gray: Corresponding labels.
    """
    X_grey = []
    y_grey = []

    for label, cls in enumerate(classes):
        class_dir = os.path.join(target_dir, cls)
        for img_name in os.listdir(class_dir):
            if is_image_file(img_name):
                img_path = os.path.join(class_dir, img_name)
                try:
                    # Open Image
                    with Image.open(img_path) as img:
                        img = img.convert('L')  
                        img = img.resize(image_size)  
                        img_array = np.array(img).flatten() 

                        X_grey.append(img_array)
                        y_grey.append(label)

                except Exception as e:
                    print(f"Error processing file {img_path}: {e}")

    X_grey = np.array(X_grey)
    y_grey = np.array(y_grey)
    return X_grey, y_grey

X_grey, y_grey = preprocess_images_grayscale(Train_dir, classification_targets)

print("\nGrayscale Image Data Shape:", X_grey.shape)
print("Labels Shape:", y_grey.shape)


Grayscale Image Data Shape: (10000, 65536)
Labels Shape: (10000,)


<h3>Split data to train and test</h3>

In [30]:
X_grey_train, X_grey_test, y_grey_train, y_grey_test = train_test_split(X_grey, y_grey, test_size=0.2, random_state=42)
print("Training data shape:", X_grey_train.shape)
print("Testing data shape:", X_grey_test.shape)

Training data shape: (8000, 65536)
Testing data shape: (2000, 65536)


<h3>Decision tree algorithm with hyperparameters.</h3>

In [8]:
# Define and train model
dt_model_entropy = DecisionTreeClassifier(criterion='entropy',max_depth=6,min_samples_split=2,random_state=42)
dt_model_entropy.fit(X_grey_train, y_grey_train)

y_train_pred = dt_model_entropy.predict(X_grey_train)
y_test_pred = dt_model_entropy.predict(X_grey_test)

accuracy_train = accuracy_score(y_grey_train, y_train_pred)
accuracy_test = accuracy_score(y_grey_test, y_test_pred)

cm_train = confusion_matrix(y_grey_train, y_train_pred)
cm_test = confusion_matrix(y_grey_test, y_test_pred)

print('--- Training Metrics ---')
print("Accuracy:", accuracy_train)
print("Classification Report:\n", classification_report(y_grey_train, y_train_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_train)
print('\n--- Testing Metrics ---')
print("Accuracy:", accuracy_test)
print("Classification Report:\n", classification_report(y_grey_test, y_test_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_test)

--- Training Metrics ---
Accuracy: 0.730875
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.68      0.87      0.76      3988
museum-outdoor       0.82      0.59      0.69      4012

      accuracy                           0.73      8000
     macro avg       0.75      0.73      0.73      8000
  weighted avg       0.75      0.73      0.73      8000

Confusion Matrix:
 [[3480  508]
 [1645 2367]]

--- Testing Metrics ---
Accuracy: 0.654
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.62      0.82      0.70      1012
museum-outdoor       0.72      0.49      0.58       988

      accuracy                           0.65      2000
     macro avg       0.67      0.65      0.64      2000
  weighted avg       0.67      0.65      0.64      2000

Confusion Matrix:
 [[825 187]
 [505 483]]


In [10]:

dt_model_gini = DecisionTreeClassifier(criterion='gini',max_depth=5,min_samples_split=2,random_state=42)
dt_model_gini.fit(X_grey_train, y_grey_train)

y_train_pred = dt_model_gini.predict(X_grey_train)
y_test_pred = dt_model_gini.predict(X_grey_test)

accuracy_train = accuracy_score(y_grey_train, y_train_pred)
accuracy_test = accuracy_score(y_grey_test, y_test_pred)

cm_train = confusion_matrix(y_grey_train, y_train_pred)
cm_test = confusion_matrix(y_grey_test, y_test_pred)

print('--- Training Metrics ---')
print("Accuracy:", accuracy_train)
print("Classification Report:\n", classification_report(y_grey_train, y_train_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_train)
print('\n--- Testing Metrics ---')
print("Accuracy:", accuracy_test)
print("Classification Report:\n", classification_report(y_grey_test, y_test_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_test)

--- Training Metrics ---
Accuracy: 0.7265
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.73      0.73      0.73      3988
museum-outdoor       0.73      0.73      0.73      4012

      accuracy                           0.73      8000
     macro avg       0.73      0.73      0.73      8000
  weighted avg       0.73      0.73      0.73      8000

Confusion Matrix:
 [[2893 1095]
 [1093 2919]]

--- Testing Metrics ---
Accuracy: 0.653
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.66      0.64      0.65      1012
museum-outdoor       0.64      0.67      0.65       988

      accuracy                           0.65      2000
     macro avg       0.65      0.65      0.65      2000
  weighted avg       0.65      0.65      0.65      2000

Confusion Matrix:
 [[648 364]
 [330 658]]


<h3>Random Forest algorithm with hyperparameters.</h3>

In [11]:
rf_model = RandomForestClassifier(max_depth=6,n_estimators=100, criterion='entropy', random_state=42) 
rf_model.fit(X_grey_train, y_grey_train)

y_train_pred = rf_model.predict(X_grey_train)
y_test_pred = rf_model.predict(X_grey_test)

accuracy_train = accuracy_score(y_grey_train, y_train_pred)
accuracy_test = accuracy_score(y_grey_test, y_test_pred)

cm_train = confusion_matrix(y_grey_train, y_train_pred)
cm_test = confusion_matrix(y_grey_test, y_test_pred)

print('--- Training Metrics ---')
print("Accuracy:", accuracy_train)
print("Classification Report:\n", classification_report(y_grey_train, y_train_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_train)
print('\n--- Testing Metrics ---')
print("Accuracy:", accuracy_test)
print("Classification Report:\n", classification_report(y_grey_test, y_test_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_test)

--- Training Metrics ---
Accuracy: 0.808625
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.82      0.79      0.80      3988
museum-outdoor       0.80      0.83      0.81      4012

      accuracy                           0.81      8000
     macro avg       0.81      0.81      0.81      8000
  weighted avg       0.81      0.81      0.81      8000

Confusion Matrix:
 [[3158  830]
 [ 701 3311]]

--- Testing Metrics ---
Accuracy: 0.738
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.75      0.73      0.74      1012
museum-outdoor       0.73      0.75      0.74       988

      accuracy                           0.74      2000
     macro avg       0.74      0.74      0.74      2000
  weighted avg       0.74      0.74      0.74      2000

Confusion Matrix:
 [[737 275]
 [249 739]]


In [12]:
rf_model_gini = RandomForestClassifier(max_depth=5,n_estimators=50, criterion='gini', random_state=42)  
rf_model_gini.fit(X_grey_train, y_grey_train)

y_train_pred = rf_model_gini.predict(X_grey_train)
y_test_pred = rf_model_gini.predict(X_grey_test)

accuracy_train = accuracy_score(y_grey_train, y_train_pred)
accuracy_test = accuracy_score(y_grey_test, y_test_pred)

cm_train = confusion_matrix(y_grey_train, y_train_pred)
cm_test = confusion_matrix(y_grey_test, y_test_pred)

print('--- Training Metrics ---')
print("Accuracy:", accuracy_train)
print("Classification Report:\n", classification_report(y_grey_train, y_train_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_train)
print('\n--- Testing Metrics ---')
print("Accuracy:", accuracy_test)
print("Classification Report:\n", classification_report(y_grey_test, y_test_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_test)


--- Training Metrics ---
Accuracy: 0.7765
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.80      0.74      0.77      3988
museum-outdoor       0.76      0.81      0.79      4012

      accuracy                           0.78      8000
     macro avg       0.78      0.78      0.78      8000
  weighted avg       0.78      0.78      0.78      8000

Confusion Matrix:
 [[2944 1044]
 [ 744 3268]]

--- Testing Metrics ---
Accuracy: 0.714
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.74      0.67      0.70      1012
museum-outdoor       0.69      0.76      0.72       988

      accuracy                           0.71      2000
     macro avg       0.72      0.71      0.71      2000
  weighted avg       0.72      0.71      0.71      2000

Confusion Matrix:
 [[679 333]
 [239 749]]


<h3>Create pickle files to export the trained models.</h3>

In [13]:
import pickle
with open("rf_model_grey.pkl", "wb") as rf_model_grey:
    pickle.dump(rf_model, rf_model_grey)


In [14]:
with open("dt_model_entropy_grey.pkl", "wb") as dt_model_entropy_grey:
    pickle.dump(dt_model_entropy, dt_model_entropy_grey)

<h3>Pre-preocess images for Boosting algorithm.</h3>

In [None]:
def preprocess_images_grayscale(target_dir, classes, image_size=(128, 128)):
    """
    Preprocess images: Convert to grayscale, resize, and flatten.

    Parameters:
    - target_dir: Path to the target directory containing class folders.
    - classes: List of class names.
    - image_size: Target image size.

    Returns:
    - X_gray: Flattened grayscale image data.
    - y_gray: Corresponding labels.
    """
    X_grey128 = []
    y_grey128 = []

    for label, cls in enumerate(classes):
        class_dir = os.path.join(target_dir, cls)
        for img_name in os.listdir(class_dir):
            if is_image_file(img_name):
                img_path = os.path.join(class_dir, img_name)
                try:
                    # Open Image
                    with Image.open(img_path) as img:
                        img = img.convert('L')  
                        img = img.resize(image_size) 
                        img_array = np.array(img).flatten() 

                        X_grey128.append(img_array)
                        y_grey128.append(label)

                except Exception as e:
                    print(f"Error processing file {img_path}: {e}")

    X_grey128 = np.array(X_grey128)
    y_grey128 = np.array(y_grey128)
    return X_grey128, y_grey128

X_grey128, y_grey128 = preprocess_images_grayscale(Train_dir, classification_targets)

print("\nGrayscale Image Data Shape:", X_grey128.shape)
print("Labels Shape:", y_grey128.shape)


Grayscale Image Data Shape: (10000, 16384)
Labels Shape: (10000,)


In [7]:
X_grey128_train, X_grey128_test, y_grey128_train, y_grey128_test = train_test_split(X_grey128, y_grey128, test_size=0.2, random_state=42)
print("Training data shape:", X_grey128_train.shape)
print("Testing data shape:", X_grey128_test.shape)

Training data shape: (8000, 16384)
Testing data shape: (2000, 16384)


<h3>XGBoost algorithm with different hyperparameters</h3>

In [8]:
xgb_model_grey = xgb.XGBClassifier(max_depth=5, n_estimators=50, learning_rate=0.1, objective="multi:softmax", random_state=42,num_class=2)
xgb_model_grey.fit(X_grey128_train, y_grey128_train)


y_train_pred_xgb = xgb_model_grey.predict(X_grey128_train)
y_test_pred_xgb = xgb_model_grey.predict(X_grey128_test)


accuracy_train_xgb = accuracy_score(y_grey128_train, y_train_pred_xgb)
accuracy_test_xgb = accuracy_score(y_grey128_test, y_test_pred_xgb)


cm_train_xgb = confusion_matrix(y_grey128_train, y_train_pred_xgb)
cm_test_xgb = confusion_matrix(y_grey128_test, y_test_pred_xgb)


print("\nXGBoost Training Metrics")
print("Accuracy:", accuracy_train_xgb)
print("Classification Report:\n", classification_report(y_grey128_train, y_train_pred_xgb, target_names=classification_targets))
print("Confusion Matrix:\n", cm_train_xgb)

print("\nXGBoost Testing Metrics")
print("Accuracy:", accuracy_test_xgb)
print("Classification Report:\n", classification_report(y_grey128_test, y_test_pred_xgb, target_names=classification_targets))
print("Confusion Matrix:\n", cm_test_xgb)


XGBoost Training Metrics
Accuracy: 0.910875
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.91      0.91      0.91      3988
museum-outdoor       0.91      0.92      0.91      4012

      accuracy                           0.91      8000
     macro avg       0.91      0.91      0.91      8000
  weighted avg       0.91      0.91      0.91      8000

Confusion Matrix:
 [[3613  375]
 [ 338 3674]]

XGBoost Testing Metrics
Accuracy: 0.768
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.76      0.79      0.77      1012
museum-outdoor       0.77      0.75      0.76       988

      accuracy                           0.77      2000
     macro avg       0.77      0.77      0.77      2000
  weighted avg       0.77      0.77      0.77      2000

Confusion Matrix:
 [[795 217]
 [247 741]]


In [10]:
import pickle
with open("xgb_model_grey.pkl", "wb") as  xgb_model_grey_file:
    pickle.dump(xgb_model_grey, xgb_model_grey_file)

<h3>Semi supervised decision tree algorithm on grey scaled image data.</h3>

In [31]:

def train_semi_supervised(X, y, iterations=10, confidence_threshold=0.85):
    """
    Train a semi-supervised Decision Tree model.

    Parameters:
    - X: Feature matrix 
    - y: Label vector 
    - iterations: Number of iterations for pseudo-labeling
    - confidence_threshold: Minimum confidence probability for pseudo-labeling

    Returns:
    - Final trained Decision Tree model
    """
    
    X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
        X_grey_train, y_grey_train, test_size=0.8, stratify=y, random_state=42
    )

    for i in range(iterations):
        print(f"\n Iteration {i+1}: Training on {len(X_labeled)} labeled samples.")

        
        dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)
        dt_model.fit(X_labeled, y_labeled)

        
        y_pseudo_probs = dt_model.predict_proba(X_unlabeled)
        y_pseudo_labels = dt_model.predict(X_unlabeled)
        accuracy_train = accuracy_score(y_unlabeled, y_pseudo_labels)
        cm_train_xgb = confusion_matrix(y_unlabeled, y_pseudo_labels)
        print(f" Test Accuracy after Iteration {i+1}: {accuracy_train:.4f}")
        print("Classification Report:\n", classification_report(y_unlabeled, y_pseudo_labels, target_names=classification_targets))
        print(f" Test Confusion Matrix after Iteration {i+1}:\n", cm_train_xgb)


        
        high_confidence_mask = (y_pseudo_probs.max(axis=1) >= confidence_threshold)
        confident_indices = np.where(high_confidence_mask)[0]  # Get indices

        if len(confident_indices) == 0:
            print("No confident pseudo-labels found. Stopping early.")
            break

        X_labeled = np.vstack((X_labeled, X_unlabeled[confident_indices]))
        y_labeled = np.hstack((y_labeled, y_pseudo_labels[confident_indices]))

        X_unlabeled = np.delete(X_unlabeled, confident_indices, axis=0)
        y_unlabeled = np.delete(y_unlabeled, confident_indices)

        print(f"{len(confident_indices)} high-confidence samples added. New labeled size: {len(X_labeled)}")

        # y_train_pred = dt_model.predict(X_labeled)
        # accuracy_train = accuracy_score(y_labeled, y_train_pred)
        # cm_train_xgb = confusion_matrix(y_labeled, y_train_pred)
        


        # print(f" Training Accuracy after Iteration {i+1}: {accuracy_train:.4f}")
        # print("Classification Report:\n", classification_report(y_labeled, y_train_pred, target_names=classification_targets))
        # print(f" Training Confusion Matrix after Iteration {i+1}:\n", cm_train_xgb)

    # **Final Step: Train Decision Tree on Fully Labeled Data**
    # print("\nTraining Final Decision Tree on Fully Labeled Data...")
    # dt_model_final = DecisionTreeClassifier(max_depth=6, random_state=42)
    # dt_model_final.fit(X_labeled, y_labeled)
    
    return dt_model

dt_model_semi_supervised = train_semi_supervised(X_grey_train, y_grey_train)

y_test_pred = dt_model_semi_supervised.predict(X_grey_test)


accuracy_test = accuracy_score(y_grey_test, y_test_pred)
cm_train_xgb = confusion_matrix(y_grey_test, y_test_pred)
# cm_test = confusion_matrix(y_grey_test, y_test_pred)

print("\nFinal Testing Metrics")
print(f'Accuracy: {accuracy_test:.4f}')
print("Classification Report:\n", classification_report(y_grey_test, y_test_pred, target_names=classification_targets))
print('Confusion Matrix:\n', cm_test)



 Iteration 1: Training on 1600 labeled samples.
 Test Accuracy after Iteration 1: 0.6423
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.64      0.63      0.64      3190
museum-outdoor       0.64      0.65      0.65      3210

      accuracy                           0.64      6400
     macro avg       0.64      0.64      0.64      6400
  weighted avg       0.64      0.64      0.64      6400

 Test Confusion Matrix after Iteration 1:
 [[2011 1179]
 [1110 2100]]
3948 high-confidence samples added. New labeled size: 5548

 Iteration 2: Training on 5548 labeled samples.
 Test Accuracy after Iteration 2: 0.5665
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor       0.54      0.66      0.60      1192
museum-outdoor       0.60      0.48      0.53      1260

      accuracy                           0.57      2452
     macro avg       0.57      0.57      0.56      2452
  weighted avg       0

In [32]:
import pickle
with open("dt_model_semi_supervised_grey.pkl", "wb") as dt_model_semi_supervised_grey_file:
    pickle.dump(dt_model_semi_supervised, dt_model_semi_supervised_grey_file)