In [11]:
import os
import cv2
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# ---- 1. Generate unlabeled.txt from train.txt ----
def generate_unlabeled_file(train_txt, unlabeled_txt):
    """
    Creates the 'unlabeled' index file as a line-for-line copy of the training index file.

    The source file is read completely before the destination is opened for
    writing, and a confirmation message is printed afterwards.

    Arguments:
        train_txt (str): Path of the existing text file listing the training image paths.
        unlabeled_txt (str): Path of the text file to create (or overwrite) with the same lines.
    """
    # Pull every line into memory first; the destination is only opened afterwards
    with open(train_txt, 'r') as source:
        path_lines = list(source)

    # Emit the lines unchanged, in their original order
    with open(unlabeled_txt, 'w') as target:
        for entry in path_lines:
            target.write(entry)

    print(f"Unlabeled dataset saved to {unlabeled_txt}")

In [3]:
# ---- 2. Load Dataset ----
def load_dataset(txt_file):
    """
    Builds a DataFrame of image paths and labels from a text file of paths.

    Each line of the file is stripped and treated as one image path. Blank
    lines and paths that do not exist on disk are skipped silently. The label
    is the second '/'-separated component of the path exactly as written in
    the file (e.g. 'data/museum-indoor/0001.jpg' -> 'museum-indoor').

    Arguments:
        txt_file (str): Path to a text file containing one image path per line.

    Returns:
        pandas.DataFrame: Columns 'image_path' and 'label', one row per
        existing image, in file order.

    Raises:
        ValueError: If an existing path contains no '/' separator, so no
            label component can be taken from it.
    """
    data, labels = [], []

    with open(txt_file, 'r') as file:
        # Iterate the handle directly instead of materialising every line first
        for line in file:
            image_path = line.strip()
            # Best-effort loading: blank lines and missing files are dropped
            if not image_path or not os.path.exists(image_path):
                continue

            parts = image_path.split('/')
            if len(parts) < 2:
                # Previously surfaced as a bare "list index out of range"
                raise ValueError(
                    f"Cannot derive a label from {image_path!r} in {txt_file!r}: "
                    "expected a path of the form '<root>/<label>/...'"
                )

            data.append(image_path)
            labels.append(parts[1])  # Label is the second path component

    return pd.DataFrame({'image_path': data, 'label': labels})

In [4]:
# ---- 3. Feature Extraction ----
def extract_features(image_path):
    """
    Extracts per-channel color histograms from an image and returns a feature vector.

    The image is read with OpenCV, resized to 64x64, and a 256-bin histogram
    is computed for each of the three color channels. Each histogram is
    normalized with cv2.normalize (default settings), flattened, and the
    three are concatenated in Blue, Green, Red order.

    Arguments:
        image_path (str): Path to the image file.

    Returns:
        numpy.ndarray: 1-D feature vector of length 768 (3 channels x 256 bins).

    Raises:
        ValueError: If OpenCV cannot read the file at image_path.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # without this check the error surfaces later inside cv2.resize.
        raise ValueError(f"Could not read image: {image_path!r}")

    image = cv2.resize(image, (64, 64))  # Resize for uniformity

    channel_histograms = []
    for channel in range(3):  # OpenCV channel order: 0 = Blue, 1 = Green, 2 = Red
        hist = cv2.calcHist([image], [channel], None, [256], [0, 256])
        channel_histograms.append(cv2.normalize(hist, hist).flatten())

    # Concatenate histograms into a single feature vector
    return np.concatenate(channel_histograms)



In [5]:
# ---- 4. Main Process ----
# Index files used by the pipeline:
#   train_txt     - labeled training image paths (input)
#   unlabeled_txt - destination of the generated unlabeled copy (output)
train_txt, unlabeled_txt = 'train.txt', 'unlabeled.txt'

In [6]:
# Step 1: Mirror train.txt into unlabeled.txt
generate_unlabeled_file(train_txt, unlabeled_txt)

# Step 2: Build one DataFrame per index file (labeled first, then unlabeled).
# load_dataset derives a 'label' column from each path, so the "unlabeled"
# frame carries that column as well.
labeled_dataset, unlabeled_dataset = (
    load_dataset(index_file) for index_file in (train_txt, unlabeled_txt)
)

Unlabeled dataset saved to unlabeled.txt


In [None]:
# Step 3: Feature Extraction (Labeled Data)
# One histogram feature vector per labeled image, collected into a single array
features_labeled = np.array(list(map(extract_features, labeled_dataset['image_path'])))

# Map the string class names to integer codes for the classifier
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(labeled_dataset['label'])

In [8]:
# Feature Extraction for Unlabeled Data (the 'label' column is not used here)
features_unlabeled = np.array(list(map(extract_features, unlabeled_dataset['image_path'])))

In [9]:
# Report the dimensions of both feature matrices
for caption, matrix in (
    ("Labeled Data Shape:", features_labeled),
    ("Unlabeled Data Shape:", features_unlabeled),
):
    print(caption, matrix.shape)

Labeled Data Shape: (10000, 768)
Unlabeled Data Shape: (10000, 768)


In [None]:
# Step 4: Train the baseline Decision Tree on the labeled features
# NOTE: the variable is named rf_model and the saved file / message say
# "unsupervised", but the estimator is a DecisionTreeClassifier fitted on the
# y_encoded labels. The names are kept so later cells and any consumers of the
# saved files keep working.
rf_model = DecisionTreeClassifier(random_state=42).fit(features_labeled, y_encoded)

# Persist the fitted tree, plus the label encoder needed to turn predicted
# integer codes back into class names
joblib.dump(rf_model, "unsupervised_DT_model.pkl")
joblib.dump(encoder, "label_encoder.pkl")
print("Unsupervised Decision Tree Model saved.")

Unsupervised Decision Tree Model saved.


In [14]:
# Step 5: Pseudo-labeling (Semi-Supervised Learning)
# Use the trained model to predict on the unlabeled data.
# NOTE(review): unlabeled.txt is written by generate_unlabeled_file as a copy
# of train.txt, so these predictions are made on the same image paths the
# model was just fitted on. Confirm this is the intended setup.
pseudo_labels = rf_model.predict(features_unlabeled)

# Assign pseudo-labels to the unlabeled dataset: the integer predictions are
# decoded back to class names and stored in a new 'Predicted Label' column
unlabeled_dataset['Predicted Label'] = encoder.inverse_transform(pseudo_labels)

In [15]:
# Combine labeled and pseudo-labeled data:
# feature rows are stacked vertically (labeled block first), and the encoded
# targets are joined end-to-end in the same order so rows and labels stay aligned
combined_features = np.vstack((features_labeled, features_unlabeled))
combined_labels = np.concatenate((y_encoded, pseudo_labels))

In [16]:
# Step 6: Re-train the model with combined dataset (Semi-Supervised)
# The same rf_model object is fitted again, so from this cell onward the
# variable refers to the model trained on labeled + pseudo-labeled rows.
# The earlier fit was already written to "unsupervised_DT_model.pkl".
rf_model.fit(combined_features, combined_labels)
joblib.dump(rf_model, "semi_supervised_model.pkl")  # Save the new model
print("Semi-Supervised model re-trained and saved.")

Semi-Supervised model re-trained and saved.


In [17]:
# ---- 7. Evaluate Model on Test Data ----
# Build the evaluation DataFrame from the validation index file
test_txt = "val.txt"
test_dataset = load_dataset(test_txt)

# One histogram feature vector per test image, collected into a single array
test_features = np.array(list(map(extract_features, test_dataset['image_path'])))

# Encoded class predictions first, then their class names for reporting
test_predictions = rf_model.predict(test_features)
predicted_labels = encoder.inverse_transform(test_predictions)

In [18]:
# Save Predictions
# Adds the decoded predictions to test_dataset in place as a new column, then
# writes the whole frame (image_path, label, Predicted Label) to CSV without
# the index column.
test_dataset['Predicted Label'] = predicted_labels
test_dataset.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")

Predictions saved to test_predictions.csv


In [19]:
# Compute accuracy and classification metrics
# Ground-truth class names are encoded with the encoder fitted on the training
# labels, so they are comparable with the integer predictions.
y_true = encoder.transform(test_dataset['label'])
accuracy = accuracy_score(y_true, test_predictions)

# Pin the reported label set to every class the encoder knows. Without
# `labels`, classification_report infers the classes from y_true / y_pred, and
# `target_names=encoder.classes_` no longer lines up when the evaluation data
# happens to miss one of the trained classes.
report = classification_report(
    y_true,
    test_predictions,
    labels=np.arange(len(encoder.classes_)),
    target_names=encoder.classes_,
    digits=4,
)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.7450
Classification Report:
                 precision    recall  f1-score   support

 museum-indoor     0.7579    0.7200    0.7385       100
museum-outdoor     0.7333    0.7700    0.7512       100

      accuracy                         0.7450       200
     macro avg     0.7456    0.7450    0.7448       200
  weighted avg     0.7456    0.7450    0.7448       200

