# Import Libs

In [None]:
import os
import re
import random
import base64
import zipfile
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
from google.colab import userdata

import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.layers import Flatten, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    ConfusionMatrixDisplay,
    RocCurveDisplay
)

import xgboost as xgb


# Install dependencies
!pip install xgboost seaborn --quiet

import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.layers import Flatten, Input
from tensorflow.keras.models import Model

# Connecting to Drive

In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive so files under MyDrive can be read.
drive.mount(mountpoint='/content/drive')

In [None]:
# Dataset archive on Drive, and the local directory it is extracted into.
zip_path, unzip_dir = "/content/drive/MyDrive/Data2.zip", "/content/data2"

In [None]:
# Extract every member of the dataset archive into the local working directory.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=unzip_dir)

In [None]:
# Install necessary libraries
!pip install xgboost

# Training No.1

In [None]:
# --- 1. Load and preprocess images ---
def load_images_from_folder(folder, label, image_size=(128,128)):
    """Load every JPG/PNG image directly inside ``folder`` and tag it with ``label``.

    Args:
        folder: Directory whose direct entries are scanned (no recursion).
        label: Value appended to the label list once per loaded image.
        image_size: Passed to ``load_img`` as ``target_size``.

    Returns:
        ``(images, labels)``: two Python lists of equal length, where
        ``images`` holds the ``img_to_array`` result for each matching file.
    """
    images = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and any seeded split built on it) stable between runs and machines.
    for filename in sorted(os.listdir(folder)):
        # Compare extensions case-insensitively so ".JPG"/".PNG" are not skipped.
        if filename.lower().endswith((".jpg", ".png")):
            img_path = os.path.join(folder, filename)
            img = load_img(img_path, target_size=image_size)
            images.append(img_to_array(img))
            labels.append(label)
    return images, labels

# Example usage:
# Replace these paths with your actual dataset folders.
# Variables are named after the label they carry (class0 -> label 0,
# class1 -> label 1) so the names agree with the labels.
class0_images, class0_labels = load_images_from_folder('/content/data2/Data/Data/Guardrail/No', label=0)
class1_images, class1_labels = load_images_from_folder('/content/data2/Data/Data/Guardrail/Yes', label=1)

X = np.array(class0_images + class1_images)
y = np.array(class0_labels + class1_labels)

# No manual scaling here: the pixel arrays are passed as loaded to VGG16's own
# preprocess_input() below.

# --- 2. Prepare VGG16 feature extractor ---
# ImageNet-weighted VGG16 without its dense top; the output is flattened
# into one feature vector per image.
input_tensor = Input(shape=(128,128,3))
base_model = VGG16(weights='imagenet', include_top=False, input_tensor=input_tensor)
x = Flatten()(base_model.output)
feature_extractor = Model(inputs=base_model.input, outputs=x)

# Preprocess input images (preprocess_input is already imported in the first cell)
X_preprocessed = preprocess_input(X)

# --- 3. Extract features ---
features = feature_extractor.predict(X_preprocessed, batch_size=32, verbose=1)

# --- 4. Split data ---
# Stratified 80/20 split of the extracted feature vectors.
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42, stratify=y)

# --- 5. Train XGBoost classifier ---
# use_label_encoder is deprecated and ignored by current XGBoost, so it is not passed.
xgb_clf = xgb.XGBClassifier(max_depth=4, learning_rate=0.1, subsample=0.85,
                            colsample_bytree=0.9, gamma=0.1, n_estimators=200,
                            eval_metric='logloss')
xgb_clf.fit(X_train, y_train)

# --- 6. Evaluate ---
y_pred = xgb_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:


# --- 1. Load and preprocess images ---
def load_images_from_folder(folder, label, image_size=(128,128)):
    """Collect the JPG/PNG images found directly in ``folder`` with a fixed label.

    Args:
        folder: Directory to scan; sub-directories are not descended into.
        label: Label recorded once for every image that is loaded.
        image_size: Forwarded to ``load_img`` as ``target_size``.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: the
        ``img_to_array`` output per image, and ``label`` repeated per image.
    """
    images = []
    labels = []
    # Sorted because os.listdir() gives no ordering guarantee; a fixed order is
    # needed for the seeded train/test split to be repeatable.
    for filename in sorted(os.listdir(folder)):
        # Lower-case first so upper-case extensions (".JPG", ".PNG") also match.
        if not filename.lower().endswith((".jpg", ".png")):
            continue
        img = load_img(os.path.join(folder, filename), target_size=image_size)
        images.append(img_to_array(img))
        labels.append(label)
    return images, labels

# Replace these with your dataset paths
class0_images, class0_labels = load_images_from_folder(
    '/content/data2/Data/Data/Guardrail/No', label=0
)
class1_images, class1_labels = load_images_from_folder(
    '/content/data2/Data/Data/Guardrail/Yes', label=1
)

# Stack both classes: label-0 samples first, then label-1 samples.
X = np.array([*class0_images, *class1_images])
y = np.array([*class0_labels, *class1_labels])

# --- 2. Prepare VGG16 feature extractor ---
# ImageNet-weighted VGG16 without its top, followed by a Flatten layer.
input_tensor = Input(shape=(128,128,3))
base_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_tensor=input_tensor,
)
x = Flatten()(base_model.output)
feature_extractor = Model(inputs=base_model.input, outputs=x)

# Preprocess images for VGG16
X_preprocessed = preprocess_input(X)

# --- 3. Extract features ---
features = feature_extractor.predict(
    X_preprocessed,
    batch_size=32,
    verbose=1,
)

# --- 4. Split data ---
# Stratified 80/20 split of the extracted feature vectors.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 5. Train XGBoost classifier with evaluation logging ---
# Both splits are passed as eval_set so fit() reports log-loss for each of them.
eval_set = [(X_train, y_train), (X_test, y_test)]
# use_label_encoder is deprecated and ignored by current XGBoost, so it is not passed.
xgb_clf = xgb.XGBClassifier(max_depth=4, learning_rate=0.1, subsample=0.85,
                            colsample_bytree=0.9, gamma=0.1, n_estimators=200,
                            eval_metric='logloss')
xgb_clf.fit(X_train, y_train, eval_set=eval_set, verbose=True)

# --- 6. Plot training and test logloss ---
# 'validation_0' / 'validation_1' are drawn as the train / test curves.
results = xgb_clf.evals_result()
epochs = len(results['validation_0']['logloss'])
x_axis = range(epochs)

fig, ax = plt.subplots(figsize=(10,6))
for key, curve_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[key]['logloss'], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss During Training')
ax.legend()
plt.show()

# --- 7. Evaluate accuracy and print classification report ---
y_pred = xgb_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# --- 8. Plot confusion matrix ---
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_tree

# Plot the first tree (tree number 0).
# plot_tree() opens its own default-sized figure unless it is given an Axes,
# so the large figure is created here and its Axes passed in explicitly.
fig, ax = plt.subplots(figsize=(20,10))
plot_tree(xgb_clf, num_trees=0, rankdir='LR', ax=ax)  # rankdir='LR' for left-to-right layout
ax.set_title('XGBoost Decision Tree (Tree 0)')
plt.show()

In [None]:
from xgboost import plot_importance

# plot_importance() opens its own default-sized figure unless it is given an
# Axes, so the figure is created here and its Axes passed in explicitly.
fig, ax = plt.subplots(figsize=(10,8))
plot_importance(xgb_clf, max_num_features=20, importance_type='weight', ax=ax)  # or 'gain', 'cover'
ax.set_title('Feature Importance')
plt.show()

In [None]:
from tensorflow.keras.utils import plot_model

# Write a diagram of the feature extractor to a PNG, with layer names and shapes shown.
plot_model(
    feature_extractor,
    to_file='vgg16_feature_extractor.png',
    show_shapes=True,
    show_layer_names=True,
)

# Training No.2

In [None]:
# --- 1. Load and preprocess images ---
def load_images_from_folder(folder, label, image_size=(128,128)):
    """Load the JPG/PNG images directly inside ``folder`` as arrays with one label.

    Args:
        folder: Directory to scan (not recursive).
        label: Label stored once per loaded image.
        image_size: Forwarded to ``load_img`` as ``target_size``.

    Returns:
        ``(images, labels)`` as two NumPy arrays built from the per-file
        ``img_to_array`` results and the repeated ``label``.
    """
    images, labels = [], []
    # os.listdir() order is arbitrary; sort so the sample order, and therefore
    # the random_state-seeded splits made from it, are reproducible.
    for fn in sorted(os.listdir(folder)):
        if fn.lower().endswith(('.jpg','.png')):
            img = load_img(os.path.join(folder, fn), target_size=image_size)
            images.append(img_to_array(img))
            labels.append(label)
    return np.array(images), np.array(labels)

# Paths
o_dir = '/content/data2/Data/Data/Guardrail/No'
yes_dir = '/content/data2/Data/Data/Guardrail/Yes'

# Label 0 for the "No" folder, label 1 for the "Yes" folder.
X0, y0 = load_images_from_folder(o_dir, 0)
X1, y1 = load_images_from_folder(yes_dir, 1)
X = np.concatenate((X0, X1), axis=0)
y = np.concatenate((y0, y1))

# --- 2. Train/Val/Test split (60/20/20) ---
# First hold out 40% of the data, then halve that hold-out into val and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Preprocess
# NOTE(review): Keras documents that preprocess_input may overwrite a float
# input array in place; treat X_train/X_val/X_test as consumed here (confirm).
X_train_p, X_val_p, X_test_p = (
    preprocess_input(split) for split in (X_train, X_val, X_test)
)

# --- 3. Fine-tune VGG16 as feature learner ---
input_tensor = Input(shape=(128,128,3))
base = VGG16(weights='imagenet', include_top=False, input_tensor=input_tensor)
# Freeze the first 10 entries of base.layers, leave the rest trainable
for layer in base.layers[:10]:
    layer.trainable = False
for layer in base.layers[10:]:
    layer.trainable = True

# Add classification head for fine-tuning (single sigmoid unit for the binary label)
x = Flatten()(base.output)
out = Dense(1, activation='sigmoid')(x)
ft_model = Model(base.input, out)

ft_model.compile(
    optimizer=Adam(1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks: stop after 3 epochs without val_loss improvement, and keep the
# best checkpoint on disk.
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
ckpt = ModelCheckpoint('best_ft.h5', save_best_only=True)

# Train (capture history for plots)
history = ft_model.fit(
    X_train_p, y_train,
    validation_data=(X_val_p, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[es, ckpt],
    verbose=1
)

# --- 4. Reload best weights and repurpose base for feature extraction ---
ft_model.load_weights('best_ft.h5')
for layer in base.layers:
    layer.trainable = False
# ft_model is built on `base`, so this extractor uses the fine-tuned weights.
feat_extractor = Model(base.input, Flatten()(base.output))

# (A second ModelCheckpoint('best_ft.keras', ...) used to be constructed here;
# it was never passed to fit(), so it had no effect and has been removed.)

# --- 5. Extract features for XGBoost ---
feat_train = feat_extractor.predict(X_train_p, batch_size=32)
feat_val   = feat_extractor.predict(X_val_p, batch_size=32)
feat_test  = feat_extractor.predict(X_test_p, batch_size=32)

# --- 6. Hyperparameter tuning with RandomizedSearchCV (faster than full grid) ---
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'max_depth': [3,4,5],
    'learning_rate': [0.01,0.1],
    'n_estimators': [50,100,200],
    'gamma': [0,0.1,1]
}
# use_label_encoder is deprecated and ignored by current XGBoost, so it is not passed.
xgb_base = xgb.XGBClassifier(eval_metric='logloss')
# 10 sampled parameter combinations, 3-fold CV each, ranked by F1.
rand_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(feat_train, y_train)
best_xgb = rand_search.best_estimator_
print("Best XGB params:", rand_search.best_params_)

# --- 7. Final evaluation on test set ---
# Use the best model from randomized search
y_pred = best_xgb.predict(feat_test)
# Column 1 of predict_proba is used as the score for label 1.
y_proba = best_xgb.predict_proba(feat_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# --- 8. Plots ---

# 8.1 Loss & Accuracy for CNN fine-tuning
# One panel per metric: (train key, val key, train label, val label, title, y-label).
fig, axes = plt.subplots(1, 2, figsize=(12,5))
panels = (
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'CNN Fine-tuning Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc', 'CNN Fine-tuning Accuracy', 'Accuracy'),
)
for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

# 8.2 ROC Curve for XGBoost
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
roc_disp.plot()
roc_disp.ax_.set_title('XGBoost ROC Curve')
plt.show()

# 8.3 Confusion Matrix for XGBoost
cm = confusion_matrix(y_test, y_pred)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
cm_disp.plot(cmap='Blues')
cm_disp.ax_.set_title('XGBoost Confusion Matrix')
plt.show()

# Training No.3

In [None]:
# --- 1. Load and preprocess images ---
def load_images_from_folder(folder, label, image_size=(128,128)):
    """Read the JPG/PNG files directly inside ``folder`` into arrays with one label.

    Args:
        folder: Directory to scan; nested directories are not visited.
        label: Label recorded for every image that is loaded.
        image_size: Given to ``load_img`` as ``target_size``.

    Returns:
        ``(images, labels)``: NumPy arrays made from the ``img_to_array``
        output of each matching file and from ``label`` repeated per file.
    """
    images, labels = [], []
    # Sort the listing: os.listdir() has no defined order, and the seeded
    # splits downstream are only repeatable if the sample order is fixed.
    for fn in sorted(os.listdir(folder)):
        if not fn.lower().endswith(('.jpg','.png')):
            continue
        img = load_img(os.path.join(folder, fn), target_size=image_size)
        images.append(img_to_array(img))
        labels.append(label)
    return np.array(images), np.array(labels)

# Paths
o_dir = '/content/data2/Data/Data/Guardrail/No'
yes_dir = '/content/data2/Data/Data/Guardrail/Yes'

# The "No" folder is loaded with label 0, the "Yes" folder with label 1.
X0, y0 = load_images_from_folder(o_dir, 0)
X1, y1 = load_images_from_folder(yes_dir, 1)
X = np.concatenate((X0, X1), axis=0)
y = np.concatenate((y0, y1))

# --- 2. Train/Val/Test split (60/20/20) ---
# 40% is held out first, then split evenly into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Preprocess
# NOTE(review): Keras documents that preprocess_input may overwrite a float
# input array in place; treat X_train/X_val/X_test as consumed here (confirm).
X_train_p, X_val_p, X_test_p = (
    preprocess_input(split) for split in (X_train, X_val, X_test)
)

# --- 3. Fine-tune VGG16 as feature learner ---
input_tensor = Input(shape=(128,128,3))
base = VGG16(weights='imagenet', include_top=False, input_tensor=input_tensor)
# Freeze the first 10 entries of base.layers, leave the rest trainable
for layer in base.layers[:10]:
    layer.trainable = False
for layer in base.layers[10:]:
    layer.trainable = True

# Add classification head for fine-tuning (single sigmoid unit for the binary label)
x = Flatten()(base.output)
out = Dense(1, activation='sigmoid')(x)
ft_model = Model(base.input, out)

ft_model.compile(
    optimizer=Adam(1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks: stop after 3 epochs without val_loss improvement, and keep the
# best checkpoint on disk.
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
ckpt = ModelCheckpoint('best_ft.h5', save_best_only=True)

# Train (capture history for plots)
history = ft_model.fit(
    X_train_p, y_train,
    validation_data=(X_val_p, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[es, ckpt],
    verbose=1
)

# --- 4. Reload best weights and repurpose base for feature extraction ---
ft_model.load_weights('best_ft.h5')
for layer in base.layers:
    layer.trainable = False
# ft_model is built on `base`, so this extractor uses the fine-tuned weights.
feat_extractor = Model(base.input, Flatten()(base.output))

# (A second ModelCheckpoint('best_ft.keras', ...) used to be constructed here;
# it was never passed to fit(), so it had no effect and has been removed.)

# --- 5. Extract features for XGBoost ---
feat_train = feat_extractor.predict(X_train_p, batch_size=32)
feat_val   = feat_extractor.predict(X_val_p, batch_size=32)
feat_test  = feat_extractor.predict(X_test_p, batch_size=32)

# --- 6. Hyperparameter tuning with RandomizedSearchCV (faster than full grid) ---
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'max_depth': [3,4,5],
    'learning_rate': [0.01,0.1],
    'n_estimators': [50,100,200],
    'gamma': [0,0.1,1]
}
# use_label_encoder is deprecated and ignored by current XGBoost, so it is not passed.
xgb_base = xgb.XGBClassifier(eval_metric='logloss')
# 10 sampled parameter combinations, 5-fold CV each, ranked by F1.
rand_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
    verbose=1
)
rand_search.fit(feat_train, y_train)
best_xgb = rand_search.best_estimator_
print("Best XGB params:", rand_search.best_params_)

# --- 7. Final evaluation on test set ---
# Use the best model from randomized search
y_pred = best_xgb.predict(feat_test)
# Column 1 of predict_proba is used as the score for label 1.
y_proba = best_xgb.predict_proba(feat_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# --- 8. Plots ---

# 8.1 Loss & Accuracy for CNN fine-tuning
# One panel per metric: (train key, val key, train label, val label, title, y-label).
fig, axes = plt.subplots(1, 2, figsize=(12,5))
panels = (
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'CNN Fine-tuning Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc', 'CNN Fine-tuning Accuracy', 'Accuracy'),
)
for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

# 8.2 ROC Curve for XGBoost
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
roc_disp.plot()
roc_disp.ax_.set_title('XGBoost ROC Curve')
plt.show()

# 8.3 Confusion Matrix for XGBoost
cm = confusion_matrix(y_test, y_pred)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
cm_disp.plot(cmap='Blues')
cm_disp.ax_.set_title('XGBoost Confusion Matrix')
plt.show()