In [None]:
# Cell 1: Imports
import json
import os
import shutil
import time
from datetime import datetime

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

In [None]:
# Cell 2: Set up Mask R-CNN model
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained Mask R-CNN, moved to `device` and switched to inference mode.
# `Module.to` and `Module.eval` both return the module, so the calls can be chained.
mask_rcnn_model = maskrcnn_resnet50_fpn(
    weights=torchvision.models.detection.MaskRCNN_ResNet50_FPN_Weights.DEFAULT
).to(device).eval()

# Category names indexed by the integer label the detector predicts
# (detect_objects looks names up with coco_labels[label]).
coco_labels = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Subset of category names that the rest of the notebook treats as vehicles.
vehicle_labels = ['car', 'motorcycle', 'bus', 'truck', 'boat']

# Detector input pipeline: only a conversion to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

def detect_objects(image, score_threshold=0.7):
    """Run Mask R-CNN on one image and return the confident vehicle detections.

    Args:
        image: Input accepted by the module-level ``transform`` (the caller in
            this notebook passes a PIL image).
        score_threshold: Minimum detection score to keep. Defaults to 0.7, the
            value that was previously hard-coded.

    Returns:
        A list of ``(box, label_name, score, mask)`` tuples, one per detection
        whose score is at least ``score_threshold`` and whose category name is
        in ``vehicle_labels``. ``box`` and ``mask`` are NumPy arrays taken from
        the model output; ``label_name`` is the string from ``coco_labels``.
    """
    image_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        prediction = mask_rcnn_model(image_tensor)[0]

    # Labels and scores are small, so copy them to the host first and decide
    # which detections to keep before touching the much larger mask tensor.
    labels = prediction['labels'].cpu().numpy()
    scores = prediction['scores'].cpu().numpy()

    keep = [
        idx
        for idx, (label, score) in enumerate(zip(labels, scores))
        if score >= score_threshold and coco_labels[label] in vehicle_labels
    ]
    if not keep:
        return []

    # Only the selected rows are transferred off the device.
    boxes = prediction['boxes'][keep].cpu().numpy()
    masks = prediction['masks'][keep].cpu().numpy()

    return [
        (box, coco_labels[labels[idx]], scores[idx], mask)
        for idx, box, mask in zip(keep, boxes, masks)
    ]

def preprocess_image(image_path, target_size=(800, 800)):
    """Load an image, resize it to fit ``target_size`` and zero-pad the remainder.

    The image is read with OpenCV, converted from BGR to RGB, scaled uniformly
    so that it fits inside ``target_size`` and placed in the top-left corner of
    a black canvas of exactly ``target_size``.

    Args:
        image_path: Path of the image file to load.
        target_size: ``(height, width)`` of the returned canvas.

    Returns:
        ``(padded_image, scale_x, scale_y)`` where ``padded_image`` is a
        ``uint8`` array of shape ``(target_size[0], target_size[1], 3)`` and
        both scale values are the same uniform resize factor (divide detector
        coordinates by it to get back to original pixels).

    Raises:
        FileNotFoundError: If ``image_path`` does not point to a file.
        ValueError: If OpenCV cannot decode the file.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    h, w, _ = image.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    # cv2.resize expects (width, height); keep each side at least 1 px so very
    # elongated images do not truncate to a zero-sized target.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))

    resized_image = cv2.resize(image, new_size)

    padded_image = np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8)
    padded_image[:resized_image.shape[0], :resized_image.shape[1], :] = resized_image

    return padded_image, scale, scale

def visualize_detections(image, detected_objects, scale_x, scale_y, confidence_threshold=0.7):
    """Draw a labelled green rectangle for every sufficiently confident detection.

    Args:
        image: Array to annotate; it is copied, not modified.
        detected_objects: Iterable of ``(box, label, score, mask)`` tuples. The
            mask is ignored here.
        scale_x: Factor the box x-coordinates are divided by before drawing.
        scale_y: Factor the box y-coordinates are divided by before drawing.
        confidence_threshold: Detections scoring below this are not drawn.

    Returns:
        A copy of ``image`` with the rectangles and ``"label: score"`` captions.
    """
    green = (0, 255, 0)
    annotated = image.copy()

    for box, label, score, _ in detected_objects:
        if not (score >= confidence_threshold):
            continue

        xmin, ymin, xmax, ymax = box

        # Undo the resize applied before detection to get frame coordinates.
        left = int(xmin / scale_x)
        top = int(ymin / scale_y)
        right = int(xmax / scale_x)
        bottom = int(ymax / scale_y)

        cv2.rectangle(annotated, (left, top), (right, bottom), green, 2)
        caption = f"{label}: {score:.2f}"
        # Caption is anchored 10 px above the box's top-left corner.
        cv2.putText(annotated, caption, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, green, 2)

    return annotated

In [None]:
# Cell 3: Set up ResNet-50 model for classification
def upscale_and_enhance(image, desired_size=(256, 256)):
    """Resize a PIL image to ``desired_size`` with Lanczos resampling.

    Despite the name, the only operation performed is the resize; the aspect
    ratio of the input is not preserved.

    Args:
        image: Object exposing PIL's ``resize(size, resample)`` method.
        desired_size: Target size passed straight to ``image.resize``.
            Defaults to ``(256, 256)``, the previously hard-coded value.

    Returns:
        The resized image returned by ``image.resize``.
    """
    return image.resize(desired_size, Image.LANCZOS)
    
# Classification input pipeline, applied in order:
#   1. resize through upscale_and_enhance,
#   2. centre-crop to 224 px,
#   3. convert to a tensor,
#   4. normalise each channel with the mean/std constants below.
preprocess_classification = transforms.Compose([
    transforms.Lambda(lambda pil_img: upscale_and_enhance(pil_img)),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

def preprocess_image_classification(image_path):
    """Load an image file and run it through the classification pipeline.

    Args:
        image_path: Path of the image to open with PIL.

    Returns:
        The output of ``preprocess_classification`` applied to the image
        converted to RGB.
    """
    img = Image.open(image_path).convert('RGB')
    # Use the module-level pipeline; no global named `preprocess` exists in this
    # notebook, so referring to it would raise NameError.
    return preprocess_classification(img)

# ResNet-50 classifier with the IMAGENET1K_V1 weights, switched to inference
# mode (`eval()` returns the module). It is not moved to `device` here.
resnet_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).eval()

def classify_image(image_path, model, preprocess, imagenet_labels, classification_counts, image_predictions):
    """Classify one image file and record the result when it is confident enough.

    The image is displayed with matplotlib, preprocessed, and passed through
    ``model``. Only a top-1 probability of at least 0.7 is recorded.

    Args:
        image_path: Path of the image to classify.
        model: Callable returning class logits for a batch of one image.
        preprocess: Callable turning the RGB PIL image into a tensor.
        imagenet_labels: Sequence mapping class index to a readable label.
        classification_counts: Dict of label -> count, updated in place.
        image_predictions: Dict of file name -> ``(label, probability)``,
            updated in place.

    Returns:
        The same ``(classification_counts, image_predictions)`` objects.
    """
    rgb_image = Image.open(image_path).convert('RGB')

    # Show the crop that is about to be classified.
    plt.imshow(rgb_image)
    plt.axis('off')
    plt.show()

    batch = preprocess(rgb_image).unsqueeze(0)

    with torch.no_grad():
        logits = model(batch)
    probabilities = torch.softmax(logits[0], dim=0)
    best_prob, best_index = torch.topk(probabilities, 1)
    confidence = best_prob[0].item()

    # Low-confidence predictions are neither printed nor recorded.
    if not (confidence >= 0.7):
        return classification_counts, image_predictions

    file_name = os.path.basename(image_path)
    class_label = imagenet_labels[best_index[0].item()]
    print(f"Image: {file_name}")
    print(f"Predicted Class: {class_label} - {confidence}")
    classification_counts[class_label] = classification_counts.get(class_label, 0) + 1
    image_predictions[file_name] = (class_label, confidence)

    return classification_counts, image_predictions

In [None]:
import time

# Cell 4: Process video and generate classification report
def _scale_box_to_frame(box, scale_x, scale_y):
    """Convert a detector box to integer pixel coordinates of the original frame."""
    xmin, ymin, xmax, ymax = box
    return (int(xmin / scale_x), int(ymin / scale_y),
            int(xmax / scale_x), int(ymax / scale_y))


def _build_classification_report(total_images, classification_counts, image_predictions):
    """Render the plain-text classification report."""
    lines = [
        "Image Classification Report",
        "===========================",
        "",
        f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"Total Images Viewed: {total_images}",
        "",
        "Classification Counts:",
    ]

    for class_label, count in sorted(classification_counts.items()):
        lines.append(f"{class_label}: {count}")

    lines.append("")
    lines.append("Image Predictions:")

    for image_name, (class_label, confidence_score) in image_predictions.items():
        lines.append(f"{image_name}")
        lines.append(f"  Predicted Class: {class_label}")
        lines.append(f"  Confidence Score: {confidence_score:.4f}")
        lines.append("")

    return "\n".join(lines)


def _build_detection_report(detection_counts):
    """Render the plain-text object detection report."""
    lines = [
        "Object Detection Report",
        "=======================",
        "",
        f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Detection Counts:",
    ]

    for label, count in sorted(detection_counts.items()):
        lines.append(f"{label}: {count}")

    return "\n".join(lines)


def process_video(input_video, imagenet_labels_json):
    """Detect vehicles in a video, classify the crops and write reports.

    Roughly every three seconds of video (``int(fps * 3)`` frames, at least 1)
    a frame is run through ``detect_objects``. Each vehicle detection is
    cropped from the original frame and saved; the most recent detections are
    drawn on every frame of the annotated output video. After the video has
    been read, every crop written during this call is classified with
    ``classify_image`` and two text reports are produced.

    Outputs are written to ``output/<YYYYmmdd_HHMMSS>/``:
    ``classification-report.txt``, ``detection-report.txt`` and
    ``output_video.mp4``. The working directories ``extracted_frames`` and
    ``cropped-objects`` are deleted at the end.

    Args:
        input_video: Path of the video file to process.
        imagenet_labels_json: Path of a JSON file holding the class labels
            passed to ``classify_image``.

    Raises:
        IOError: If the video cannot be opened.
        OSError / json.JSONDecodeError: If the labels file cannot be read.
    """
    frames_dir = "extracted_frames"
    cropped_images_dir = 'cropped-objects'
    output_video = "output_video.mp4"

    # Load the labels first so a bad path fails before the slow video pass.
    with open(imagenet_labels_json, 'r') as f:
        imagenet_labels = json.load(f)

    video = cv2.VideoCapture(input_video)
    if not video.isOpened():
        video.release()
        raise IOError(f"Could not open video: {input_video}")

    os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(cropped_images_dir, exist_ok=True)

    frame_rate = video.get(cv2.CAP_PROP_FPS)
    frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    output = cv2.VideoWriter(output_video, fourcc, frame_rate, (frame_width, frame_height))

    frame_count = 0
    # Never let the interval reach 0 (low or unreported FPS), which would make
    # the modulo below raise ZeroDivisionError.
    save_interval = max(1, int(frame_rate * 3))

    cropped_paths = []      # crops written by this call, in creation order
    detection_counts = {}   # label -> number of detections on sampled frames

    # Explicit initial state for the values reused between sampled frames.
    detected_objects = []
    scale_x = scale_y = 1.0

    try:
        while True:
            ret, frame = video.read()

            if not ret:
                break

            if frame_count % save_interval == 0:
                # Only sampled frames are read back from disk, so only those
                # are written.
                frame_path = os.path.join(frames_dir, f"frame_{frame_count:05d}.jpg")
                cv2.imwrite(frame_path, frame)

                preprocessed_image, scale_x, scale_y = preprocess_image(frame_path)
                pil_image = Image.fromarray(preprocessed_image)
                detected_objects = detect_objects(pil_image)

                frame_stem = os.path.splitext(os.path.basename(frame_path))[0]
                for i, (box, label, score, _) in enumerate(detected_objects):
                    if label not in vehicle_labels:
                        continue

                    # Counted per sampled frame, so a vehicle visible in several
                    # sampled frames is counted once for each of them.
                    detection_counts[label] = detection_counts.get(label, 0) + 1

                    xmin, ymin, xmax, ymax = _scale_box_to_frame(box, scale_x, scale_y)
                    cropped_object = frame[ymin:ymax, xmin:xmax]

                    if cropped_object.size > 0:
                        object_path = os.path.join(cropped_images_dir, f"{frame_stem}_{i}.jpg")
                        cv2.imwrite(object_path, cropped_object)
                        cropped_paths.append(object_path)

            # Frames between samples keep showing the latest detections.
            image_with_detections = visualize_detections(frame, detected_objects, scale_x, scale_y)
            output.write(image_with_detections)

            frame_count += 1
    finally:
        # Release the capture and writer even if detection fails mid-video.
        video.release()
        output.release()

    total_images = len(cropped_paths)
    classification_counts = {}
    image_predictions = {}

    # Classify only the crops produced by this call; files left in the crop
    # directory by an earlier interrupted run are ignored.
    for image_path in cropped_paths:
        classification_counts, image_predictions = classify_image(
            image_path, resnet_model, preprocess_classification,
            imagenet_labels, classification_counts, image_predictions)

    classification_report_content = _build_classification_report(
        total_images, classification_counts, image_predictions)
    detection_report_content = _build_detection_report(detection_counts)

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    timestamped_dir = os.path.join(output_dir, timestamp)
    os.makedirs(timestamped_dir, exist_ok=True)

    classification_report_file = os.path.join(timestamped_dir, "classification-report.txt")
    with open(classification_report_file, "w") as f:
        f.write(classification_report_content)

    print(f"Classification report saved to {classification_report_file}")

    detection_report_file = os.path.join(timestamped_dir, "detection-report.txt")
    with open(detection_report_file, "w") as f:
        f.write(detection_report_content)

    print(f"Object detection report saved to {detection_report_file}")

    output_video_file = os.path.join(timestamped_dir, "output_video.mp4")
    os.rename(output_video, output_video_file)

    print(f"Output video saved to {output_video_file}")

    # Remove the working directories now that all outputs are written.
    shutil.rmtree(frames_dir)
    shutil.rmtree(cropped_images_dir)

    print("Video processing completed.")

# Inputs for this run: the video to analyse and the JSON file with the class
# labels used by the classifier.
input_video = "raw-traffic-footage/bridge-1-rgb-test.mp4"
imagenet_labels_json = 'imagenet-simple-labels.json'

# Run the full detection + classification pipeline.
process_video(input_video=input_video, imagenet_labels_json=imagenet_labels_json)