In [None]:
# Mount Google Drive into the Colab runtime; every Drive path used later in
# this notebook is rooted at this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

Reading the CSV file

In [None]:

# Load the license dataset CSV from the mounted Drive folder.
file_path = "/content/drive/MyDrive/Colab Notebooks/Current File/pakistan_car_license_dataset.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

print("Initial Dataset:")
data.head(n=5)  # preview of the first five rows


Removing Missing Values

In [None]:
# A record is only usable when all of these fields are present.
required_columns = ["License_Plate", "Owner_Name", "Car_Model", "License_Valid"]
data = data.dropna(subset=required_columns)

# Report how many rows survived the filter.
print(f"\nDataset after removing rows with missing values: {len(data)} rows")


Removing Duplicate Values

In [None]:
# Remove duplicate rows based on License_Plate.
# The comparison key is stripped and upper-cased first, the same way the
# standardization step later rewrites the column. Comparing the raw text would
# let plates that differ only in case or surrounding whitespace survive here
# and then become identical rows once the column is standardized.
plate_key = data["License_Plate"].astype(str).str.strip().str.upper()

# Keep the first occurrence of each plate; .copy() gives an independent frame
# so later column assignments do not raise SettingWithCopyWarning.
data = data[~plate_key.duplicated(keep="first")].copy()

# Display the number of rows after removing duplicates
print(f"\nDataset after removing duplicates: {data.shape[0]} rows")


Standardizing Text Data

In [None]:
# Step 4: bring every text column to one canonical casing.
# Maps column name -> name of the pandas `.str` casing method to apply after
# surrounding whitespace has been stripped.
case_rules = {
    "License_Plate": "upper",
    "Owner_Name": "title",
    "Car_Model": "title",
    "License_Valid": "capitalize",
}
for column_name, case_method in case_rules.items():
    stripped = data[column_name].str.strip()
    data[column_name] = getattr(stripped.str, case_method)()

# Preview the standardized columns.
print("\nDataset after standardizing text data:")
data.head()


Encoding Categorical Variables

In [None]:
# Numeric code for each validity label: 1 = Valid, 0 = Invalid.
# Series.map leaves any label that is not a key of this dict as NaN.
validity_codes = {"Valid": 1, "Invalid": 0}
data["License_Valid_Encoded"] = data["License_Valid"].map(validity_codes)

# Show the label next to its encoded value for a quick visual check.
print("\nDataset after encoding categorical variables:")
data.loc[:, ["License_Valid", "License_Valid_Encoded"]].head()


Final Cleaning (Removing unnecessary attributes)

In [None]:

# Columns kept for the analysis; every other column is dropped from `data`.
analysis_columns = [
    "License_Plate",
    "Owner_Name",
    "Car_Model",
    "Registration_Region",
    "License_Valid",
    "License_Valid_Encoded",
]
data = data[analysis_columns]

# Preview the reduced frame.
print("\nFinal Cleaned Dataset:")
data.head()


Saving Final Dataset to CSV file

In [None]:
# Relative path: the file is written to the kernel's current working directory.
output_file = "preprocessed_pakistan_car_license_dataset.csv"

# index=False keeps the DataFrame index out of the CSV.
data.to_csv(path_or_buf=output_file, index=False)

print(f"\nProcessed dataset saved to: {output_file}")


Taking a 30% random sample of the cleaned dataset

In [None]:
# Draw 30% of the rows of `data`; the fixed random_state makes the draw
# repeatable across runs.
sample_df = data.sample(random_state=42, frac=0.3)
sample_df

Visual Comparisons Using matplotlib & Seaborn  

In [None]:
# License validity by region (grouped bar chart).
#
# matplotlib is imported here because this cell runs before the notebook's
# plotting-import cell; without it a fresh "Restart & Run All" stops with a
# NameError on `plt`.
import matplotlib.pyplot as plt

# Count records per (region, validity) pair and pivot validity into columns.
# fill_value=0 keeps integer counts for regions that lack one of the statuses.
region_validity = (
    sample_df.groupby(["Registration_Region", "License_Valid"])
    .size()
    .unstack(fill_value=0)
)

# Create the figure once and hand its axes to pandas. Calling plt.figure()
# first and then DataFrame.plot() without `ax` opens a second figure and
# leaves the first one empty.
fig, ax = plt.subplots(figsize=(12, 8))
region_validity.plot(
    kind="bar",
    stacked=False,
    color={"Valid": "#4CAF50", "Invalid": "#FF5722"},  # Green for Valid, Red for Invalid
    ax=ax,
)

# Labels, title and legend
ax.set_title("License Validity by Region", fontsize=16)
ax.set_xlabel("Region", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
ax.tick_params(axis="x", labelrotation=45, labelsize=12)
ax.tick_params(axis="y", labelsize=12)
ax.legend(title="License Status", fontsize=12)

# Show the plot
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set Seaborn style for better visuals.
# set_theme changes global plotting defaults, so it affects the figures drawn
# by the cells that run after this one.
sns.set_theme(style="whitegrid")

In [None]:
# Set Seaborn style for better visuals.
# NOTE(review): this repeats the identical set_theme call from the cell just
# before it; re-applying the same theme is harmless but redundant.
sns.set_theme(style="whitegrid")

1. Valid vs. Invalid Licenses (Bar Chart)


In [None]:
# Tally how many sampled records carry each validity label.
license_counts = sample_df["License_Valid"].value_counts()
status_palette = {"Valid": "#4CAF50", "Invalid": "#FF5722"}  # green / red-orange

plt.figure(figsize=(10, 9))
sns.barplot(
    x=license_counts.index,
    y=license_counts.values,
    hue=license_counts.index,  # colour each bar by its own category
    dodge=False,               # one full-width bar per category
    palette=status_palette,
)
# The x axis already names the categories, so hide the legend.
plt.legend([], [], frameon=False)

# Title, axis labels and tick sizes
plt.title("Distribution of License Validity", fontsize=16)
plt.xlabel("License Status", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Write each count just above its bar.
for bar_position, count in enumerate(license_counts.values):
    plt.text(bar_position, count + 2, str(count), ha="center", fontsize=12)

plt.tight_layout()
plt.show()




License Validity by Region (Bar Chart)

Install the required packages: `pip install ultralytics easyocr opencv-python`


YOLOv8 Plate Detection

In [None]:
!pip install ultralytics easyocr opencv-python


In [None]:
from ultralytics import YOLO
import cv2
import os
import matplotlib.pyplot as plt

# Load the models
car_model = YOLO("yolov8n.pt")  # Car detection model
plate_model = YOLO("/content/drive/MyDrive/Colab Notebooks/Current File/license_plate_detector.pt")  # License plate detection model

# Paths
video_path = r"/content/drive/MyDrive/Colab Notebooks/Current File/sample2.mp4"
frames_dir = r"/content/drive/MyDrive/Colab Notebooks/Current File/frames"  # Directory to save frames
plates_dir = r"/content/drive/MyDrive/Colab Notebooks/Current File/plates"  # Directory to save cropped plates

# Create directories for frames and plates if they don't exist
os.makedirs(frames_dir, exist_ok=True)
os.makedirs(plates_dir, exist_ok=True)

# Open the video file and fail loudly if that did not work; otherwise the loop
# below would be skipped and the cell would still report success.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

frame_count = 0
# Absolute (x1, y1, x2, y2) boxes of plates already saved. Only an exact pixel
# match counts as "already seen".
plate_tracker = set()

try:
    # Process the video frame by frame
    while True:
        ret, frame = cap.read()
        if not ret:  # No more frames
            print("Video processing completed.")
            break

        # Run car detection YOLO model
        car_results = car_model(frame)

        # Annotate the frame with detected cars
        annotated_frame = car_results[0].plot()

        # Number of plates saved for this frame; used to give every crop its
        # own file name so several plates in one frame do not overwrite each
        # other.
        plates_in_frame = 0

        # Loop through car detections to process license plates
        for result in car_results:
            for box in result.boxes.data.tolist():
                x1, y1, x2, y2, score, cls_id = box  # Box coordinates, score, and class ID
                if car_model.names[int(cls_id)] != "car":
                    continue

                # Crop the car region; skip degenerate boxes, which would make
                # the plate model fail on an empty image.
                car_crop = frame[int(y1):int(y2), int(x1):int(x2)]
                if car_crop.size == 0:
                    continue

                # Run license plate detection on the cropped car region
                plate_results = plate_model(car_crop)

                for plate_result in plate_results:
                    for plate_box in plate_result.boxes.data.tolist():
                        px1, py1, px2, py2, pscore, pcls_id = plate_box  # Plate box coordinates
                        plate_crop = car_crop[int(py1):int(py2), int(px1):int(px2)]
                        if plate_crop.size == 0:
                            # cv2.imwrite raises on an empty image
                            continue

                        # Plate coordinates relative to the full frame
                        abs_plate_coords = (int(x1) + int(px1), int(y1) + int(py1),
                                            int(x1) + int(px2), int(y1) + int(py2))

                        # Skip plates whose exact box was already saved
                        if abs_plate_coords in plate_tracker:
                            continue
                        plate_tracker.add(abs_plate_coords)

                        # Draw plate detection on the frame
                        cv2.rectangle(annotated_frame,
                                      (abs_plate_coords[0], abs_plate_coords[1]),
                                      (abs_plate_coords[2], abs_plate_coords[3]),
                                      (0, 255, 0), 2)  # Green rectangle for plates

                        # Save the cropped license plate under a per-frame index
                        plate_path = os.path.join(
                            plates_dir,
                            f"frame_{frame_count:04d}_plate_{plates_in_frame:02d}.jpg",
                        )
                        cv2.imwrite(plate_path, plate_crop)
                        plates_in_frame += 1

        # Save the annotated frame
        frame_path = os.path.join(frames_dir, f"frame_{frame_count:04d}.jpg")
        cv2.imwrite(frame_path, annotated_frame)

        # Display every 10th frame (and frame 1)
        if frame_count % 10 == 0 or frame_count == 1:
            plt.imshow(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
            plt.title(f"Frame {frame_count}")
            plt.axis("off")
            plt.pause(0.01)  # Pause briefly to update the plot
            plt.clf()  # Clear the plot for the next frame

        frame_count += 1
finally:
    # Release video resources even when a frame fails to process
    cap.release()
    plt.close()

print("Video processed, frames saved, and license plates cropped.")


EasyOCR Text Extraction

In [None]:
import os
import pandas as pd
import easyocr

# Paths
plates_dir = r"/content/drive/MyDrive/Colab Notebooks/Current File/plates"  # Folder containing cropped plates
output_csv_path = r"/content/drive/MyDrive/Colab Notebooks/Current File/extracted_ocr_plates/extracted_ocr_plates.csv"  # CSV to save extracted license plates

# Only files with these extensions are sent to OCR; anything else found in the
# folder is ignored.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# Initialize EasyOCR Reader
reader = easyocr.Reader(['en'])

# One record per processed image
license_plate_data = []

# Process each image in the cropped plates directory. Sorting makes the row
# order of the CSV reproducible (os.listdir order is arbitrary).
for plate_image_name in sorted(os.listdir(plates_dir)):
    plate_image_path = os.path.join(plates_dir, plate_image_name)

    if not os.path.isfile(plate_image_path):
        continue
    if not plate_image_name.lower().endswith(image_extensions):
        continue

    print(f"Processing image: {plate_image_name}...")

    # detail=0 makes readtext return a plain list of text strings
    results = reader.readtext(plate_image_path, detail=0)

    if results:
        # The first returned string is taken as the plate text
        license_plate = results[0]
        print(f"Extracted Text: {license_plate}")
    else:
        print(f"No text detected in image: {plate_image_name}")
        license_plate = 'No Text Detected'

    license_plate_data.append({
        'Image_Name': plate_image_name,
        'License_Plate': license_plate
    })

# Explicit columns keep the CSV header present even when no image was
# processed, so the file can still be read back with pd.read_csv.
extracted_df = pd.DataFrame(license_plate_data, columns=['Image_Name', 'License_Plate'])

# The output folder is not created anywhere else in the notebook.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
extracted_df.to_csv(output_csv_path, index=False)

print(f"License plate data saved to {output_csv_path}")


In [None]:
import pandas as pd
import re

# Destination for the normalized plate texts
normalized_csv_file_path = "/content/drive/MyDrive/Colab Notebooks/Current File/extracted_ocr_plates/normalized_extracted_ocr_plates.csv"

# Source: the CSV holding the raw OCR output
original_csv_file_path = "/content/drive/MyDrive/Colab Notebooks/Current File/extracted_ocr_plates/extracted_ocr_plates.csv"

# Read the raw OCR results back from disk
extracted_df = pd.read_csv(filepath_or_buffer=original_csv_file_path)

def normalize_text(text):
    """Return `text` upper-cased with everything except A-Z and 0-9 removed.

    Whitespace is stripped out first, then every remaining character that is
    not an upper-case ASCII letter or a digit.
    """
    without_spaces = re.sub(r'\s+', '', text.upper())
    return re.sub(r'[^A-Z0-9]', '', without_spaces)

# Normalize every string in 'License_Plate'; non-string cells (for example
# NaN) pass through untouched.
if 'License_Plate' not in extracted_df.columns:
    print("Error: 'License_Plate' column not found in the CSV file!")
else:
    extracted_df['License_Plate'] = extracted_df['License_Plate'].map(
        lambda value: normalize_text(value) if isinstance(value, str) else value
    )

# Write the frame to the normalized CSV path
extracted_df.to_csv(path_or_buf=normalized_csv_file_path, index=False)

print(f"Normalized text saved to: {normalized_csv_file_path}")


Dataset Matching

In [None]:
import time
import pandas as pd

# Load the existing license plate database
database_path = r"/content/drive/MyDrive/Colab Notebooks/Current File/pakistan_car_license_dataset.csv"
database = pd.read_csv(database_path)


def _plate_key(plates):
    """Return a Series of comparison keys: upper-case, only A-Z and 0-9 kept.

    Missing values become the empty string. This is the same form the OCR
    normalization step produces, so both sides of the match are comparable.
    """
    return (
        plates.fillna('')
        .astype(str)
        .str.upper()
        .str.replace(r'[^A-Z0-9]', '', regex=True)
    )


# The OCR plates are reduced to bare alphanumerics earlier in the notebook,
# while this file is read raw. Comparing against the raw column would miss any
# database plate written with spaces, hyphens or lower-case letters.
database_keys = _plate_key(database['License_Plate'])

# Assuming 'extracted_df' contains the detected license plates
print("Matching each License Plate with Database...")

# Remove duplicate license plates from the extracted DataFrame
unique_plates_df = extracted_df.drop_duplicates(subset=['License_Plate'])
unique_plate_keys = _plate_key(unique_plates_df['License_Plate']).drop_duplicates()

# Match only unique license plates
for extracted_plate in unique_plate_keys:
    if not extracted_plate:
        # Nothing readable was extracted for this image
        continue

    # Match with the database
    match = database[database_keys == extracted_plate]
    if match.empty:
        continue

    time.sleep(1)
    print(f"\nMatched found for plate: {extracted_plate}")

    time.sleep(1)  # Pause for effect
    print("\nPlease Wait, Fetching License Details...")
    time.sleep(2)
    print("\n")

    # Details of the first matching record. The local is not called
    # `car_model`, because that name holds the YOLO detector loaded earlier.
    record = match.iloc[0]
    owner_name = record['Owner_Name']
    matched_car_model = record['Car_Model']
    registration_region = record['Registration_Region']
    license_valid = record['License_Valid']

    print(f"Owner Name: {owner_name}")
    print(f"Car Model: {matched_car_model}")
    print(f"Registration Region: {registration_region}")
    print(f"License Validity: {license_valid}\n")
    print("-" * 50)

print("Matching Completed.")
