In [6]:
import pandas as pd
import os
import tensorflow as tf



## read data

In [7]:
df = pd.read_csv("data5000/train/_annotations.csv")
print(df.head())
print(df.info(),"\n")
uniqueLabels=df['class'].unique()
print(uniqueLabels,"\n")

groupedData = df.groupby(by='class').size()
print(groupedData)

                                            filename  width  height  class  \
0  knife_676_box1_jpg.rf.0850acb4103232820f8e2a2a...    460     460  knife   
1   -596_jpg.rf.085734351c2d66f1f525166dae5f58ab.jpg    460     460  knife   
2  knife_317_jpg.rf.082e1b64f858887a1b6595855703e...    460     460  knife   
3  knife_1229_box1_jpg.rf.082657767f8cd55ab3aa1c4...    460     460  knife   
4  armas--554--jpg_jpg.rf.08654f9778daf16cc108e75...    460     460    gun   

   xmin  ymin  xmax  ymax  
0    27    74   430   327  
1    27   114   430   343  
2    12   166   437   443  
3    44   176   332   296  
4    14    19   441   433  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5843 entries, 0 to 5842
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  5843 non-null   object
 1   width     5843 non-null   int64 
 2   height    5843 non-null   int64 
 3   class     5843 non-null   object
 4   xmin      5843 non-null 

## check missing files

In [8]:

def check_images_exist(csv_file, image_dir):
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    
    # Assuming the column containing filenames is named 'filename'
    # Adjust the column name if it's different in your CSV
    missing_images = []

    for index, row in df.iterrows():
        image_file = row['filename']  # Replace with your actual column name
        image_path = os.path.join(image_dir, image_file)

        if not os.path.exists(image_path):
            missing_images.append(image_file)

    return missing_images

# Define paths
base_path = 'data5000/'
train_dir = os.path.join(base_path, 'train')
valid_dir = os.path.join(base_path, 'valid')
test_dir = os.path.join(base_path, 'test')

train_csv = os.path.join(base_path, 'train', '_annotations.csv')
valid_csv = os.path.join(base_path, 'valid', '_annotations.csv')
test_csv = os.path.join(base_path, 'test', '_annotations.csv')

# Check images for each set
missing_train_images = check_images_exist(train_csv, train_dir)
missing_valid_images = check_images_exist(valid_csv, valid_dir)
missing_test_images = check_images_exist(test_csv, test_dir)

# Print results
print(f'Missing Training Images: {missing_train_images}')
print(f'Missing Validation Images: {missing_valid_images}')
print(f'Missing Test Images: {missing_test_images}')

                        

Missing Training Images: []
Missing Validation Images: []
Missing Test Images: []


# _CAREFULL_data_deleting code ahead_

In [19]:
# Filter all 'HandGun' rows
handgun_rows = df[df['class'] == 'HandGun']

# Check if there are exactly 800 "HandGun" rows
if len(handgun_rows) == 50:
    print("There are already 800 'HandGun' rows. No deletion needed.")
else:
    # Randomly sample 800 'HandGun' rows to keep
    handgun_to_keep = handgun_rows.sample(n=50, random_state=42)

    # Get the filenames of the 'HandGun' images to keep
    handgun_filenames_to_keep = handgun_to_keep['filename'].tolist()

    # Combine 'HandGun' rows to keep with other classes
    df_to_keep = pd.concat([handgun_to_keep, df[df['class'] != 'HandGun']])

    # Get all filenames from the original dataset
    all_filenames = df['filename'].tolist()

    # Get the filenames of the rows that should be deleted
    filenames_to_delete = [filename for filename in all_filenames if filename not in handgun_filenames_to_keep]

    # Delete the corresponding image files that are not to be kept
    for filename in filenames_to_delete:
        file_path = os.path.join("dataset/valid/", filename)
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")

    # Save the updated DataFrame (with only 1000 'HandGun' and all other classes) back to the CSV file
    df_to_keep.to_csv("dataset/valid/_annotations.csv", index=False)

    # Verify the result
    print(df_to_keep['class'].value_counts())

Deleted: dataset/valid/hg593_png.rf.00bd69b68c51556c0c97c75f111ae0fd.jpg
Deleted: dataset/valid/Assault-Rifle_jpg.rf.02b3ab6397c9264b037cd2758ea875e4.jpg
Deleted: dataset/valid/Pump-Action-Toros-Arms-Hunting-Toros_jpg.rf.017a7cc4252b134002141debc3091278.jpg
Deleted: dataset/valid/MP5-Heckler-Koch-1-_jpg.rf.01b78587de729328359ac1a5505db36c.jpg
Deleted: dataset/valid/Heckler-Koch-__-Product-Overview-_-MP5_png.rf.033093389d121a79bc30d72c19ffd10d.jpg
Deleted: dataset/valid/How-AR-15-became-the-most-popular-rifle_jpg.rf.012767316279110730c868297ecd784e.jpg
Deleted: dataset/valid/The-Mossberg-590A1-The-Combat-Pump_jpg.rf.0396f49d072b86c590d08cce982602d6.jpg
Deleted: dataset/valid/Ar15-Based-Sniper-Rifle-Silencer_jpg.rf.075a9f486c8db4f7c021d3e0fd3f9a10.jpg
Deleted: dataset/valid/hg424_jpg.rf.05287ec83b31c5e4a29addcbabf15537.jpg
File not found: dataset/valid/hg424_jpg.rf.05287ec83b31c5e4a29addcbabf15537.jpg
Deleted: dataset/valid/9mm-Pistols_-Uzi-Mini-Pro_-MPA-Defender_jpg.rf.05f683fee27eedc9f

In [26]:

rifle_rows = df[df['class'] == 'Rifle']

if len(rifle_rows) == 50:
    print("There are already 800 'Rifle' rows. No deletion needed.")
else:
    rifle_to_keep = rifle_rows.sample(n=50, random_state=42)

    rifle_filenames_to_keep = rifle_to_keep['filename'].tolist()

    df_to_keep = pd.concat([rifle_to_keep, df[df['class'] != 'Rifle']])

    all_filenames = df['filename'].tolist()

    filenames_to_delete = [filename for filename in all_filenames if filename not in rifle_filenames_to_keep]

    # Delete the corresponding image files that are not to be kept
    for filename in filenames_to_delete:
        file_path = os.path.join("dataset/valid/", filename)
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")

    # Save the updated DataFrame (with only 1000 'HandGun' and all other classes) back to the CSV file
    df_to_keep.to_csv("dataset/valid/_annotations.csv", index=False)

    # Verify the result
    print(df_to_keep['class'].value_counts())

File not found: dataset/valid/8pcs-1-6-Toys-Gun-Model-Mp5-Hk53-Uzi_jpg.rf.de419b8f48500d261a0e1e99eff027b3.jpg
File not found: dataset/valid/Fn-P90-Stock-Photo-Download-Image-Now_jpg.rf.6891efabbbbe84c6f9f3c807393d8552.jpg
File not found: dataset/valid/KRISS-Vector-SMG-FDE-_-High-Performance_jpg.rf.e8510dddcc42b8f46f55f202bee41070.jpg
File not found: dataset/valid/Deactivated-Uzi-Submachine-Gun_jpg.rf.23a2590a664278cd6bc9921703d0b686.jpg
File not found: dataset/valid/3D-model-Uzi-Submachine-gun-VR-AR_jpg.rf.b759ae0e72bc9a89d657b4d94f046900.jpg
File not found: dataset/valid/MP5-submachine-gun_jpg.rf.1448fab7d816d72b54a9cfa9a4a013ab.jpg
File not found: dataset/valid/Weapon-SMG-Thompson-Vector-VSS-UMP-CG15_jpg.rf.896ef16b8373c0c67c0843e88a2fcefb.jpg
File not found: dataset/valid/3d-Puzzle-Diy-Educational-Toy_jpg.rf.34b577cc915a315d027da0ea4a2ad71f.jpg
File not found: dataset/valid/Weapon-SMG-Thompson-Vector-VSS-UMP-CG15_jpg.rf.896ef16b8373c0c67c0843e88a2fcefb.jpg
File not found: dataset/v

In [28]:
shotgun_rows = df[df['class'] == 'ShotGun']

# Check if there are exactly 800 'ShotGun' rows
if len(shotgun_rows) == 50:
    print("There are already 800 'ShotGun' rows. No deletion needed.")
else:
    # Randomly sample 800 'ShotGun' rows to keep
    shotgun_to_keep = shotgun_rows.sample(n=50, random_state=42)

    # Get the filenames of the 'ShotGun' images to keep
    shotgun_filenames_to_keep = shotgun_to_keep['filename'].tolist()

    # Combine 'ShotGun' rows to keep with other classes
    df_to_keep = pd.concat([shotgun_to_keep, df[df['class'] != 'ShotGun']])

    # Get all filenames from the original dataset
    all_filenames = df['filename'].tolist()

    # Get the filenames of the rows that should be deleted
    filenames_to_delete = [filename for filename in all_filenames if filename not in shotgun_filenames_to_keep]

    # Delete the corresponding image files that are not to be kept
    for filename in filenames_to_delete:
        file_path = os.path.join("dataset/valid/", filename)
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")

    # Save the updated DataFrame (with only 800 'ShotGun' and all other classes) back to the CSV file
    df_to_keep.to_csv("dataset/valid/_annotations.csv", index=False)

    # Verify the result
    print(df_to_keep['class'].value_counts())

File not found: dataset/valid/Gun-Lover-on-Twitter_-_Rt-if-you-love_jpg.rf.24db49556f1c66493bfb438e7db7f6f9.jpg
File not found: dataset/valid/Soldier-Action-Figure-Toys-Diy_jpg.rf.0abe3c36b58f33206ec48e1cf34ac682.jpg
File not found: dataset/valid/U-S-Army-Chooses-Leupold-Mark-5HD-for_jpg.rf.b686a4cebd5501415015bd2078e3ad26.jpg
File not found: dataset/valid/A-visit-to-SIG-SAUER-Headquarters_jpg.rf.f015f80aebc80569e3a993e334e3a42c.jpg
File not found: dataset/valid/Weapons-1-_jpg.rf.e169b7f58bb13a6a135895e329f40102.jpg
File not found: dataset/valid/a-sniper-rifle-or-an-assault-rifle_jpg.rf.cc02ab213d3cd43c7b4f097821766f31.jpg
File not found: dataset/valid/Fortnite-Assault-Rifles-guide-V9-00_jpg.rf.c28c29610e43e04859415e477f6fe051.jpg
File not found: dataset/valid/Best-Assault-Rifle-in-Warzone-2_-Top_jpg.rf.9aa756d096ae4b48a4b6def637044667.jpg
File not found: dataset/valid/Soldier-Action-Figure-Toys-Diy_jpg.rf.0abe3c36b58f33206ec48e1cf34ac682.jpg
File not found: dataset/valid/Age-to-Buy-AR

In [23]:
smg_rows = df[df['class'] == 'SMG']

# Check if there are exactly 800 'SMG' rows
if len(smg_rows) == 50:
    print("There are already 800 'SMG' rows. No deletion needed.")
else:
    # Randomly sample 800 'SMG' rows to keep
    smg_to_keep = smg_rows.sample(n=50, random_state=42)

    # Get the filenames of the 'SMG' images to keep
    smg_filenames_to_keep = smg_to_keep['filename'].tolist()

    # Combine 'SMG' rows to keep with other classes
    df_to_keep = pd.concat([smg_to_keep, df[df['class'] != 'SMG']])

    # Get all filenames from the original dataset
    all_filenames = df['filename'].tolist()

    # Get the filenames of the rows that should be deleted
    filenames_to_delete = [filename for filename in all_filenames if filename not in smg_filenames_to_keep]

    # Delete the corresponding image files that are not to be kept
    for filename in filenames_to_delete:
        file_path = os.path.join("dataset/valid/", filename)
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        else:
            print(f"File not found: {file_path}")

    # Save the updated DataFrame (with only 800 'SMG' and all other classes) back to the CSV file
    df_to_keep.to_csv("dataset/valid/_annotations.csv", index=False)

    # Verify the result
    print(df_to_keep['class'].value_counts())

File not found: dataset/valid/hg534_jpg.rf.6fed9c8797d364c89a30bfb1bc582ae0.jpg
File not found: dataset/valid/hg75_jpg.rf.646cb7d490c3b5bede3dcc9bcf16998b.jpg
File not found: dataset/valid/hg569_jpg.rf.5332c17f204f527cba34d3ea78634af6.jpg
File not found: dataset/valid/hg416_jpg.rf.8f6fa1bdfc17fd18b771f96a0f9342e7.jpg
File not found: dataset/valid/hg106_jpeg.rf.206031271736bfc17ca7e032756a4e7c.jpg
File not found: dataset/valid/hg132_jpg.rf.092f87d38b71f174aebc5ba62820bb68.jpg
File not found: dataset/valid/hg189_jpg.rf.61422f6898e4e3c0957a3710d729c21c.jpg
File not found: dataset/valid/hg75_jpg.rf.646cb7d490c3b5bede3dcc9bcf16998b.jpg
File not found: dataset/valid/hg164_jpg.rf.64bd090f69a517d9d090636701dcd1f1.jpg
File not found: dataset/valid/hg150_jpg.rf.1bb6a1aadb8032ef97d8c5aa1eda7a92.jpg
File not found: dataset/valid/hg132_jpg.rf.092f87d38b71f174aebc5ba62820bb68.jpg
File not found: dataset/valid/Businesses_-Guns_-and-Human-Rights_jpg.rf.46610f36d0d68225a3e047f87c71b979.jpg
File not fou

In [32]:
import os
import shutil
import pandas as pd

# Load the CSV file
df = pd.read_csv("dataset/valid/_annotations.csv")

# Get the list of filenames from the CSV
csv_filenames = df['filename'].tolist()

# Set the path to the train directory where the images should be
train_directory = "dataset/valid/"

# Set the path to the backup directory where the images are stored
backup_directory = "backup_data/data/valid"

# Find the missing image files
missing_files = [filename for filename in csv_filenames if not os.path.exists(os.path.join(train_directory, filename))]

# Print the missing files
if missing_files:
    print(f"Missing files ({len(missing_files)}):")
    for file in missing_files:
        print(file)
else:
    print("No missing files found.")

# Check if the missing files are available in the backup directory and recover them
recoverable_files = [filename for filename in missing_files if os.path.exists(os.path.join(backup_directory, filename))]

if recoverable_files:
    print(f"\nRecovering files from backup ({len(recoverable_files)}):")
    for file in recoverable_files:
        src_path = os.path.join(backup_directory, file)
        dest_path = os.path.join(train_directory, file)
        
        # Recover (copy) the file from the backup directory to the train directory
        shutil.copy(src_path, dest_path)
        print(f"Recovered: {file}")
else:
    print("No recoverable files found in the backup directory.")


Missing files (200):
Spencer-shotgun_jpg.rf.2f0166839cabfdc84d57d66603c4ef82.jpg
sg162_jpg.rf.7684081fd7fd642f41b25ed8b839e2e1.jpg
T4E-HDX-68-Caliber-Paintball-Pump_jpg.rf.7575c212015f98a060f7e0f555121b87.jpg
8-Best-Pump-Action-Shotguns-For-2022_png.rf.d60f33b5543c6690fcccac3d4fc2bd29.jpg
The-Mossberg-Maverick-88-pump-action_jpg.rf.1197212f88c354605d0bed48e1217acf.jpg
sg41_jpg.rf.67d024678aa29c667819ddadb4e339b2.jpg
mental-illness-to-firearms-dealers_jpg.rf.9fb9d999a39d945430388cc95045369b.jpg
6-Gun-Storage-Rack-with-Accessory-1-_jpg.rf.f5fcd30a8027b444cfe7f27d7e190f7f.jpg
Sorry_-Fortnite-fans_-a-new-one-shot_jpg.rf.50519a28fa814c1b59fedc23bbb5d062.jpg
pump-action-shotgun-breaching-3d-3ds_jpg.rf.94d70e9481d54513ec5eb20cade1d547.jpg
Pump-Action-Shotguns-GForce-Arms_jpg.rf.86363996becba7eba5746e9e0500969f.jpg
Akkar-Churchill-612-Pump-Home-Defense_png.rf.fecc6cd876833641c42b39d8b4723630.jpg
Video-Review_-Mossberg-Maverick-88-Pump_jpg.rf.2e7241b206870dbdf64e6fb4568a4b33.jpg
Maxx-Action-Toy