In [5]:
import os
import shutil
import pandas as pd

def list_subfolders_and_count_files(root_directory):
    """Lists subfolders and counts the number of files in each subfolder."""
    subfolder_file_counts = {}
    for dirpath, dirnames, filenames in os.walk(root_directory):
        if dirpath == root_directory:
            for dirname in dirnames:
                subfolder_path = os.path.join(dirpath, dirname)
                file_count = len([f for f in os.listdir(subfolder_path) if os.path.isfile(os.path.join(subfolder_path, f))])
                subfolder_file_counts[dirname] = file_count
    return subfolder_file_counts

# Define dataset paths
sampleset_path = "dataset/stagging/sampleset"
testingset_path = "dataset/stagging/testingset"
validationset_path = "dataset/stagging/validationset"

# Count files in each set
df_Sampleset = pd.DataFrame(list(list_subfolders_and_count_files(sampleset_path).items()), columns=['Subfolder', 'File Count_Sampleset'])
df_Testingset = pd.DataFrame(list(list_subfolders_and_count_files(testingset_path).items()), columns=['Subfolder', 'File Count_Testingset'])
df_Validationset = pd.DataFrame(list(list_subfolders_and_count_files(validationset_path).items()), columns=['Subfolder', 'File Count_Validationset'])

# Merge DataFrames
df_combined = df_Sampleset.merge(df_Testingset, on='Subfolder', suffixes=('_Sampleset', '_Testingset'))
df_combined = df_combined.merge(df_Validationset, on='Subfolder')
df_combined.rename(columns={'File Count': 'File Count_Validationset'}, inplace=True)

# Delete subfolders where File Count_Testingset < 5
for subfolder in df_combined[df_combined['File Count_Testingset'] < 5]['Subfolder']:
    for dataset_path in [sampleset_path, testingset_path, validationset_path]:
        subfolder_path = os.path.join(dataset_path, subfolder)
        if os.path.exists(subfolder_path):
            shutil.rmtree(subfolder_path)  # Deletes folder and all its contents
            print(f"Deleted: {subfolder_path}")

# Display the updated dataframe
df_combined.head()


Deleted: dataset/stagging/sampleset\Amy_Davidson
Deleted: dataset/stagging/testingset\Amy_Davidson
Deleted: dataset/stagging/validationset\Amy_Davidson
Deleted: dataset/stagging/sampleset\Andrea_Anders
Deleted: dataset/stagging/testingset\Andrea_Anders
Deleted: dataset/stagging/validationset\Andrea_Anders
Deleted: dataset/stagging/sampleset\Angell_Conwell
Deleted: dataset/stagging/testingset\Angell_Conwell
Deleted: dataset/stagging/validationset\Angell_Conwell
Deleted: dataset/stagging/sampleset\Annie_Ilonzeh
Deleted: dataset/stagging/testingset\Annie_Ilonzeh
Deleted: dataset/stagging/validationset\Annie_Ilonzeh
Deleted: dataset/stagging/sampleset\Barbara_Carrera
Deleted: dataset/stagging/testingset\Barbara_Carrera
Deleted: dataset/stagging/validationset\Barbara_Carrera
Deleted: dataset/stagging/sampleset\Billy_Boyd
Deleted: dataset/stagging/testingset\Billy_Boyd
Deleted: dataset/stagging/validationset\Billy_Boyd
Deleted: dataset/stagging/sampleset\Christopher_Lloyd
Deleted: dataset/st

Unnamed: 0,Subfolder,File Count_Sampleset,File Count_Testingset,File Count_Validationset
0,Alan_Alda,20,5,5
1,Alan_Arkin,20,5,5
2,Alec_Baldwin,20,5,5
3,Alfred_Molina,20,5,5
4,Alyssa_Milano,20,5,5


In [6]:
import os
import pandas as pd

def list_subfolders_and_count_files(root_directory):
    # Dictionary to store subfolder names and their file counts
    subfolder_file_counts = {}

    # Walk through the root directory
    for dirpath, dirnames, filenames in os.walk(root_directory):
        # We are only interested in immediate subfolders of the root directory
        if dirpath == root_directory:
            for dirname in dirnames:
                subfolder_path = os.path.join(dirpath, dirname)
                # Count the number of files in the subfolder
                file_count = len([f for f in os.listdir(subfolder_path) if os.path.isfile(os.path.join(subfolder_path, f))])
                subfolder_file_counts[dirname] = file_count

    return subfolder_file_counts



def save_subfolder_counts_to_dataframe(subfolder_counts, output_file):
    # Convert the dictionary to a DataFrame
    df = pd.DataFrame(list(subfolder_counts.items()), columns=['Subfolder', 'File Count'])
    
    return df

sampleset_path="dataset/stagging/sampleset"
subfolder_counts = list_subfolders_and_count_files(sampleset_path)
df_Sampleset = save_subfolder_counts_to_dataframe(subfolder_counts, "subfolder_counts.csv")

testingset_path="dataset/stagging/testingset"
subfolder_counts = list_subfolders_and_count_files(testingset_path)
df_Testingset = save_subfolder_counts_to_dataframe(subfolder_counts, "subfolder_counts.csv")

validationset_path="dataset/stagging/validationset"
subfolder_counts = list_subfolders_and_count_files(validationset_path)
df_Validationset = save_subfolder_counts_to_dataframe(subfolder_counts, "subfolder_counts.csv")

# Merge the dataframes on the "Subfolder" column
df_combined = df_Sampleset.merge(df_Testingset, on='Subfolder', suffixes=('_Sampleset', '_Testingset'))
df_combined = df_combined.merge(df_Validationset, on='Subfolder')
df_combined.rename(columns={'File Count': 'File Count_Validationset'}, inplace=True)

# Display the combined dataframe
df_combined.head()


Unnamed: 0,Subfolder,File Count_Sampleset,File Count_Testingset,File Count_Validationset
0,Alan_Alda,20,5,5
1,Alan_Arkin,20,5,5
2,Alec_Baldwin,20,5,5
3,Alfred_Molina,20,5,5
4,Alyssa_Milano,20,5,5


Unnamed: 0,Subfolder,File Count_Sampleset,File Count_Testingset,File Count_Validationset
0,Alan_Alda,20,5,5
1,Alan_Arkin,20,5,5
2,Alec_Baldwin,20,5,5
3,Alfred_Molina,20,5,5
4,Alyssa_Milano,20,5,5


In [4]:
len(df_combined)

182

In [22]:
df_temp=df_combined[
    (df_combined['File Count_Sampleset'] == 20) &
    (df_combined['File Count_Testingset'] == 0) &
    (df_combined['File Count_Validationset'] == 0)
]

array(['Alan Arkin', 'Alan Rickman', 'Alyssa Milano', 'Andy Richter',
       'Ashley Benson', 'Ashley Jones', 'Audrey Landers', 'Ben Kingsley',
       'Billy Boyd', 'Billy Zane', 'Bobbie Eakes', 'Brad Pitt',
       'Burt Reynolds', 'Carey Lowell', 'Caroline Dhavernas',
       'Cary Elwes', 'Channing Tatum', 'Cheryl Hines', 'Cheryl Ladd',
       'Chris Evans', 'Chris Rock', 'Christa Miller', 'Chyler Leigh',
       'Courteney Cox', 'Crystal Chappell', 'Danica McKellar',
       'David Cross', 'David Schwimmer', 'Dermot Mulroney',
       'Desmond Harrington', 'Diego Luna', 'Dina Meyer', 'Dustin Hoffman',
       'Edie Falco', 'Elijah Wood', 'Elizabeth Berkley',
       'Elizabeth Hendrickson', 'Farah Fath', 'Frances Fisher',
       'Gates McFadden', 'Geoffrey Rush', 'George Clooney',
       'Harrison Ford', 'Hayden Christensen', 'Heather Locklear',
       'Ioan Gruffudd', 'Jaden Smith', 'Jake Gyllenhaal', 'Jake Weber',
       'James Frain', 'James Franco', 'James McAvoy', 'January Jones',
  

In [17]:
df_Sampleset.head()

Unnamed: 0,Subfolder,File Count
0,Alan Arkin,20
1,Alan Rickman,20
2,Alyssa Milano,20
3,America Ferrera,20
4,Andrea Bowen,18


In [24]:
## import metadata
facescrub_df_actor = pd.read_csv('faceScrub/facescrub_actors.txt',delimiter='\t',header=None)
facescrub_df_actress = pd.read_csv('faceScrub/facescrub_actresses.txt',delimiter='\t',header=None)

#combine dataframe
facescrub_df=pd.concat([facescrub_df_actor,facescrub_df_actress],axis=0)

print(f"number of row (images) : {len(facescrub_df)}")

number of row (images) : 106865


In [31]:
# Step 1: Extract the first row as the new header
new_header = facescrub_df.iloc[0]

# Step 2: Set the new header
facescrub_df.columns = new_header

# Step 3: Drop the first row (now redundant)
df = facescrub_df[1:]

# Reset index (optional)
facescrub_df_new = df.reset_index(drop=True)

In [34]:
filtered_df = facescrub_df_new[facescrub_df_new["name"].isin(df_temp["Subfolder"].values)]

In [35]:
filtered_df.head()

Unnamed: 0,name,image_id,face_id,url,bbox,sha256
1419,Alan Arkin,2510,1428,http://upload.wikimedia.org/wikipedia/commons/...,156222548614,3c97c0f369c05381631178667925ee9821ca271fe6feb7...
1420,Alan Arkin,2511,1429,http://www.nndb.com/people/777/000022711/alan-...,5725172140,c38286df5ef80ca55cb5fddae185379acdb6db36bb8d11...
1421,Alan Arkin,2512,1430,http://ia.media-imdb.com/images/M/MV5BMjA4NDk5...,2685243302,b4df4f3a91905799aff06f8564330b60d304a60389e02b...
1422,Alan Arkin,2514,1431,http://3.bp.blogspot.com/--qQ0lGtma_M/T3Dp7trn...,7452232210,2d7e5907ef1d76f0e7c799cd3d4df01bc3ef7ad1ce4467...
1423,Alan Arkin,2515,1432,http://whatculture.com/wp-content/photos/Alan_...,8116193128,cf7f7569d842a5030662d37dde9d7fbc77ac4aa8fefe50...


In [37]:
# Make sure 'image_id' is of numeric type for proper sorting
filtered_df['image_id'] = pd.to_numeric(filtered_df['image_id'])

# Sort by 'name' and then by 'image_id' in descending order
filtered_df = filtered_df.sort_values(by=['name', 'image_id'], ascending=[True, False])

# Reset index if needed
filtered_df = filtered_df.reset_index(drop=True)

filtered_df.head()

Unnamed: 0,name,image_id,face_id,url,bbox,sha256
0,Alan Arkin,2850,1638,http://www.images99.com/i99/01/11318/11318.jpg,18347291155,bc98b2174020a261ca4f8cd9b912f4087d3e1b2d7b4062...
1,Alan Arkin,2845,1637,http://www.aceshowbiz.com/images/wennpic/newla...,39447508161,f83edaec4b768d29c284ccd52d17ad693712a93168b658...
2,Alan Arkin,2844,1636,http://www.superiorpics.com/wallpaper/file/Ala...,188109368289,4b100701994e3f658ca2d559600a7a4699d99259dfee0c...
3,Alan Arkin,2843,1635,http://famous-relationships.topsynergy.com/%21...,2438138152,e9ce19ad6406a9a01b43921cdac2d86cd0ebc193f0df45...
4,Alan Arkin,2842,1634,http://www.horsetrackhooligans.com/wp-content/...,2068178226,70ef23aa0bfbc755732bd4762aa084492dba2c2142b7b6...


In [None]:
def download_and_detect_faces(url, filename):
    detector = MTCNN()
    
    try:

        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        # Convert to numpy array 
        image_array = np.asarray(bytearray(response.content), dtype=np.uint8)
        image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    
        #check if image
        if image is None:
            return 0

        # Detect faces in the image
        result = detector.detect_faces(image)
        if not len(result) ==1:
            #print(f"no face detected in {filename} number of face detected: {len(result)}")
            return 0
            
        # Only save the image if faces detected
        cv2.imwrite(filename, image)
        return 1
        
    except (requests.exceptions.RequestException, cv2.error) as e:
        #remove image if not contain face
        if os.path.exists(filename):
            try:
                os.remove(filename)
            except OSError:
                pass
        return 0
    except Exception as e:
        # Catch any other unexpected errors
        if os.path.exists(filename):
            try:
                os.remove(filename)
            except OSError:
                pass
        return 0
    

    

def get_image_sample(ntest_person,nval_person,facescrub_df):
    current_num_person = 0

    list_person = []

    for current_person in filtered_df["name"].values:
        
        current_ntest_person = 0
        current_nval_person = 0
        
        
        
        if current_person in list_person:
            continue
        else:
        ##buat dataset dengan data orang tersebut
            df_persons=facescrub_df[facescrub_df[0]==current_person]
            
            train_path = 'dataset/stagging/sampleset/'+current_person
            val_path = 'dataset/stagging/validationset/'+current_person
            test_path = 'dataset/stagging/testingset/'+current_person
            ##buat folder untuk orang tersebut
            if not os.path.exists(train_path):
                
                os.makedirs(train_path)
            if not os.path.exists(test_path):
                os.makedirs(test_path)

            len_train = print(f"dwonload image person={current_person} num_image={df_persons.shape[0]}")
            
            
            list_index_person=[]
                    #get random index
                i = np.random.randint(0, df_persons.shape[0])
                if i in list_index_person:
                    continue
                else:
                    list_index_person.append(i)
                    ##ambil url gambar
                    url = df_persons.iloc[i,3]
                    ##download gambar
                    isdownload=download_and_detect_faces(url, train_path+'/'+current_person+'_'+str(current_ntrain_person+1)+'.jpg')
                    if isdownload==1:
                        current_ntrain_person+=1
                        if current_ntrain_person % 10 == 0:
                            print(f"sample_person set image added for {current_person}: {current_ntrain_person}")
            while current_nval_person<nval_person:
                i = np.random.randint(0, df_persons.shape[0])
                if i in list_index_person:
                    continue
                else:
                    list_index_person.append(i)    
                     ##ambil url gambar
                    url = df_persons.iloc[i,3]
                    ##download gambar
                    isdownload=download_and_detect_faces(url, val_path+'/'+current_person+'_'+str(current_nval_person+1)+'.jpg')
                    if isdownload==1:
                        current_nval_person+=1
                        if current_nval_person % 5 == 0:
                            print(f"val_person set image added for {current_person}: {current_nval_person}")
            while current_ntest_person<ntest_person:
                i = np.random.randint(0, df_persons.shape[0])
                if i in list_index_person:
                    continue
                else:
                    list_index_person.append(i)    
                     ##ambil url gambar
                    url = df_persons.iloc[i,3]
                    ##download gambar
                    isdownload=download_and_detect_faces(url, test_path+'/'+current_person+'_'+str(current_ntest_person+1)+'.jpg')
                    if isdownload==1:
                        current_ntest_person+=1
                        if current_ntest_person % 5 == 0:
                            print(f"test_person set image added fro {current_person}: {current_ntest_person}")
            list_person.append(current_person)
            current_num_person+=1
            print(f"===  Person added: {current_person} ===")


In [11]:
os.listdir('dataset/actress_face')

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'dataset/actress_face'

In [13]:
import os
import shutil
import random

# Old dataset paths (source)
actor_path = "dataset/actor_faces"
actress_path = "dataset/actress_faces"

# New dataset paths (destination)
new_sampleset_path = "dataset/mtcnn_face_fix/sampleset"
new_testingset_path = "dataset/mtcnn_face_fix/testingset"
new_validationset_path = "dataset/mtcnn_face_fix/validationset"

# Create directories if they don't exist
for path in [new_sampleset_path, new_testingset_path, new_validationset_path]:
    os.makedirs(path, exist_ok=True)

# Collect all person folders from actor and actress directories
person_folders = list(set(os.listdir(actor_path)) | set(os.listdir(actress_path)))  # Convert set to list

# Randomly select 200 persons (or all if less than 200)
selected_persons = random.sample(person_folders, min(200, len(person_folders)))

# Function to organize images into the new dataset
def organize_images(person_folder):
    # Collect images from both actor and actress folders
    all_images = []
    for dataset_path in [actor_path, actress_path]:
        person_path = os.path.join(dataset_path, person_folder)
        if os.path.isdir(person_path):
            all_images.extend([os.path.join(person_path, img) for img in os.listdir(person_path)])

    # Ensure we have at least 30 images
    if len(all_images) < 30:
        print(f"Skipping {person_folder} - Not enough images ({len(all_images)}/30).")
        return

    # Shuffle images for randomness
    random.shuffle(all_images)

    # Assign 20 images to sampleset, 5 to testingset, 5 to validationset
    sampleset_images = all_images[:20]
    testingset_images = all_images[20:25]
    validationset_images = all_images[25:30]

    # Copy images to new directories
    for img_list, target_path in zip([sampleset_images, testingset_images, validationset_images],
                                     [new_sampleset_path, new_testingset_path, new_validationset_path]):
        new_person_path = os.path.join(target_path, person_folder)
        os.makedirs(new_person_path, exist_ok=True)

        for img in img_list:
            shutil.copy2(img, os.path.join(new_person_path, os.path.basename(img)))

    print(f"Copied {person_folder}: 20 sampleset, 5 testingset, 5 validationset")

# Process only the selected 200 persons
for person in selected_persons:
    organize_images(person)


Copied Hank_Azaria: 20 sampleset, 5 testingset, 5 validationset
Copied Hector_Elizondo: 20 sampleset, 5 testingset, 5 validationset
Copied Rachel_Griffiths: 20 sampleset, 5 testingset, 5 validationset
Copied Mel_Gibson: 20 sampleset, 5 testingset, 5 validationset
Copied David_Schwimmer: 20 sampleset, 5 testingset, 5 validationset
Copied Antonio_Banderas: 20 sampleset, 5 testingset, 5 validationset
Copied Linda_Gray: 20 sampleset, 5 testingset, 5 validationset
Copied Cary_Elwes: 20 sampleset, 5 testingset, 5 validationset
Copied Staci_Keanan: 20 sampleset, 5 testingset, 5 validationset
Copied Katrina_Bowden: 20 sampleset, 5 testingset, 5 validationset
Copied Jim_Carrey: 20 sampleset, 5 testingset, 5 validationset
Copied Alexander_Skarsgård: 20 sampleset, 5 testingset, 5 validationset
Copied Shannen_Doherty: 20 sampleset, 5 testingset, 5 validationset
Copied Kevin_Connolly: 20 sampleset, 5 testingset, 5 validationset
Copied Richard_Madden: 20 sampleset, 5 testingset, 5 validationset
Cop