# **Seminararbeit**
von Leon Lantz

## **🚀 Bibliotheken-Import und CUDA-Verfügbarkeit**

In [1]:
import torch
import os
from diffusers import StableDiffusionPipeline
from diffusers import DiffusionPipeline
import shutil
import json
import mediapipe as mp
from torch import autocast
import random
import cv2
from pycocotools.coco import COCO

print("Cuda verfügbar? -->", torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

torch.cuda.empty_cache()
torch.cuda.ipc_collect()

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Cuda verfügbar? --> True
NVIDIA GeForce RTX 4070 Ti


## **🖼️ COCO Dataset** 
https://cocodataset.org/

- mehr als 200.000 reale Bilder aus unterschiedlichsten Szenarien
- unterteilt in 80 Kategorien (Personen, Fahrzeuge, Tiere, ...)
- jedes Bild detailliert annotiert, unter anderem mit Bildunterschriften (Captions)


In [2]:
# Ordner-Pfade
results_path  = 'results/'
images_path = 'coco_dataset/train2017'
annotations_path = 'coco_dataset/annotations/instances_train2017.json'
captions_path = 'coco_dataset/annotations/captions_train2017.json'

In [3]:
# Lade die COCO-Annotationen
with open(annotations_path, 'r') as f:
    coco_annotations = json.load(f)

# Lade die COCO-Annotationen
with open(captions_path, 'r') as f:
    coco_captions = json.load(f)


# Erstelle ein Mapping für ID und Bild-Pfad
mapping_filename = {}
for image in coco_annotations['images']:
    mapping_filename[image['id']] = image['file_name']

def get_filename_from_image_id(image_id):
    return mapping_filename.get(image_id, "Bild-ID nicht gefunden")


# Erstelle ein leeres Dictionary für das Mapping
image_id_to_caption = {}

# Durchlaufe die Liste der Captions und fülle das Mapping
for annotation in coco_captions['annotations']:
    image_id = annotation['image_id']
    caption = annotation['caption']
    if image_id not in image_id_to_caption:
        image_id_to_caption[image_id] = [caption]
    else:
        image_id_to_caption[image_id].append(caption)

def get_caption_from_image_id(image_id):
    captions_list = image_id_to_caption.get(image_id, "Bild-ID nicht gefunden")
    
    # Sortiere die Bildunterschriften nach der Länge der Wörter
    sorted_captions = sorted(captions_list, key=lambda x: len(x.split()), reverse=True)
    
    # Gib die erste (detaillierteste) Bildunterschrift zurück
    return sorted_captions[0]

Verwendung des COCO-Helpers um nur Bilder der Kategorie "Person" herauszufiltern

In [4]:
# Initialisiere COCO-Helper
coco = COCO(annotations_path)

# Hole alle Kategorien
categories = coco.loadCats(coco.getCatIds())
category_names = [cat['name'] for cat in categories]

# Finde die ID der Kategorie "person" --> Nur Bilder die im Dataset bereits als "Person" markiert
category_id = coco.getCatIds(catNms=['person'])

# Hole alle Annotationen der Kategorie "person"
ann_ids = coco.getAnnIds(catIds=category_id)
anns = coco.loadAnns(ann_ids)

# Hole die Bild-IDs der Annotationen
image_ids = list(set([ann['image_id'] for ann in anns]))

loading annotations into memory...
Done (t=9.65s)
creating index...
index created!


## **🙂 Gesichtserkennung**

In [5]:
def detectFace(img_name):
    mp_face_detection = mp.solutions.face_detection

    def get_rectangle(objDictionary, ss):
        left = int(objDictionary.xmin * ss[1])
        top = int(objDictionary.ymin * ss[0])
        right = int(left + objDictionary.width * ss[1])
        bottom = int(top + objDictionary.height * ss[0])
        return ((left, top), (right, bottom))

    with mp_face_detection.FaceDetection(model_selection=3, min_detection_confidence=0.8) as face_detection:
        image = cv2.imread(img_name)
        results = face_detection.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    if not results.detections:
        return None

    faces = []

    for obj in results.detections:
        ff = get_rectangle(obj.location_data.relative_bounding_box, image.shape)
        (left, top), (right, bottom) = ff
        faces.append(image[top:bottom, left:right])

    return faces

In [6]:
def prune_detected_faces(folder_name):
  files = os.listdir(folder_name)

  r_count = 0
  for f_name in files:
    faces = detectFace(os.path.join(folder_name,f_name))
    if not faces: 
      os.remove(os.path.join(folder_name,f_name))
      r_count += 1
  
  return r_count

## **✂️ Extrahieren von realen Gesichtern**

In [7]:
def find_faces(img_path, save_path, num_faces_wanted, face_res = 100):

  if os.path.exists(save_path):
    shutil.rmtree(save_path)
  os.mkdir(save_path)

  count = 0

  for img_id in random.sample(image_ids, num_faces_wanted*10):

    if count >= num_faces_wanted: break

    f_name = get_filename_from_image_id(img_id)

    faces = detectFace(os.path.join(img_path,f_name))
    if not faces: continue

    for face in faces:
      if not face.size: continue
      face = cv2.resize(face,(face_res,face_res))
      cv2.imwrite(os.path.join((f'{save_path}'), str(count)+'.jpeg'), face)
      count += 1      

  return count

In [9]:
num_runs = 2
num_faces_wanted = 1000
face_res = 100

for n in range(num_runs):
    faces_generated = find_faces(images_path, f'{results_path}/realFaces{n}', num_faces_wanted, face_res) # type: ignore
    print(f'{faces_generated} Gesichter wurden im Ordner realFaces{n} generiert.')
    x = prune_detected_faces((f'{results_path}/realFaces{n}')) # type: ignore
    print(f'{x} Gesichter wurden entfernt.')



1000 Gesichter wurden im Ordner realFaces0 generiert.
193 Gesichter wurden entfernt.


## **🔄 Generierung mit Stable Diffusion v1-4**

In [9]:
def select_model_pipeline(name):
    if name == "stable-diffusion-v1-4":
        # Instanziiere eine Stable Diffusion Pipeline aus dem Modell "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16)  
        pipe.to("cuda")
    elif name == "stable-diffusion-xl-base-1.0":
        # Instanziiere eine Stable Diffusion Pipeline aus dem Modell "CompVis/stable-diffusion-xl-base-1.0"
        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
        pipe.to("cuda")
    
    return pipe  # Rückgabe der Pipeline am Ende der Funktion

pipe = select_model_pipeline("stable-diffusion-v1-4")

unet\diffusion_pytorch_model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]



In [32]:
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16)  
pipe.to("cuda")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionXLPipeline {
  "_class_name": "StableDiffusionXLPipeline",
  "_diffusers_version": "0.28.2",
  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
  "feature_extractor": [
    null,
    null
  ],
  "force_zeros_for_empty_prompt": true,
  "image_encoder": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [10]:
def find_faces_generated(save_path_images, save_path_faces, num_faces_wanted, face_res=100):

  if os.path.exists(save_path_images):
    shutil.rmtree(save_path_images)    
  os.mkdir(save_path_images)

  if os.path.exists(save_path_faces):
    shutil.rmtree(save_path_faces)    
  os.mkdir(save_path_faces)

  count_f = 0
  count_i = 0

  for img_id in random.sample(image_ids, num_faces_wanted*10):

    if count_f >= num_faces_wanted: break

    caption = get_caption_from_image_id(img_id)
    caption += ". Enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. Ensure clear and detailed facial expressions for a captivating result."
    prompt = caption.lower()
    print(prompt)

    # generate image
    with autocast("cuda"):
      image = pipe(prompt).images[0] 
      image.save(f'{save_path_images}/{count_i}.jpeg')
      count_i += 1

    faces = detectFace(os.path.join(f'{save_path_images}/', str(count_i-1)+'.jpeg'))
    if not faces: continue

    for face in faces:
      if not face.size: continue
      face = cv2.resize(face,(face_res,face_res))
      cv2.imwrite(os.path.join(f'{save_path_faces}/', str(count_f)+'.jpeg'), face)
      count_f += 1     

  return count_i, count_f

In [31]:
# Lösche den GPU-Cache
torch.cuda.empty_cache()

In [12]:
num_runs = 1
processed_images = set()
num_faces_wanted = 250
face_res = 100

for n in range(num_runs):
  images_generated, faces_generated = find_faces_generated(f'{results_path}/imagesGenerated{n}', f'{results_path}/facesGenerated{n}', num_faces_wanted, face_res)
  print(f'{faces_generated} faces were generated in folder {results_path}/facesGenerated{n}')
  print(f'{images_generated} faces were generated in folder {results_path}/imagesGenerated{n}')
  x = prune_detected_faces((f'{results_path}/facesGenerated{n}'))
  print(f'{x} faces were removed')

a picture of two giraffes, fairly close to a road, with a bus traveling up it.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a woman in a blue shirt sitting on a park bench next to a large backpack.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a parking lot with several buses and a few cars parked there.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

two cross country skiers make their way along a snow covered trail between the evergreen trees.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a woman reaching for a piece of bread in a oven.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

up close picture of baseball batter wearing gloves and helmet.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

two men in red coats on horse back with a third man waking behind in the same type of coat.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

two glasses or wine, one red and one white, sit with two bottles of wine while people mingle in the background.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a couple of kids are dancing together in a room. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a black and white photo of a skateboarder skating with people watching.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

there is a family posing in front of a old bus. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a black cow and calf with a little boy standing by a ugly building.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man on a snowboard with goggles on, and other people on skis behind him on a snow covered ground area.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man taking picture of another man standing next to a baseball player.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a skate boarder is doing a trick on the side of an empty pool.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man has hit the baseball and is preparing to run the bases.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a young boy holds a tennis racket on a tennis court.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

the street sign has on outline of a man with a question mark on it. . enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man in a bicycle helmet holding a cup of ice cream.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a girl sitting and smiling at a desk with a computer.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man sitting at a table about to enjoy a bowl of steamed broccoli.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

tows tied up to a fence and a man walking on a sidewalk with a sack on his head. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man wearing an orange vest. taking items from the back of a white truck. . enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a man stands in front of a plane in black and white. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

little children running and playing in a soccer match as a parent watches. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

this is a sub sandwich made with chicken and tomato.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

young man holding a tennis racket while standing in front of a fence.. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

two young people in their ski gear are posing for the camera. . enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

a couple of people are riding on the back of horses. enhance the image to emphasize facial features, such as expressive eyes and radiant smiles. ensure clear and detailed facial expressions for a captivating result.


  0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 