# 1. Data structuring

In [None]:
from ast import Expression
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import os
from typing import List
import shutil

# Dataset locations. `raw_dataset_dir` holds the original download; `dataset_dir`
# receives one sub-folder per facial expression.
project_dir = Path("/content/drive/MyDrive/ml_final_assignment/final_assignment")
raw_dataset_dir = project_dir / "raw_dataset"
dataset_dir = project_dir / "dataset"

# One target folder per expression label.
happy_dir = dataset_dir / "happy"
angry_dir = dataset_dir / "angry"
sad_dir = dataset_dir / "sad"
neutral_dir = dataset_dir / "neutral"

In [None]:
# Copy every raw image into the folder of the expression named in its file name.
# The first matching keyword wins, checked in this order: happy, angry, sad, neutral.
expression_targets = {
  "happy": happy_dir,
  "angry": angry_dir,
  "sad": sad_dir,
  "neutral": neutral_dir,
}

# shutil.copy raises if the destination folder is missing, so create the targets first.
for target_dir in expression_targets.values():
  target_dir.mkdir(parents=True, exist_ok=True)

# Every directory below the raw dataset root; kept because other cells may read it.
subfolders = [
  {"subfolder_name": subfolder.name, "subfolder_path": subfolder}
  for subfolder in raw_dataset_dir.rglob("*")
  if subfolder.is_dir()
]

# Walk the tree exactly once. The previous version ran rglob on every collected
# sub-folder, so a nested file was copied once per ancestor folder, and files placed
# directly in the raw dataset root were never visited.
for path in raw_dataset_dir.rglob("*"):
  print("File : ",  path.name)
  if not path.is_file():
    continue

  lowered_name = path.name.lower()
  for keyword, target_dir in expression_targets.items():
    if keyword in lowered_name:
      shutil.copy(str(path), str(target_dir / path.name))
      break

File :  kawamura_left_angry_sunglasses.png
File :  kawamura_straight_angry_sunglasses_4.png
File :  kawamura_right_neutral_sunglasses.png
File :  kawamura_right_sad_open_2.png
File :  kawamura_right_sad_open_4.png
File :  kawamura_right_neutral_open_4.png
File :  kawamura_right_neutral_open_2.png
File :  kawamura_right_neutral_sunglasses_2.png
File :  kawamura_right_neutral_sunglasses_4.png
File :  kawamura_right_neutral_open.png
File :  kawamura_right_sad_sunglasses.png
File :  kawamura_right_sad_open.png
File :  kawamura_right_happy_sunglasses_2.png
File :  kawamura_right_happy_sunglasses_4.png
File :  kawamura_right_happy_open_2.png
File :  kawamura_right_angry_sunglasses.png
File :  kawamura_right_happy_open_4.png
File :  kawamura_right_happy_open.png
File :  kawamura_right_angry_sunglasses_2.png
File :  kawamura_right_angry_sunglasses_4.png
File :  kawamura_right_angry_open_4.png
File :  kawamura_left_sad_open.png
File :  kawamura_right_angry_open.png
File :  kawamura_left_sad_ope

In [None]:
# Delete files with a ".bad" extension from each expression folder, then report
# how many files are left in that folder (sub-folders included).
for expression_dir in [happy_dir, angry_dir, sad_dir, neutral_dir]:
  total_file = 0
  # Snapshot the listing first: files are deleted inside the loop, and the
  # directory should not change underneath a lazy traversal.
  for path in list(expression_dir.rglob("*")):
    if not path.is_file():
      continue

    if path.suffix == ".bad":
      print(f"File : {path.name} has a .bad extension")
      # remove file from folder
      path.unlink()
      print(f"File : {path.name} has been removed" )
      # A removed file no longer exists, so it must not be part of the total.
      continue

    total_file += 1

  print(f"Total files in {expression_dir.name} : ", total_file)

Total files in happy :  311
Total files in angry :  310
Total files in sad :  314
Total files in neutral :  320


In [None]:
# Report how many files each expression folder holds, counting files in sub-folders too.
for expression_dir in (happy_dir, angry_dir, sad_dir, neutral_dir):
  total_file = sum(1 for entry in expression_dir.rglob("*") if entry.is_file())
  print(f"Total files in {expression_dir.name} : ", total_file)

Total files in happy :  311
Total files in angry :  310
Total files in sad :  314
Total files in neutral :  320


In [None]:
# Split each expression folder into 80% training, 10% validation, and 10% testing.
# The files are copied (not moved) into dataset/<split>/<expression>/.
import random

train_dir = Path.joinpath(dataset_dir, "train")
test_dir = Path.joinpath(dataset_dir, "test")
validation_dir = Path.joinpath(dataset_dir, "validation")

# Fractions for training and validation; the test split receives the remainder.
train_ratio = 0.8
val_ratio = 0.1

for d in [train_dir, test_dir, validation_dir]:
  d.mkdir(parents=True, exist_ok=True)

for expression_dir in [happy_dir, angry_dir, sad_dir, neutral_dir]:
  files = [path for path in expression_dir.iterdir() if path.is_file()]
  random.shuffle(files)

  # Derive the slice bounds from the actual file count. The previous fixed bounds
  # (372 / 418 / 465) were larger than any expression folder, so every file went
  # to training and the test and validation folders stayed empty.
  total = len(files)
  train_end = int(total * train_ratio)
  val_end = train_end + int(total * val_ratio)

  train_files = files[:train_end]
  validation_files = files[train_end:val_end]
  test_files = files[val_end:]

  train_folder = train_dir / expression_dir.name
  test_folder = test_dir / expression_dir.name
  validation_folder = validation_dir / expression_dir.name

  for folder in [train_folder, test_folder, validation_folder]:
    folder.mkdir(exist_ok=True)

  # NOTE: the shuffle is unseeded and existing copies are not cleared, so running
  # this cell twice can place the same image in more than one split.
  for path in train_files:
    shutil.copy(str(path), str(train_folder / path.name))
  for path in test_files:
    shutil.copy(str(path), str(test_folder / path.name))
  for path in validation_files:
    shutil.copy(str(path), str(validation_folder / path.name))

In [None]:
# Ratio-based train / validation / test split written to the "*_without_32" folders.
# NOTE(review): nothing in this cell filters by image size; presumably it is meant
# to run after the 32 x 30 images have been removed from the expression folders - confirm.

import random

train_dir_without30 = dataset_dir / "train_without_32"
test_dir_without30 = dataset_dir / "test_without_32"
validation_dir_without30 = dataset_dir / "validation_without_32"

# Define split ratios
train_ratio = 0.8
val_ratio = 0.1
# Not read below: the test split is whatever remains after train and validation.
test_ratio = 0.1

for split_root in (train_dir_without30, test_dir_without30, validation_dir_without30):
  split_root.mkdir(parents=True, exist_ok=True)

for expression_dir in (happy_dir, angry_dir, sad_dir, neutral_dir):
  files = [entry for entry in expression_dir.iterdir() if entry.is_file()]
  random.shuffle(files)

  # Slice bounds are truncated to whole files; rounding leftovers fall into the test split.
  total = len(files)
  train_end = int(total * train_ratio)
  val_end = train_end + int(total * val_ratio)

  train_files = files[:train_end]
  validation_files = files[train_end:val_end]
  test_files = files[val_end:]

  train_folder = train_dir_without30 / expression_dir.name
  test_folder = test_dir_without30 / expression_dir.name
  validation_folder = validation_dir_without30 / expression_dir.name

  for folder in (train_folder, test_folder, validation_folder):
    folder.mkdir(exist_ok=True)

  # Copy each split into its own folder: train first, then test, then validation.
  copy_plan = (
    (train_files, train_folder),
    (test_files, test_folder),
    (validation_files, validation_folder),
  )
  for split_files, split_folder in copy_plan:
    for path in split_files:
      shutil.copy(str(path), str(split_folder / path.name))

### Remove 32 x 30 pixel images

In [None]:
# Delete every image whose size is exactly 32 x 30 pixels from the expression folders.
from PIL import Image
# NOTE(review): this folder is created but nothing below copies or moves images into
# it - the matching files are deleted permanently. Confirm whether a backup was intended.
backup_dir = Path("/content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/32_30_pixel_images")
backup_dir.mkdir(exist_ok=True)

for expression_dir in [happy_dir, sad_dir, neutral_dir, angry_dir]:
  # Snapshot the listing first because files are deleted inside the loop.
  for path in list(Path(expression_dir).iterdir()):
    if not path.is_file():
      continue

    # Read the size inside a context manager so the file handle is closed before
    # the file is deleted; the previous version left every opened image open.
    with Image.open(path) as img:
      is_32_by_30 = img.size == (32, 30)

    if is_32_by_30:
      print(f"Image : {str(path)} | 32 x 30 pixels")
      path.unlink()

Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_right_happy_sunglasses_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_right_happy_open_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_left_happy_open_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_left_happy_sunglasses_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_up_happy_sunglasses_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_straight_happy_open_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_assignment/dataset/happy/kawamura_straight_happy_sunglasses_4.png | 32 x 30 pixels
Image : /content/drive/MyDrive/ml_final_assignment/final_a

In [None]:
# Re-count the files in each expression folder (sub-folders included).
for expression_dir in (happy_dir, angry_dir, sad_dir, neutral_dir):
  file_paths = [entry for entry in expression_dir.rglob("*") if entry.is_file()]
  total_file = len(file_paths)
  print(f"Total files in {expression_dir.name} : ", total_file)

Total files in happy :  311
Total files in angry :  310
Total files in sad :  314
Total files in neutral :  320


In [None]:
# Print the number of files in every expression folder of the "*_without_32" splits.
train_dir_without30 = Path.joinpath(dataset_dir, "train_without_32")
test_dir_without30 = Path.joinpath(dataset_dir, "test_without_32")
validation_dir_without30 = Path.joinpath(dataset_dir, "validation_without_32")

# The loop variable was previously named `dir`, which replaced the builtin dir()
# for the rest of the notebook session.
for split_dir in [train_dir_without30, test_dir_without30, validation_dir_without30]:
  for expression_dir in split_dir.iterdir():
    # Skip stray files in the split folder; only sub-folders are expression classes.
    if not expression_dir.is_dir():
      continue

    print(f"FOLDER : {split_dir.name} / {expression_dir.name}")
    total_file = sum(1 for path in expression_dir.rglob("*") if path.is_file())

    print(f"Total file in {expression_dir.name} : {total_file}")
    print("=================================================")

FOLDER : train_without_32 / happy
Total file in happy : 248
FOLDER : train_without_32 / angry
Total file in angry : 248
FOLDER : train_without_32 / sad
Total file in sad : 251
FOLDER : train_without_32 / neutral
Total file in neutral : 256
FOLDER : test_without_32 / happy
Total file in happy : 32
FOLDER : test_without_32 / angry
Total file in angry : 31
FOLDER : test_without_32 / sad
Total file in sad : 32
FOLDER : test_without_32 / neutral
Total file in neutral : 32
FOLDER : validation_without_32 / happy
Total file in happy : 31
FOLDER : validation_without_32 / angry
Total file in angry : 31
FOLDER : validation_without_32 / sad
Total file in sad : 31
FOLDER : validation_without_32 / neutral
Total file in neutral : 32
