### Imports

In [1]:
# loading basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, random
import csv
import cv2
import shutil
from pathlib import Path

### Code formatting in Jupyter cells

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

### Constants

In [3]:
# Folder holding every source image, one level above the working directory.
path_to_all_images = os.path.join(os.pardir, "images/")

# Destination folders for the sampled train / test / validation images.
path_to_train_folder, path_to_test_folder, path_to_validation_folder = (
    os.path.join(os.pardir, "data", split_dir)
    for split_dir in ("train_images", "test_images", "valid_images")
)

<IPython.core.display.Javascript object>

In [4]:
# Maps each class id (as a string key) to its document category.
# The key is the label's position in the tuple below, so the order matters.
class_name = {
    str(position): label
    for position, label in enumerate(
        (
            "letter",
            "form",
            "email",
            "handwritten",
            "advertisement",
            "scientific report",
            "scientific publication",
            "specification",
            "file folder",
            "news article",
            "budget",
            "invoice",
            "presentation",
            "questionnaire",
            "resume",
            "memo",
        )
    )
}

<IPython.core.display.Javascript object>

## Train set

In [5]:
# converting .txt file to .csv file
def convertTxtToCsv(txtfile, csvfile=None):
    """Convert a space-separated text file into a CSV file.

    Parameters
    ----------
    txtfile : str or path-like
        Space-separated text file to read.
    csvfile : str or path-like, optional
        Destination CSV path. When omitted, the module-level ``csvfile``
        variable is used, which keeps one-argument calls working.

    Raises
    ------
    NameError
        If ``csvfile`` is omitted and no module-level ``csvfile`` exists.

    Notes
    -----
    ``pd.read_csv`` is called without ``header=None``, so the first line of
    ``txtfile`` becomes the column names and is written out as the CSV header
    row. ``to_csv`` also writes the row index as the first column. This
    layout is intentionally left unchanged.
    """
    if csvfile is None:
        # Backward-compatible fallback: read the destination from the
        # notebook's global namespace, as the original implementation did.
        try:
            csvfile = globals()["csvfile"]
        except KeyError:
            raise NameError("name 'csvfile' is not defined") from None
    df = pd.read_csv(txtfile, sep=" ")
    df.to_csv(csvfile)

<IPython.core.display.Javascript object>

In [7]:
# Label list for the training split and the CSV file it is converted to.
txtfile = os.path.join(os.path.pardir, "labels", "train.txt")
csvfile = os.path.join(os.path.pardir, "csv_files", "train.csv")

# convertTxtToCsv is called with the source only; it picks the destination up
# from the global `csvfile` assigned above, so that assignment must run first.
convertTxtToCsv(txtfile)

<IPython.core.display.Javascript object>

In [8]:
# Loading data
def loadData(csvfile):
    """Read a headerless CSV and return its 2nd and 3rd columns.

    The two selected columns (positions 1 and 2) are labelled ``"image"``
    and ``"class"``; every line of the file is treated as data.
    """
    column_labels = ["image", "class"]
    wanted_positions = [1, 2]
    frame = pd.read_csv(
        csvfile, names=column_labels, usecols=wanted_positions, header=None
    )
    return frame


# Read the training CSV written above and preview its first rows.
df = loadData(csvfile)
df.head()

Unnamed: 0,image,class
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


<IPython.core.display.Javascript object>

In [9]:
# take `sampleNo` images from each class (200 per class for training)
def sampleImages(df, sampleNo, random_state=None):
    """Draw ``sampleNo`` rows without replacement from every class of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"class"`` column to group on.
    sampleNo : int
        Number of rows to draw from each class.
    random_state : int, optional
        Seed for reproducible sampling. ``None`` (the default) keeps the
        sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns, grouped in ascending
        class order and re-indexed from 0.

    Raises
    ------
    ValueError
        If a class holds fewer than ``sampleNo`` rows (raised by
        ``DataFrame.sample``).
    """
    # One generator shared by all groups, so a single seed does not make
    # every class draw the same row positions.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Sample each group directly instead of going through groupby.apply, so
    # the "class" column is kept regardless of how apply treats grouping
    # columns.
    sampled_groups = [
        group.sample(sampleNo, random_state=rng) for _, group in df.groupby("class")
    ]
    if not sampled_groups:
        # No groups at all (empty input): return an empty frame, same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)


# Keep 200 randomly chosen rows per class, then overwrite the CSV at `csvfile`
# with the sampled frame.
df = sampleImages(df, 200)
df.to_csv(csvfile)

<IPython.core.display.Javascript object>

In [10]:
# df has only 3200 rows now (16 classes x 200 sampled rows each)
df

Unnamed: 0,image,class
0,imagesq/q/c/z/qcz89d00/500088319.tif,0
1,imagesv/v/n/n/vnn00a00/60020693.tif,0
2,imagesh/h/j/q/hjq96c00/70114304-4304.tif,0
3,imagesi/i/r/c/irc93f00/0000979946.tif,0
4,imagesw/w/j/r/wjr03c00/524592655+-2655.tif,0
...,...,...
3195,imagesj/j/m/k/jmk02d00/1338013.tif,15
3196,imagesv/v/z/k/vzk58c00/81745564_5569.tif,15
3197,imagesd/d/t/u/dtu83e00/2043616413_2043616417.tif,15
3198,imagesa/a/c/k/ack62d00/86253524_3528.tif,15


<IPython.core.display.Javascript object>

In [11]:
def changeDatatypes(df):
    """Cast ``df["image"]`` to ``str`` and ``df["class"]`` to ``int``, in place.

    The frame passed in is modified directly; nothing is returned.
    """
    target_types = {"image": str, "class": int}
    for column, dtype in target_types.items():
        df[column] = df[column].astype(dtype)


# Casts the columns of `df` in place; the function returns nothing.
changeDatatypes(df)

<IPython.core.display.Javascript object>

In [12]:
# create the '../data' folder (next to 'images') where we'll store the selected
# train, test and validation images, then make it the working directory
data_dir = os.path.join(os.path.pardir, "data")

# Create the same folder that is entered on the next line. The previous
# os.mkdir("data") created './data' instead, so os.chdir('../data') failed
# unless that folder already existed. exist_ok=True keeps the cell re-runnable.
os.makedirs(data_dir, exist_ok=True)
os.chdir(data_dir)

for split_folder in ("train_images", "test_images", "valid_images"):
    os.makedirs(split_folder, exist_ok=True)

<IPython.core.display.Javascript object>

In [13]:
# new directory where the 3200 images will be stored by category;
# groupDocsByCtegory creates its class subfolders in the current working
# directory, so we move into the train folder first
os.chdir(path_to_train_folder)

# create a subfolder for each category and copy each image to the corresponding subfolder
def groupDocsByCtegory(df, path_to_folder, n_classes=16):
    """Create one subfolder per class id and copy every image into its class folder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``"image"`` column (paths relative to the module-level
        ``path_to_all_images``) and a ``"class"`` column of class ids.
    path_to_folder : str
        Folder that holds the class subfolders. It must be valid from the
        PARENT of the current working directory, because copying happens
        after the directory change described below.
    n_classes : int, optional
        Number of class ids; subfolders ``"0"`` .. ``str(n_classes - 1)`` are
        created. Defaults to 16.

    Side effects
    ------------
    The subfolders are created in the current working directory, and the
    working directory is then moved one level up. Rows whose class id is
    outside ``range(n_classes)`` are skipped.
    """
    for class_id in range(n_classes):
        # exist_ok=True lets the cell be re-run after a partial failure.
        os.makedirs(str(class_id), exist_ok=True)
    os.chdir(os.path.pardir)

    valid_ids = range(n_classes)
    # Iterate the two columns positionally: no dependence on the index labels
    # and no inner scan over every class id for each row.
    for image_rel_path, class_id in zip(df["image"], df["class"]):
        if class_id not in valid_ids:
            continue
        shutil.copy2(
            os.path.join(path_to_all_images, image_rel_path),
            # Trailing separator: the destination must be an existing folder.
            os.path.join(path_to_folder, str(int(class_id)), ""),
        )


# Create the per-class subfolders and copy the sampled training images into them.
groupDocsByCtegory(df, path_to_train_folder)

<IPython.core.display.Javascript object>

In [14]:
# rename each folder using the class name
def renameFolders(df, path_to_folder):
    """Rename every numeric class subfolder of ``path_to_folder`` to its label.

    For each ``key -> label`` pair of the module-level ``class_name`` mapping,
    ``<path_to_folder>/<key>`` is renamed to ``<path_to_folder>/<label>``.
    ``df`` is accepted but not used.
    """
    for folder_id, label in class_name.items():
        current_path = "/".join((path_to_folder, str(folder_id)))
        renamed_path = "/".join((path_to_folder, label))
        os.rename(current_path, renamed_path)


# Rename the numeric train subfolders to their class names
# (`df` is passed but renameFolders does not use it).
renameFolders(df, path_to_train_folder)

<IPython.core.display.Javascript object>

## Test set

In [15]:
# converting .txt file to .csv file (test split)
txtfile = os.path.join(os.path.pardir, "labels", "test.txt")
csvfile = os.path.join(os.path.pardir, "csv_files", "test.csv")

# The destination is taken from the global `csvfile` reassigned just above.
convertTxtToCsv(txtfile)

<IPython.core.display.Javascript object>

In [16]:
# Loading data: `df` is rebound to the test split from here on
df = loadData(csvfile)
df.head()

Unnamed: 0,image,class
0,imagesr/r/g/e/rge31d00/503210033+-0034.tif,3
1,imagesc/c/e/j/cej80d00/517306722+-6724.tif,3
2,imagesm/m/r/r/mrr36d00/50603620-3621.tif,14
3,imagesg/g/t/u/gtu29c00/2084573574a.tif,2
4,imagesh/h/o/f/hof08d00/2071783492.tif,9


<IPython.core.display.Javascript object>

In [17]:
# take 100 images from each class for testing, overwrite the test CSV with the
# sample, then cast the column dtypes in place and display the result
df = sampleImages(df, 100)
df.to_csv(csvfile)
changeDatatypes(df)
df

Unnamed: 0,image,class
0,imagesw/w/s/y/wsy01c00/650486.tif,0
1,imagesw/w/w/g/wwg89d00/500261053.tif,0
2,imagesw/w/v/x/wvx09d00/502451788.tif,0
3,imagesa/a/h/p/ahp18e00/1001924328.tif,0
4,imagesz/z/r/c/zrc16c00/2024004763_4764.tif,0
...,...,...
1595,imagesy/y/c/d/ycd87e00/2046769244.tif,15
1596,imagesu/u/r/r/urr69c00/50176113-6113.tif,15
1597,imagesv/v/l/e/vle42f00/94009348.tif,15
1598,imagesn/n/f/c/nfc2aa00/10056109.tif,15


<IPython.core.display.Javascript object>

In [18]:
# new directory where the 1600 images will be stored by category;
# the class subfolders are created in the working directory, hence the chdir
os.chdir(path_to_test_folder)
groupDocsByCtegory(df, path_to_test_folder)

<IPython.core.display.Javascript object>

In [19]:
# rename each numeric test subfolder using the class name
renameFolders(df, path_to_test_folder)

<IPython.core.display.Javascript object>

## Validation set

In [20]:
# converting .txt file to .csv file (validation split)
txtfile = os.path.join(os.path.pardir, "labels", "val.txt")
csvfile = os.path.join(os.path.pardir, "csv_files", "validation.csv")

# The destination is taken from the global `csvfile` reassigned just above.
convertTxtToCsv(txtfile)

<IPython.core.display.Javascript object>

In [21]:
# Loading data: `df` is rebound to the validation split from here on
df = loadData(csvfile)
df.head()

Unnamed: 0,image,class
0,imagesg/g/t/h/gth35e00/2024525661.tif,11
1,imagesi/i/y/k/iyk38c00/512015827+-5827.tif,0
2,imagesr/r/r/e/rre21e00/87103403.tif,0
3,imagesk/k/s/u/ksu44c00/03636607.tif,4
4,imagesr/r/a/i/rai09d00/50437856-7857.tif,14


<IPython.core.display.Javascript object>

In [22]:
# take 100 images from each class for validation, overwrite the validation CSV
# with the sample, then cast the column dtypes in place and display the result
df = sampleImages(df, 100)
df.to_csv(csvfile)
changeDatatypes(df)
df

Unnamed: 0,image,class
0,imagesq/q/e/f/qef83d00/509107124_509107125.tif,0
1,imagesc/c/e/y/cey60f00/0011973382.tif,0
2,imagesj/j/b/c/jbc83f00/0001142419.tif,0
3,imagesh/h/j/s/hjs72f00/tob08901.15.tif,0
4,imagesr/r/l/r/rlr00e00/87037478_87037479.tif,0
...,...,...
1595,imagesm/m/j/c/mjc83f00/0001142315.tif,15
1596,imagesa/a/s/v/asv92e00/2049300700.tif,15
1597,imagesd/d/a/a/daa60e00/82918269_82918274.tif,15
1598,imagesj/j/d/f/jdf89e00/0000107579.tif,15


<IPython.core.display.Javascript object>

In [23]:
# new directory where the 1600 images will be stored by category;
# the class subfolders are created in the working directory, hence the chdir
os.chdir(path_to_validation_folder)
groupDocsByCtegory(df, path_to_validation_folder)

<IPython.core.display.Javascript object>

In [24]:
# rename each numeric validation subfolder using the class name
renameFolders(df, path_to_validation_folder)

<IPython.core.display.Javascript object>