Ce notebook a pour but de réorganiser les données du dataset RVL-CDIP.
Il permet également de sélectionner un sample de ces données.

In [1]:
import os
import shutil
import pandas as pd

In [None]:
source_folder_path = "./../data/raw/images"
target_folder_path = "./../data/raw/RVL-CDIP"

# Create the target folder. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(target_folder_path, exist_ok=True)

# Walk the whole source tree and copy every .tif file into the single, flat
# target folder.
# NOTE(review): the copy keeps only the base name of each file, so two source
# files with the same name in different sub-folders overwrite each other --
# confirm that base names are unique across the source tree.
for folder, _sub_folders, files in os.walk(source_folder_path):
    for file in files:
        if file.endswith('.tif'):
            shutil.copy(os.path.join(folder, file), target_folder_path)

In [2]:
# Build a CSV file that maps each picture (by base file name) to its category.
# The parsing below assumes every non-blank label line has the form
# "<path/to/picture> <target>", with a single space before the target.
target_files = ['./../data/raw/labels/test.txt',
                './../data/raw/labels/train.txt',
                './../data/raw/labels/val.txt']

# Collect the rows in a list and join once at the end, instead of growing one
# big string with += for every label line.
csv_lines = ["file,target\n"]

for file in target_files:
    with open(file, 'r') as f:
        for line in f:
            entry = os.path.basename(line.strip())
            if not entry:
                # Blank line (e.g. at the end of the file): nothing to map,
                # and unpacking it below would raise ValueError.
                continue
            file_name, target = entry.split(' ')
            csv_lines.append(f"{file_name},{target}\n")

csv_text = "".join(csv_lines)

# Save the mapping as pictures.csv
with open("./../data/raw/pictures.csv", "w") as f:
    f.write(csv_text)



In [4]:
# Build a smaller dataset for training, with the same number of pictures
# requested for every target.
number_to_select = 20000

pictures = pd.read_csv("./../data/raw/pictures.csv")
number_of_target = 16

# Round number_to_select down to a multiple of number_of_target so that the
# per-target quota is a whole number.
remainder = number_to_select % number_of_target
if remainder != 0:
    number_to_select -= remainder
    print(f"number_to_select is not divisible by number_of_target, changing number_to_select to {number_to_select}")

# Take the first rows of each target (in CSV order), targets 0..number_of_target-1.
# All slices are concatenated in a single call rather than re-copying a growing
# DataFrame on every loop iteration.
pictures_per_target = number_to_select // number_of_target
select_pictures = pd.concat(
    [pictures[pictures['target'] == i][:pictures_per_target]
     for i in range(number_of_target)]
)

# Print the shape and the per-target counts of the training selection
print(select_pictures.shape)
print(select_pictures['target'].value_counts())


(20000, 2)
target
0     1250
1     1250
2     1250
3     1250
4     1250
5     1250
6     1250
7     1250
8     1250
9     1250
10    1250
11    1250
12    1250
13    1250
14    1250
15    1250
Name: count, dtype: int64


In [3]:
# Copy the selected pictures into a folder; next to each picture, write a
# .txt file with the same base name that contains the picture's target.
selected_folder_path = "./../data/raw/selected"
target_folder_path = "./../data/raw/RVL-CDIP"

# exist_ok=True makes the cell safe to re-run without a separate exists() check.
os.makedirs(selected_folder_path, exist_ok=True)

# Iterate over the two columns directly; this avoids building a Series per row
# as DataFrame.iterrows() does.
# NOTE: rows that share the same file name write to the same destination, so
# the folder can end up holding fewer files than there are selected rows.
for file_name, target in zip(select_pictures['file'], select_pictures['target']):
    shutil.copy(os.path.join(target_folder_path, file_name), os.path.join(selected_folder_path, file_name))
    label_name = file_name.rsplit('.', 1)[0] + '.txt'
    with open(os.path.join(selected_folder_path, label_name), 'w') as f:
        f.write(str(target))

# Takes about 3 minutes for 5000 images

In [4]:
# Report how many .tif pictures and how many .txt label files the folder holds
folder_entries = os.listdir(selected_folder_path)
print(sum(1 for entry in folder_entries if entry.endswith('.tif')))
print(sum(1 for entry in folder_entries if entry.endswith('.txt')))

19999
19999


In [9]:
# Build a smaller dataset for testing.
pictures = pd.read_csv("./../data/raw/pictures.csv")

# Number of test pictures requested for every target.
test_pictures_per_target = 250

# Rows whose file name is not already part of the training selection.
# The mask does not depend on the target, so it is computed once here rather
# than once per target inside the loop.
not_in_training = ~pictures['file'].isin(select_pictures['file'])

# Take the first rows of each target (in CSV order) among the pictures that
# are not in the training selection, and concatenate them in one call.
select_test_pictures = pd.concat(
    [pictures[(pictures['target'] == i) & not_in_training][:test_pictures_per_target]
     for i in range(number_of_target)]
)

# Print the shape and the per-target counts of the test selection
print(select_test_pictures.shape)
print(select_test_pictures['target'].value_counts())

(4000, 2)
target
0     250
1     250
2     250
3     250
4     250
5     250
6     250
7     250
8     250
9     250
10    250
11    250
12    250
13    250
14    250
15    250
Name: count, dtype: int64


In [10]:
# Copy the selected test pictures into a folder; next to each picture, write a
# .txt file with the same base name that contains the picture's target.
selected_folder_path = "./../data/raw/selected_test"
target_folder_path = "./../data/raw/RVL-CDIP"

# exist_ok=True makes the cell safe to re-run without a separate exists() check.
os.makedirs(selected_folder_path, exist_ok=True)

# Iterate over the two columns directly; this avoids building a Series per row
# as DataFrame.iterrows() does.
for file_name, target in zip(select_test_pictures['file'], select_test_pictures['target']):
    shutil.copy(os.path.join(target_folder_path, file_name), os.path.join(selected_folder_path, file_name))
    label_name = file_name.rsplit('.', 1)[0] + '.txt'
    with open(os.path.join(selected_folder_path, label_name), 'w') as f:
        f.write(str(target))

In [11]:
# Count the .tif pictures and the .txt label files present in the folder
listed_names = os.listdir(selected_folder_path)
tif_count = sum(1 for name in listed_names if name.endswith('.tif'))
txt_count = sum(1 for name in listed_names if name.endswith('.txt'))
print(tif_count)
print(txt_count)

4000
4000


In [3]:
# Build a small dataset for Streamlit purposes.
pictures = pd.read_csv("./../data/raw/pictures.csv")

# Draw 2000 pictures at random from the whole CSV.
# (An earlier variant of this cell, since removed, took the first 100 pictures
# of each of the 16 targets instead of a random sample.)
# NOTE(review): no random_state is passed, so the sample presumably differs
# from run to run unless the random generator is seeded elsewhere -- pass
# random_state if a reproducible sample is needed.
select_random_pictures = pictures.sample(n=2000)

# Print the shape and the per-target counts of the random selection
print(select_random_pictures.shape)
print(select_random_pictures['target'].value_counts())

# Copy the randomly selected pictures into a folder; next to each picture,
# write a .txt file with the same base name that contains the picture's target.
selected_folder_path = "./../data/raw/selected_streamlit"
target_folder_path = "./../data/raw/RVL-CDIP"

# exist_ok=True makes the cell safe to re-run without a separate exists() check.
os.makedirs(selected_folder_path, exist_ok=True)

# Iterate over the two columns directly; this avoids building a Series per row
# as DataFrame.iterrows() does.
for file_name, target in zip(select_random_pictures['file'], select_random_pictures['target']):
    shutil.copy(os.path.join(target_folder_path, file_name), os.path.join(selected_folder_path, file_name))
    label_name = file_name.rsplit('.', 1)[0] + '.txt'
    with open(os.path.join(selected_folder_path, label_name), 'w') as f:
        f.write(str(target))

# Print the number of .tif files and of .txt files in the folder.
# The folder is listed once and both counts are derived from that listing.
folder_entries = os.listdir(selected_folder_path)
print(sum(1 for entry in folder_entries if entry.endswith('.tif')))
print(sum(1 for entry in folder_entries if entry.endswith('.txt')))

(2000, 2)
target
7     139
13    134
11    133
1     132
12    130
14    125
6     125
15    124
0     124
3     124
9     123
2     123
4     120
8     119
10    119
5     106
Name: count, dtype: int64
2000
2000
