In [8]:
# Mount Google Drive inside the Colab filesystem; every dataset path used
# below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Get all the necessary packages

In [0]:
import numpy as np
import pandas as pd
import os
import itertools
from PIL import Image
import pickle
import glob
import cv2

In [0]:
#### Aux functions

## Process image
def processImage(i, target_size=(224, 224)):
  """Load one image file and return it as a resized float32 pixel array.

  Args:
    i: Path of the image file to read.
    target_size: Size tuple handed to PIL's ``resize`` (PIL interprets it as
      ``(width, height)``). Defaults to the 224x224 size used by this notebook.

  Returns:
    A numpy array of dtype float32 holding the resized RGB pixels,
    shaped ``(height, width, 3)``.
  """
  # Relies on PIL's ``Image`` (imported at the top of the notebook); no
  # ``image`` helper module is imported in this notebook, so one cannot be used.
  # Converting to RGB keeps three channels for grayscale/RGBA/palette files.
  with Image.open(i) as img:
    resized = img.convert("RGB").resize(target_size)
  return np.asarray(resized, dtype="float32")

## Function that returns a dataframe with the name and the path of all the jpg in the input path
def getAllJPGFilesAugemented(path_dataset):
  """Index the jpg files that sit directly inside ``path_dataset``.

  Args:
    path_dataset: Directory to scan (not recursive).

  Returns:
    A DataFrame with one row per entry whose name contains "jpg":
    ``files`` is ``path_dataset + "/" + name`` and ``file_name`` is the bare name.
  """
  # List the directory a single time so both columns come from the same
  # snapshot and the listing cost is paid only once.
  jpg_names = [entry for entry in os.listdir(path_dataset) if "jpg" in entry]
  return pd.DataFrame(
      {'files': [path_dataset + "/" + name for name in jpg_names],
       'file_name': jpg_names
      })

def getAllJPGFiles(path_dataset):
    """Index the jpg files found in each immediate sub-folder of ``path_dataset``.

    Args:
        path_dataset: Root directory; only its direct sub-directories are
            scanned (files at the root itself are ignored).

    Returns:
        A DataFrame with one row per entry whose name contains "jpg":
        ``files`` is the path ``root/subfolder/name``, ``file_type`` is the
        sub-folder name and ``file_name`` is the bare file name.
    """
    subdirs = [d for d in os.listdir(path_dataset)
               if os.path.isdir(os.path.join(path_dataset, d))]

    dt_files = []
    dt_type_file = []
    dt_name = []
    for subdir in subdirs:
        subdir_path = path_dataset + "/" + subdir
        # One listing per sub-folder feeds all three columns, so they always
        # stay aligned and the folder is not re-read three times.
        for entry in os.listdir(subdir_path):
            if "jpg" in entry:
                dt_files.append(subdir_path + "/" + entry)
                dt_type_file.append(subdir)
                dt_name.append(entry)

    return pd.DataFrame(
                      {'files': dt_files,
                       'file_type': dt_type_file,
                       'file_name': dt_name
                      })


## Original and synthetic data

In [0]:
# Root folder of the dataset on the mounted Drive; getAllJPGFiles builds one
# row per jpg found in each of its immediate sub-folders.
path_dataset= "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer"
original = getAllJPGFiles(path_dataset)
def isDigital(aux):
    """Tell whether 'digital' occurs in ``aux`` (case-sensitive membership test)."""
    return 'digital' in aux

# Flag rows whose sub-folder name (file_type) contains "digital",
# then keep only the flagged rows.
original['digital'] = original['file_type'].apply(isDigital)
original = original.loc[original['digital'] ]

In [20]:
# Keep only the path and file-name columns, then preview the first two rows.
original = original[['files', 'file_name']]
original.head(2)


Unnamed: 0,files,file_name
159,/content/drive/My Drive/Team Bergere/Optimizer...,0638cd7a868ee31c370c9f43acc0399ce535617a.jpg
160,/content/drive/My Drive/Team Bergere/Optimizer...,0098fde1754c51028bb4785e9eb5ef39519c1e16.jpg


In [0]:
## Load HQ_digital synthetic data
path_dataset = "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/augmented_data/HQ_digital5"
df_augmented1 = getAllJPGFilesAugemented(path_dataset)
## Load MQ_digital synthetic data
path_dataset = "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/augmented_data/MQ_digital5"
df_augmented2 = getAllJPGFilesAugemented(path_dataset)
## Stack both augmented sets row-wise
df_augmented = pd.concat([df_augmented1, df_augmented2], axis=0)
## To be able to merge with the labels: keep the text before the first "_"
## and put the ".jpg" extension back on it
df_augmented['file_name'] = df_augmented['file_name'].apply(
    lambda name: name.split("_")[0] + ".jpg"
)


In [22]:
# (rows, columns) of the combined augmented-file table.
df_augmented.shape


(11075, 2)

In [0]:
# Append the augmented rows below the original digital pictures (row-wise concat).
train = pd.concat([original,df_augmented], axis = 0)

In [24]:
# (rows, columns) of the combined original + augmented table.
train.shape

(11848, 2)

### Load labels

In [26]:
## Read the label file (correct_label kept as text) and rename its image
## column to file_name so it can be merged with `train`
digital_all = pd.read_csv(
    "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/Digital_label.csv",
    sep=",",
    dtype={'correct_label': str},
)
digital_all = (
    digital_all[['image', 'correct_label', 'correct_cat']]
    .rename(columns={'image': 'file_name'})
)
digital_all.head(2)

Unnamed: 0,file_name,correct_label,correct_cat
0,ffd083a696ac010ca5a9a3cbecdb28a57a167732.jpg,75.0,H
1,fe1dc0a44b58c36fd9215a6d201920deefe7c309.jpg,58.0,M


In [0]:
# Left join: every picture row is kept, label columns are filled where
# file_name has a match in the label table.
train = pd.merge(train, digital_all, how='left', on='file_name')

In [28]:
# Preview the first two merged rows.
train.head(2)

Unnamed: 0,files,file_name,correct_label,correct_cat
0,/content/drive/My Drive/Team Bergere/Optimizer...,0638cd7a868ee31c370c9f43acc0399ce535617a.jpg,63.0,H
1,/content/drive/My Drive/Team Bergere/Optimizer...,0098fde1754c51028bb4785e9eb5ef39519c1e16.jpg,64.01,M


In [0]:
# Keep the rows that received a label, restrict to the columns used below,
# and expose correct_label under the shorter name `label`.
x_train = (
    train.loc[train['correct_label'].notna(), ['files', 'file_name', 'correct_label']]
    .rename(columns={'correct_label': 'label'})
)

In [38]:
# Preview the first two labelled rows.
x_train.head(2)

Unnamed: 0,files,file_name,label
0,/content/drive/My Drive/Team Bergere/Optimizer...,0638cd7a868ee31c370c9f43acc0399ce535617a.jpg,63.0
1,/content/drive/My Drive/Team Bergere/Optimizer...,0098fde1754c51028bb4785e9eb5ef39519c1e16.jpg,64.01


### Process Labels

Digital labels have the same format:

Digit 1 Digit 2 Digit 3 . Decimal 1 Decimal2

We need to make sure that all the labels have the same format:
- If the label doesn't have Decimal 2, we add "10" at the end
- If the label doesn't have Digit 1, we add "10" at the beginning
- We need to remove the point
- We add the len at the beginning

This way, the labels are a (1, 6) array with this format:

[len, Digit1, Digit2, Digit3, Decimal1, Decimal2]

For instance:
- 61.41 is labeled as [4 10 6 1 4 1]
- 124.1 is labeled as [5 1 2 4 1 10]
- 124.10 is labeled as [5 1 2 4 1 0]

In [0]:
## Keep labelled rows only. .copy() makes the result an independent frame so
## the column assignments below do not write into a slice of the previous one.
x_train = x_train.loc[~x_train['label'].isna()].copy()
#### Getting the correct label
## Number of characters of the raw label, decimal point included
x_train['len_label'] = x_train['label'].apply(str).apply(len)
## Only labels longer than 3 characters are kept
pictures_to_take = x_train['len_label']>3
x_train = x_train.loc[pictures_to_take].copy()
## Number of digits after the decimal point (labels are expected to contain a ".")
x_train['len_label_decimal'] =  x_train['label'].apply(lambda x: len(x.split('.')[1]))
## A label with a single decimal has no "Decimal 2": append the "x" placeholder
## (turned into "10" later). The test must look at the decimal count; comparing
## the text label itself with the number 1 is never true.
x_train['label'] = np.where(x_train['len_label_decimal'] == 1, x_train.label + "x", x_train.label)

def padding(s):
  """Left-pad ``str(s)`` with "x" up to 5 characters; longer text is returned unchanged."""
  return str(s).rjust(5, "x")
#train["padded"] = train["correct_label"].apply(padding)
# Remove the decimal point, then left-pad the remaining characters with "x"
x_train['label'] = x_train['label'].apply(
    lambda raw: padding(str(raw).replace(".", ""))
)

# Prefix (len_label - 1) as text, split the string into single characters
# and swap every "x" placeholder for "10"
x_train['final_label'] = (
    (x_train['len_label'] - 1).astype(str) + x_train['label']
).apply(lambda chars: ["10" if ch == "x" else ch for ch in chars])

In [40]:
# Preview the first four rows with the encoded final_label column.
x_train.head(4)

Unnamed: 0,files,file_name,label,len_label,len_label_decimal,final_label
0,/content/drive/My Drive/Team Bergere/Optimizer...,0638cd7a868ee31c370c9f43acc0399ce535617a.jpg,x6300,5,2,"[4, 10, 6, 3, 0, 0]"
1,/content/drive/My Drive/Team Bergere/Optimizer...,0098fde1754c51028bb4785e9eb5ef39519c1e16.jpg,x6401,5,2,"[4, 10, 6, 4, 0, 1]"
2,/content/drive/My Drive/Team Bergere/Optimizer...,17ef620d20013eda5358b375de69b79af632d8b6.jpg,x2601,5,2,"[4, 10, 2, 6, 0, 1]"
3,/content/drive/My Drive/Team Bergere/Optimizer...,08e5c29addcf21a37718722500e74e07a387834d.jpg,12501,6,2,"[5, 1, 2, 5, 0, 1]"


### Loading and saving images

In [0]:
## Done in chunks because it is easy to parallelize across different notebooks
## Save the picture in this format (224, 224, 3)
## Save the array in this format (1,6)

## Row ranges of x_train, one output file per range. Each range gets its own
## numbered file so that no chunk overwrites a previous one.
chunk_bounds = [(0, 3000), (3000, 6000), (6000, 9000), (9000, len(x_train))]
for part, (start, stop) in enumerate(chunk_bounds, start=1):
  ## .copy() so the new 'pictures' column is added to an independent frame
  train1 = x_train.iloc[start:stop].copy()
  train1['pictures'] = train1['files'].apply(processImage)
  out_path = '/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/save_data/data_good%d.pickle' % part
  with open(out_path, 'wb') as f:
    pickle.dump(train1[['pictures', 'final_label']], f)



### Preprocessed original and extra pictures


In [41]:
## Load and concatenate extra data created
path_dataset= "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/e2e_data/gen_t1"
gen1 = getAllJPGFilesAugemented(path_dataset)
gen1['file_name'] = gen1['file_name'].apply(lambda x: x.split(".jpg")[0])
path_dataset= "/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/e2e_data/gen_t2"
gen2 = getAllJPGFilesAugemented(path_dataset)
gen2['file_name'] = gen2['file_name'].apply(lambda x: x.split(".jpg")[0])
df_generated = pd.concat([gen1, gen2], axis = 0)
df_generated.head(3)

Unnamed: 0,files,file_name
0,/content/drive/My Drive/Team Bergere/Optimizer...,422fa3fd-8f77-4720-bb4b-cc3078bff9e1
1,/content/drive/My Drive/Team Bergere/Optimizer...,1e7606bd-9bbe-4606-b3f7-781e717b6129
2,/content/drive/My Drive/Team Bergere/Optimizer...,b3e45f5b-b2db-47a9-b16f-af2d1151019e


In [42]:
## Read labels
csv = glob.glob("/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/e2e_data/*/*.csv")
csvs = []
for i in csv:
  csvs.append(pd.read_csv(i,dtype = {'label':str}))
all_csv = pd.concat(csvs, axis = 0)
all_csv.columns = ['file_name', 'label']
all_csv.head(3)

Unnamed: 0,file_name,label
0,7142d8cc-e98f-4d79-bbfc-836f8d863538,167.31
1,12edbe1a-b148-4d93-9455-9c36505c0b44,466.46
2,3cdf33aa-325e-4f2e-b42d-081582077d68,43.57


In [0]:
# Left join: every generated picture is kept, `label` is filled where
# file_name has a match in the label table.
x_train = pd.merge(df_generated, all_csv, how='left', on='file_name')

In [46]:
# Preview the first three generated rows with their labels.
x_train.head(3)

Unnamed: 0,files,file_name,label
0,/content/drive/My Drive/Team Bergere/Optimizer...,422fa3fd-8f77-4720-bb4b-cc3078bff9e1,37.18
1,/content/drive/My Drive/Team Bergere/Optimizer...,1e7606bd-9bbe-4606-b3f7-781e717b6129,146.67
2,/content/drive/My Drive/Team Bergere/Optimizer...,b3e45f5b-b2db-47a9-b16f-af2d1151019e,887.61


### Process Labels

Digital labels have the same format:

Digit 1 Digit 2 Digit 3 . Decimal 1 Decimal2

We need to make sure that all the labels have the same format:
- If the label doesn't have Decimal 2, we add "10" at the end
- If the label doesn't have Digit 1, we add "10" at the beginning
- We need to remove the point
- We add the len at the beginning

This way, the labels are a (1, 6) array with this format:

[len, Digit1, Digit2, Digit3, Decimal1, Decimal2]

For instance:
- 61.41 is labeled as [4 10 6 1 4 1]
- 124.1 is labeled as [5 1 2 4 1 10]
- 124.10 is labeled as [5 1 2 4 1 0]


In [47]:
## Keep labelled rows only. .copy() makes the result an independent frame so
## the column assignments below do not write into a slice of the previous one.
x_train = x_train.loc[~x_train['label'].isna()].copy()
#### Getting the correct label
## Number of characters of the raw label, decimal point included
x_train['len_label'] = x_train['label'].apply(str).apply(len)
## Only labels longer than 3 characters are kept
pictures_to_take = x_train['len_label']>3
x_train = x_train.loc[pictures_to_take].copy()
## Number of digits after the decimal point (labels are expected to contain a ".")
x_train['len_label_decimal'] =  x_train['label'].apply(lambda x: len(x.split('.')[1]))
## A label with a single decimal has no "Decimal 2": append the "x" placeholder
## (turned into "10" later). The test must look at the decimal count; comparing
## the text label itself with the number 1 is never true.
x_train['label'] = np.where(x_train['len_label_decimal'] == 1, x_train.label + "x", x_train.label)

def padding(s):
  """Left-pad ``str(s)`` with "x" up to 5 characters; longer text is returned unchanged.

  Same behavior as the `padding` helper defined earlier in this notebook.
  """
  return str(s).rjust(5, "x")
#train["padded"] = train["correct_label"].apply(padding)
# Remove the decimal point, then left-pad the remaining characters with "x"
x_train['label'] = x_train['label'].apply(
    lambda raw: padding(str(raw).replace(".", ""))
)

# Prefix (len_label - 1) as text, split the string into single characters
# and swap every "x" placeholder for "10"
x_train['final_label'] = (
    (x_train['len_label'] - 1).astype(str) + x_train['label']
).apply(lambda chars: ["10" if ch == "x" else ch for ch in chars])
x_train.head(3)

Unnamed: 0,files,file_name,label,len_label,len_label_decimal,final_label
0,/content/drive/My Drive/Team Bergere/Optimizer...,422fa3fd-8f77-4720-bb4b-cc3078bff9e1,x3718,5,2,"[4, 10, 3, 7, 1, 8]"
1,/content/drive/My Drive/Team Bergere/Optimizer...,1e7606bd-9bbe-4606-b3f7-781e717b6129,14667,6,2,"[5, 1, 4, 6, 6, 7]"
2,/content/drive/My Drive/Team Bergere/Optimizer...,b3e45f5b-b2db-47a9-b16f-af2d1151019e,88761,6,2,"[5, 8, 8, 7, 6, 1]"


In [0]:
## Done in chunks because it is easy to parallelize across different notebooks
## Save the picture in this format (224, 224, 3)
## Save the array in this format (1,6)

## Row ranges of x_train, one output file per range. Each range gets its own
## numbered file so that no chunk overwrites a previous one.
chunk_bounds = [(0, 5000), (5000, 10000), (10000, 15000), (15000, 20000)]
for part, (start, stop) in enumerate(chunk_bounds, start=1):
  ## .copy() so the new 'pictures' column is added to an independent frame
  train1 = x_train.iloc[start:stop].copy()
  train1['pictures'] = train1['files'].apply(processImage)
  out_path = '/content/drive/My Drive/Team Bergere/Optimizer/Data/optimizer/save_data/data_extra%d.pickle' % part
  with open(out_path, 'wb') as f:
    pickle.dump(train1[['pictures', 'final_label']], f)
