In this notebook I build a flexible function which reads and processes a large number of images by managing the RAM memory and by saving the result later uses.

I use this function to train a LightGBM for that I applied an image pooling in 4x4 and flattened the result and then I summed the RGB colors, which gives me a dataframe of 65,536 columns + 1 columns of out put and 24,000 rows which corresponds to 24,000 images stored on 6 GB of RAM.

This can be modified according to your needs. if you want to train classic models and not CNNs.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tqdm
from PIL import Image
import glob
import skimage.measure
import gc

In [2]:
# nbr_images : the quantity of images that we want to import

# df : - If a DataFrame that contains images exists and we want to concatenate 
#        with the images that this function is going to import, we add it. 
#        the return of the function is the existing dataframe concatenated with the new.
#      - If NaN the function creates a new DataFrame which contains the imported images.

# file_name : The name of the file into which we want to import these images. Cover, JMiPOD, JUNIWARD or UERD.

# from_ : From which image you can start importing. If 1000 entered, the function will start importing from the thousandth image.

# status : It is a gadget, it takes 'neg' or 'pos', if 'neg' it adds an output column at the end of the DataFrame equal to 0,
#          if pos it adds an output column equal to 1. it is to distinguish whether the image hides a message or not.



def img_reader(nbr_images = 10, df = None, file_name = 'Cover', from_ = 0, status = 'neg') :
    from_ = from_
    nbr_images  = nbr_images
    image_list = []
    i=0
    j=0
    df = df
    file_name = file_name
    for filename in tqdm.tqdm(glob.glob('../input/alaska2-image-steganalysis/'+file_name+'/*.jpg')): 
        if j >= from_ :
            im=mpimg.imread(filename)
            im=skimage.measure.block_reduce(im, (2,2,1), np.max) # Drop this step to not apply the image pooling.
            image_list.append(np.sum(im.reshape((d3, d1*d2)), axis = 0).tolist()) # d3 = 3, d1 and d2 = 256, without pooling this may be 3 * 512 * 512, np.sum() is for sum rgb. 
            i+=1
            if i%1000 == 0 : # is for concat DataFrame by batch of 100 images.
                if df is None:
                    df = pd.DataFrame(image_list).astype('int16')
                    del image_list
                    gc.collect()
                    image_list = []
                else :
                    df = pd.concat([df , pd.DataFrame(image_list).astype('int16')])
                    del image_list
                    gc.collect()
                    image_list = []
                    if i == nbr_images :    
                        del image_list
                        gc.collect()
                        break
        j=j+1
        
    if status == 'neg' :
        df['output'] = 0
        df['output'] = df['output'].astype('int16')
        gc.collect()
    else :
        df['output'] = 1
        df['output'] = df['output'].astype('int16')
        gc.collect()
        
    return df

In [3]:
# Here i recover an image and I apply the pooling on it in order to recover the final dimensions d1, d2 and d3.

img=mpimg.imread('../input/alaska2-image-steganalysis/Cover/00001.jpg')
test_pool = skimage.measure.block_reduce(img, (2,2,1), np.max)
d1, d2, d3 = test_pool.shape
del test_pool
gc.collect()

0

In [4]:
# I call this function 4 times:

# the first to receive 12,000 images from image 1 of the folder Cover, label them 0 and store them in df_neg.

df_neg = img_reader(nbr_images = 12000, df = None, file_name = 'Cover', from_ = 0, status = 'neg')
print('import Cover Done !')

# Thereafter i recover 4,000 images from JMiPOD starting with image number 1, then 4,000 from JUNIWARD starting from image number 4,000 and 4,000 from UERD starting from image number 8000.

# At the end i have a dataframe 'df_pos' with 4000 images of JMiPOD followed by 4000 images of JUNIWARD followed by 4000 images of UERD which makes 12000 images in total labeled 1.

df_pos = img_reader(nbr_images = 4000, df = None, file_name = 'JMiPOD', from_ = 0, status = 'pos')
print('JMiPOD Done!')
df_pos = img_reader(nbr_images = 4000, df = df_pos, file_name = 'JUNIWARD', from_ = 4000, status = 'pos')
print('JUNIWARD Done!')
df_pos = img_reader(nbr_images = 4000, df = df_pos, file_name = 'UERD', from_ = 8000, status = 'pos')
print('UERD Done!')


 16%|█▌        | 11999/75000 [20:06<1:45:33,  9.95it/s]


import Cover Done !


  5%|▌         | 3999/75000 [06:28<1:54:57, 10.29it/s]


JMiPOD Done!


 11%|█         | 7999/75000 [06:42<56:14, 19.86it/s]


JUNIWARD Done!


 16%|█▌        | 11999/75000 [06:37<34:46, 30.19it/s]

UERD Done!





In [5]:
print('df_neg info :')
display(df_neg.info())

print('df_pos info :')
display(df_pos.info())

df_neg info :
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 999
Columns: 65537 entries, 0 to output
dtypes: int16(65537)
memory usage: 1.5 GB


None

df_pos info :
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 999
Columns: 65537 entries, 0 to output
dtypes: int16(65537)
memory usage: 1.5 GB


None

1.5 GB each, No Break =)

In [6]:
print('df_neg head :')
display(df_neg.head())

print('df_pos head :')
display(df_pos.head())

df_neg head :


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65527,65528,65529,65530,65531,65532,65533,65534,65535,output
0,164,142,129,170,135,132,172,129,140,190,...,157,143,134,152,140,141,142,134,151,0
1,305,309,315,284,282,291,277,287,300,282,...,238,241,257,245,249,278,255,252,283,0
2,340,431,379,378,417,348,381,418,365,332,...,291,332,297,311,329,290,294,356,316,0
3,546,561,560,558,583,589,589,601,616,616,...,379,432,419,423,417,413,433,404,393,0
4,432,585,522,482,604,537,476,586,521,440,...,394,469,384,447,485,393,469,484,379,0


df_pos head :


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65527,65528,65529,65530,65531,65532,65533,65534,65535,output
0,164,142,129,170,134,131,171,129,140,189,...,160,146,134,153,140,141,142,134,151,1
1,305,309,315,284,282,291,277,287,300,283,...,238,241,257,245,249,278,255,252,283,1
2,340,427,375,375,409,341,369,423,369,338,...,285,329,294,308,330,291,295,358,318,1
3,535,556,555,553,589,595,595,601,617,617,...,387,435,422,420,418,415,444,399,387,1
4,435,590,527,487,623,556,495,591,525,445,...,395,469,384,445,486,394,470,489,384,1


Everything Alright

In [7]:
# Lets save them in pkl format for next.

df_neg.to_pickle('df_neg.pkl')
df_pos.to_pickle('df_pos.pkl')

In [8]:
# concatenate all, free up space.

df_train = pd.concat([df_pos, df_neg], ignore_index = True).astype('int16')
del df_pos, df_neg
gc.collect()

20

Now we have 24,000 images we can do what we want with =)

If you have suggestions to improve the importation of images I am interested.

If it helped you make an UpVote it will be chic on your part =)