# Data Preprocessing

In this part, we will follow these steps:
 - 1: Make a list of filenames
 - 2: Resize the images
 - 3: Create a DataFrame, then save it as a CSV file

Let's start!

We must first import the necessary libraries.

In [None]:
# Libraries
import os
import csv
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [None]:
def filelist(root,file_type):
    """Collect the paths of all files below *root* whose name ends with *file_type*.

    The directory tree is walked recursively with ``os.walk``; each match is
    returned as the containing directory joined with the file name, in the
    order the walk yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(root)
        for name in names
        if name.endswith(file_type)
    ]

In [None]:
def generate_train_df(anno_path):
    """Build a DataFrame from the ``.txt`` annotation files under *anno_path*.

    Each annotation file is read as whitespace-separated fields in the order
    ``filename class xmin ymin xmax ymax``; only the first six fields of a
    file are used, so a file contributes at most one row.

    A file is skipped (no row is produced) when it
      * holds fewer than six fields (for example an empty file),
      * has a coordinate that is not an integer, or
      * has a negative coordinate.
    The first two cases print a ``[WARN]`` line naming the file.

    Args:
        anno_path: Root directory searched recursively for ``.txt`` files.

    Returns:
        pandas.DataFrame with the columns
        ``filename, class, xmin, ymin, xmax, ymax``. The columns are present
        even when no annotation was kept, so callers can always index them.
    """
    columns = ["filename", "class", "xmin", "ymin", "xmax", "ymax"]
    anno_list = []
    for path in filelist(anno_path, ".txt"):
        with open(path) as f:
            infos = f.read().split()

        # Guard against empty/truncated files instead of failing with IndexError.
        if len(infos) < 6:
            print("[WARN] skipping malformed annotation file:", path)
            continue
        try:
            xmin, ymin, xmax, ymax = (int(value) for value in infos[2:6])
        except ValueError:
            print("[WARN] skipping annotation file with non-integer coordinates:", path)
            continue

        # Negative coordinates mark an unusable box; such annotations are ignored.
        if min(xmin, ymin, xmax, ymax) < 0:
            continue

        anno_list.append({
            'filename': infos[0],
            'class': infos[1],
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
        })
    # Passing the column names keeps the schema stable for an empty result.
    return pd.DataFrame(anno_list, columns=columns)
            

In [None]:
def resize_image_bb(images_path,img_name,write_path,bb,size):
    """Resize one image to ``size`` x ``size``, save it, and normalise its box.

    Args:
        images_path: Directory containing the source image.
        img_name: Name of the image file inside *images_path*; the resized
            copy is written under the same name inside *write_path*.
        write_path: Directory that receives the resized image.
        bb: Bounding box as four numbers indexed ``[x0, y0, x1, y1]`` in
            pixels of the original image (entries 0 and 2 are divided by the
            image width, entries 1 and 3 by the image height).
        size: Edge length, in pixels, of the square output image.

    Returns:
        ``(new_path, bb)`` where ``new_path`` is the path of the written
        image and ``bb`` is a list of four plain floats: the input box
        divided by the original image width/height.

    Raises:
        FileNotFoundError: if the source image cannot be read.
        OSError: if the resized image cannot be written.
    """
    src_path = os.path.join(images_path, img_name)
    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError("cannot read image: {}".format(src_path))

    # shape is (rows, cols) or (rows, cols, channels); take only the first two
    # so that colour images do not break the unpacking.
    h, w = img.shape[:2]
    img_resized = cv2.resize(img, (size, size))

    # The resized image is saved to disk (rather than kept in memory).
    # imread returns BGR data and imwrite expects BGR data, so the array is
    # written as-is; converting channels here would swap red and blue.
    new_path = os.path.join(write_path, img_name)
    if not cv2.imwrite(new_path, img_resized):
        raise OSError("cannot write image: {}".format(new_path))

    # Plain floats keep the values readable once the list is stored in a CSV.
    bb = [float(bb[0]) / w, float(bb[1]) / h, float(bb[2]) / w, float(bb[3]) / h]
    return new_path, bb

In [None]:
def bb_to_array(x):
    """Return entries 3, 2, 5 and 4 of *x*, in that order, as a NumPy array.

    For a row laid out as ``filename, class, xmin, ymin, xmax, ymax`` (the
    columns listed by ``generate_train_df``) the result is
    ``[ymin, xmin, ymax, xmax]``.
    """
    picked = [x[position] for position in (3, 2, 5, 4)]
    return np.array(picked)

In [None]:
def to_csvFile(ANNOTS_PATH,IMAGES_PATH,WRITE_NEW_IMAGES_PATH,CLASSES_DICT,IMAGE_SIZE):
    """Resize every annotated image and write the dataset to ``data_CSVFile.csv``.

    Steps:
      1. Load the annotations with ``generate_train_df(ANNOTS_PATH)``.
      2. Replace each class label by ``CLASSES_DICT[label]``.
      3. For each row, call ``resize_image_bb`` to save a resized copy of the
         image in ``WRITE_NEW_IMAGES_PATH`` and get the normalised box.
      4. Keep only the rows whose image was processed, add the columns
         ``new_path`` and ``new_bb``, and save the frame as CSV in the
         current directory.

    Args:
        ANNOTS_PATH: Root directory of the ``.txt`` annotation files.
        IMAGES_PATH: Directory containing the source images.
        WRITE_NEW_IMAGES_PATH: Directory that receives the resized images.
        CLASSES_DICT: Mapping from the class label found in the annotations
            to the value stored in the ``class`` column.
        IMAGE_SIZE: Edge length, in pixels, of the resized square images.

    Returns:
        The name of the CSV file that was written (``"data_CSVFile.csv"``).

    Raises:
        KeyError: if an annotation uses a label missing from ``CLASSES_DICT``.
    """
    csv_name = "data_CSVFile.csv"

    df_train = generate_train_df(ANNOTS_PATH)
    df_train['class'] = df_train['class'].apply(lambda x: CLASSES_DICT[x])

    kept_rows = []
    new_paths = []
    new_bbs = []
    for index, row in df_train.iterrows():
        # resize_image_bb divides entries 0/2 by the image width and 1/3 by
        # the image height, so the box is passed as (xmin, ymin, xmax, ymax).
        box = [row["xmin"], row["ymin"], row["xmax"], row["ymax"]]
        try:
            new_path, new_bb = resize_image_bb(
                IMAGES_PATH, row["filename"], WRITE_NEW_IMAGES_PATH, box, IMAGE_SIZE
            )
        except (OSError, ValueError, AttributeError, cv2.error) as err:
            # A missing or unreadable image must not abort the whole run:
            # report it and leave the row out. Other errors (e.g. a typo in
            # this module) are deliberately not caught so they stay visible.
            print("[WARN] skipping {}: {}".format(row["filename"], err))
            continue
        kept_rows.append(index)
        new_paths.append(new_path)
        new_bbs.append(new_bb)

    # Selecting by the recorded labels guarantees the frame and the two new
    # columns have the same length and order.
    df_train = df_train.loc[kept_rows].copy()
    df_train["new_path"] = new_paths
    df_train["new_bb"] = new_bbs

    df_train.to_csv(csv_name, index=False)
    print("[INFO] data_CSVFile.csv was created in the current directory")
    return csv_name

Now we can use these functions to get our CSV file, which contains:
    -filename: image name
    -class: category of the object in the image
    -the original bounding box (before resizing the image):
      xmin,ymin,xmax,ymax
    -new_path: where the resized image was saved
    -new_bb: a list giving the bounding box relative to the original dimensions of the image
     (for example, xmin / width of the image)
    

This CSV file will help us in the second phase of the project!