In [None]:
import os
import numpy as np
import pandas as pd
from osgeo import gdal

def read_image(file_path):
    """Read a .tif image and return its data as a numpy array."""
    ds = gdal.Open(file_path, gdal.GA_ReadOnly)
    img_data = []
    for band in range(ds.RasterCount):
        band_data = ds.GetRasterBand(band + 1).ReadAsArray()
        img_data.append(band_data)
    return np.dstack(img_data)

# def read_image(file_path):
#     """Read a .tif image and return its data as a numpy array."""
#     ds = gdal.Open(file_path, gdal.GA_ReadOnly)
#     if ds.RasterCount < 4:
#         raise ValueError(f"Image at {file_path} does not have 4 channels.")
#     img_data = []
#     for band in range(1, 5):  # Reading only the first 4 bands
#         band_data = ds.GetRasterBand(band).ReadAsArray()
#         img_data.append(band_data)
#     return np.dstack(img_data)

def process_images(folder_path, batch_size=10):
    """Process all images in the given folder and return a DataFrame."""
    batch_data = []
    batch_count = 0
    data = []
    for class_label in os.listdir(folder_path):
        class_folder = os.path.join(folder_path, class_label)
        print(f"Processing class: {class_label} in folder: {class_folder}")
#         print(os.path.isdir)
#         print(os.listdir(class_folder))
        try:
             if os.path.isdir(class_folder) and class_label in ['0','1']:
#             print("Class directory found")
                try:
                    for image_file in os.listdir(class_folder):
                        print("Inside the listdir- ", image_file)
                        if image_file.endswith('.jpg'):
                            print("Image is valid jpg")
                            image_path = os.path.join(class_folder, image_file)
                            print(f"Reading image: {image_path}")
                            image_data = read_image(image_path)
                            print("image data size - ", len(image_data))
                            if image_data.size > 0:  # Check if image data is not empty
                        # Flatten the image data and append class label
                                flat_data = image_data.reshape(-1, image_data.shape[2])
                                flat_data_with_label = np.hstack([flat_data, np.full((flat_data.shape[0], 1), class_label)])
                                batch_data.extend(flat_data_with_label)
                                print("batch data - ", len(batch_data))
#                                 print("batch size - ", len(batch_size))
                            else:
                                print(f"Warning: Empty image at {image_path}")
                            if len(batch_data) >= batch_size:
                                yield pd.DataFrame(batch_data)
                                batch_data = []  # Reset batch
                                batch_count += 1
                                print(f"Yielded a batch of size {batch_size}")
                except Exception as e:
                    print(f"Error processing directory {class_folder}: {e}")
        except Exception as e:
            print(f"Error processing directory {class_folder}: {e}")
    if batch_data:
        yield pd.DataFrame(batch_data)
        print(f"Yielded the last batch")
#     return pd.DataFrame(data)
    
# Set the folder path and process the images
# folder_path = './test/0'
folder_path = '/Users/c.himadry/Downloads/NAIP/test'
# image_data_df = process_images(folder_path)

# # Define column names (adjust according to the number of bands in your images)
# column_names = [f'Band_{i}' for i in range(image_data_df.shape[1] - 1)] + ['Class_Label']
# image_data_df.columns = column_names

# # Save the data to a CSV file
# image_data_df.to_csv('image_data.csv', index=False)
# csv_file = 'image_data.csv'
# Process images and save in batches
for i, df_batch in enumerate(process_images(folder_path)):
    if df_batch.empty:
        print(f"Batch {i} is empty.")
        continue
    if i == 0:
        # Write the first batch with headers
        print(df_batch.head())
        df_batch.to_csv('/Users/c.himadry/Downloads/NAIP/image_data.csv', mode='w', index=False)
    else:
        # Append subsequent batches without headers
        print(df_batch.head())
        df_batch.to_csv('/Users/c.himadry/Downloads/NAIP/image_data.csv', mode='a', index=False, header=False)
#         print(df_batch)

    print(f"Processed batch {i+1}")

Processing class: .DS_Store in folder: /Users/c.himadry/Downloads/NAIP/test/.DS_Store
Processing class: 0 in folder: /Users/c.himadry/Downloads/NAIP/test/0
Inside the listdir-  298911_ortho_1-1_1n_s_ia013_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/298911_ortho_1-1_1n_s_ia013_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  95  109  74  0
1  93  109  73  0
2  95  110  77  0
3  97  112  81  0
4  92  107  76  0
Processed batch 1
Yielded a batch of size 10
Inside the listdir-  55335_ortho_1-1_1n_s_il011_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/55335_ortho_1-1_1n_s_il011_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  73  94  75  0
1  75  96  77  0
2  78  99  80  0
3  77  98  79  0
4  73  94  75  0
Processed batch 2
Yielded a batch of size 10
Inside the listdir-  113755_ortho_1-1_1n_s_il045_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downl

batch data -  57600
    0   1   2  3
0  66  88  65  0
1  63  85  62  0
2  68  90  67  0
3  48  70  47  0
4  66  88  67  0
Processed batch 24
Yielded a batch of size 10
Inside the listdir-  325130_ortho_1-1_1n_s_ia023_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/325130_ortho_1-1_1n_s_ia023_2017_1.jpg
image data size -  240
batch data -  57600
     0    1   2  3
0  134  120  94  0
1  132  120  94  0
2  125  115  88  0
3  113  106  78  0
4  105  101  72  0
Processed batch 25
Yielded a batch of size 10
Inside the listdir-  121940_ortho_1-1_1n_s_il053_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/121940_ortho_1-1_1n_s_il053_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  76  93  74  0
1  77  94  75  0
2  78  95  76  0
3  77  94  75  0
4  75  92  73  0
Processed batch 26
Yielded a batch of size 10
Inside the listdir-  345708_ortho_1-1_1n_s_ia031_2017_1.jpg
Image is valid jpg
Reading image: 

batch data -  57600
    0    1   2  3
0  95  122  89  0
1  77  104  71  0
2  84  109  77  0
3  97  120  91  0
4  94  117  88  0
Processed batch 48
Yielded a batch of size 10
Inside the listdir-  79301_ortho_1-1_1n_s_il021_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/79301_ortho_1-1_1n_s_il021_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  47  74  57  0
1  48  75  58  0
2  50  77  60  0
3  52  79  62  0
4  52  79  62  0
Processed batch 49
Yielded a batch of size 10
Inside the listdir-  128620_ortho_1-1_1n_s_il059_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/128620_ortho_1-1_1n_s_il059_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  55  78  62  0
1  55  78  62  0
2  55  78  62  0
3  55  78  62  0
4  56  79  63  0
Processed batch 50
Yielded a batch of size 10
Inside the listdir-  418744_ortho_1-1_1n_s_ia079_2017_1.jpg
Image is valid jpg
Reading image: /Users/c

    0   1   2  3
0  42  64  43  0
1  61  83  62  0
2  54  76  55  0
3  55  77  56  0
4  67  89  68  0
Processed batch 72
Yielded a batch of size 10
Inside the listdir-  201217_ortho_1-1_1n_s_il107_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/201217_ortho_1-1_1n_s_il107_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  74  96  73  0
1  65  87  64  0
2  58  80  57  0
3  60  82  59  0
4  69  91  68  0
Processed batch 73
Yielded a batch of size 10
Inside the listdir-  549617_ortho_1-1_1n_s_ia187_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/549617_ortho_1-1_1n_s_ia187_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  69   98  67  0
1  71  100  69  0
2  84  113  83  0
3  73  102  72  0
4  71  100  70  0
Processed batch 74
Yielded a batch of size 10
Inside the listdir-  236553_ortho_1-1_1n_s_il135_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads

    0    1   2  3
0  71   96  67  0
1  73   98  69  0
2  75  100  71  0
3  76  101  72  0
4  77  102  73  0
Processed batch 96
Yielded a batch of size 10
Inside the listdir-  71949_ortho_1-1_1n_s_il019_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/71949_ortho_1-1_1n_s_il019_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  54  76  55  0
1  60  82  61  0
2  70  92  71  0
3  76  98  77  0
4  70  92  71  0
Processed batch 97
Yielded a batch of size 10
Inside the listdir-  505736_ortho_1-1_1n_s_ia149_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/505736_ortho_1-1_1n_s_ia149_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  67  85  63  0
1  80  98  76  0
2  78  96  74  0
3  69  87  65  0
4  70  88  66  0
Processed batch 98
Yielded a batch of size 10
Inside the listdir-  215408_ortho_1-1_1n_s_il115_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/N

    0    1   2  3
0  82   97  74  0
1  77   92  69  0
2  83   98  75  0
3  86  101  78  0
4  80   95  72  0
Processed batch 120
Yielded a batch of size 10
Inside the listdir-  263626_ortho_1-1_1n_s_il175_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/263626_ortho_1-1_1n_s_il175_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  74  99  69  0
1  71  96  66  0
2  67  92  62  0
3  66  91  61  0
4  68  93  63  0
Processed batch 121
Yielded a batch of size 10
Inside the listdir-  127800_ortho_1-1_1n_s_il059_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/127800_ortho_1-1_1n_s_il059_2017_1.jpg
image data size -  240
batch data -  57600
    0   1   2  3
0  65  83  67  0
1  58  76  60  0
2  56  74  58  0
3  60  78  62  0
4  63  81  65  0
Processed batch 122
Yielded a batch of size 10
Inside the listdir-  326843_ortho_1-1_1n_s_ia023_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downlo

batch data -  57600
    0    1   2  3
0  82  100  74  0
1  83  101  75  0
2  78   96  70  0
3  71   89  63  0
4  76   94  68  0
Processed batch 144
Yielded a batch of size 10
Inside the listdir-  362761_ortho_1-1_1n_s_ia041_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/362761_ortho_1-1_1n_s_ia041_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  86  108  70  0
1  89  111  73  0
2  90  112  74  0
3  88  110  71  0
4  89  111  72  0
Processed batch 145
Yielded a batch of size 10
Inside the listdir-  372854_ortho_1-1_1n_s_ia047_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/372854_ortho_1-1_1n_s_ia047_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  66   87  68  0
1  64   85  66  0
2  74   95  76  0
3  88  109  90  0
4  84  105  86  0
Processed batch 146
Yielded a batch of size 10
Inside the listdir-  197986_ortho_1-1_1n_s_il107_2017_1.jpg
Image is valid jpg
Readin

batch data -  57600
     0    1   2  3
0   80   98  74  0
1   61   79  55  0
2   74   92  68  0
3  103  121  97  0
4   97  115  91  0
Processed batch 168
Yielded a batch of size 10
Inside the listdir-  466865_ortho_1-1_1n_s_ia113_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/466865_ortho_1-1_1n_s_ia113_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  74  100  75  0
1  74  100  75  0
2  74  100  75  0
3  75  101  76  0
4  75  101  76  0
Processed batch 169
Yielded a batch of size 10
Inside the listdir-  472401_ortho_1-1_1n_s_ia119_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/472401_ortho_1-1_1n_s_ia119_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  91  118  85  0
1  86  113  80  0
2  78  105  72  0
3  81  108  75  0
4  90  117  84  0
Processed batch 170
Yielded a batch of size 10
Inside the listdir-  83554_ortho_1-1_1n_s_il021_2017_1.jpg
Image is valid jpg
R

    0    1   2  3
0  65   97  60  0
1  68  100  63  0
2  72  104  67  0
3  74  106  69  0
4  75  107  70  0
Processed batch 191
Yielded a batch of size 10
Inside the listdir-  211654_ortho_1-1_1n_s_il113_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/211654_ortho_1-1_1n_s_il113_2017_1.jpg
image data size -  240
batch data -  57600
    0    1   2  3
0  77  102  73  0
1  72   97  68  0
2  66   91  62  0
3  82  107  78  0
4  62   87  58  0
Processed batch 192
Yielded a batch of size 10
Inside the listdir-  102336_ortho_1-1_1n_s_il037_2017_1.jpg
Image is valid jpg
Reading image: /Users/c.himadry/Downloads/NAIP/test/0/102336_ortho_1-1_1n_s_il037_2017_1.jpg
image data size -  240
batch data -  57600
     0    1    2  3
0  102  120   96  0
1  110  128  104  0
2  117  135  111  0
3  119  137  113  0
4  121  139  115  0
Processed batch 193
Yielded a batch of size 10
Inside the listdir-  333078_ortho_1-1_1n_s_ia025_2017_1.jpg
Image is valid jpg
Reading image:

In [1]:
import pandas as pd

input_csv_path = '/Users/c.himadry/Desktop/crop_classification/dataset/image_data.csv'  # Path to your original large CSV file
output_csv_path = '/Users/c.himadry/Downloads/NAIP/image_data_processed.csv'  # Path for the new deduplicated CSV file

chunk_size = 10000  # Adjust this depending on your memory capacity
columns = None  # For storing column names

# Create an iterator to read the CSV file in chunks
reader = pd.read_csv(input_csv_path, chunksize=chunk_size)

# Process each chunk
for i, chunk in enumerate(reader):
    print(f"Processing chunk {i+1}")

    # Store column names from the first chunk
    if columns is None:
        columns = chunk.columns

    # Remove duplicate rows
    deduplicated_chunk = chunk.drop_duplicates()

    # Write the deduplicated chunk to a new CSV file
    if i == 0:
        # Write the first chunk with headers
        deduplicated_chunk.to_csv(output_csv_path, mode='w', index=False)
    else:
        # Append subsequent chunks without headers
        deduplicated_chunk.to_csv(output_csv_path, mode='a', index=False, header=False)

    print(f"Processed chunk {i+1}, {len(deduplicated_chunk)} rows written to CSV")

print("Processing completed. Deduplicated data saved to:", output_csv_path)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/c.himadry/Desktop/crop_classification/dataset/image_data.csv'