# Prerequisite: run growth.py

## This generates "ndvi_all_results_growth.csv" (ndvi_3.1 + growth rate), which the "merge datasets" step below reads

# get image brightness

In [1]:
import cv2
import numpy as np
import pandas as pd
import os
from pathlib import Path
from datetime import datetime

def get_image_brightness_cv2(image_path):
    """
    Return the mean grayscale pixel value of the image at image_path.

    The file is loaded with OpenCV in grayscale mode and the mean over all
    pixels is returned. When the image cannot be read, or any exception is
    raised while handling it, a message is printed and None is returned.
    """
    try:
        gray = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if gray is not None:
            # Average over every pixel of the single-channel image
            return np.mean(gray)

        # imread signals an unreadable file by returning None
        print(f"Error: Could not read the image at {image_path}.")
        return None
    except Exception as err:
        print(f" error with {image_path}: {err}")
        return None

def parse_datetime_from_filename(filename, year=None):
    """
    Extract a datetime from a filename of the form DD_MM_HH_MM[_ndvi].<ext>.

    Handles both "DD_MM_HH_MM_ndvi.jpg" and "DD_MM_HH_MM.jpg". Any extra
    underscore-separated parts after the first four are ignored.

    Args:
        filename: File name (not a full path), e.g. "26_08_16_37_ndvi.jpg".
        year: Year to assign to the result, since the filename carries none.
            Defaults to the current year at call time; pass it explicitly to
            get reproducible timestamps when processing older data.

    Returns:
        The parsed datetime, or datetime(1900, 1, 1) when the name does not
        have four leading numeric parts or they do not form a valid
        date/time. The sentinel sorts before every real timestamp.
    """
    unparsed = datetime(1900, 1, 1)

    # Remove file extension and split by underscores
    stem = os.path.splitext(filename)[0]
    parts = stem.split('_')

    # Drop an optional trailing 'ndvi' marker
    if parts[-1].lower() == 'ndvi':
        parts = parts[:-1]

    if len(parts) < 4:
        return unparsed

    try:
        day, month, hour, minute = (int(piece) for piece in parts[:4])
        resolved_year = datetime.now().year if year is None else year
        return datetime(resolved_year, month, day, hour, minute)
    except ValueError:
        # Non-numeric parts or an impossible date/time (e.g. 31_02)
        return unparsed

def batch_process_all_directories(base_directory="."):
    """
    Collect brightness and metadata for every image found under the
    'ba_*_organized' subdirectories of base_directory.

    Only date folders named Aug_26 through Sep_14 are scanned. One row is
    recorded per image whose brightness could be computed. The rows are
    sorted by camera and datetime, written to
    'image_brightness_and_metadata.csv' inside base_directory, and returned
    as a DataFrame. When no row was collected, nothing is written and an
    empty DataFrame is returned.
    """
    root = Path(base_directory)

    # Date folders that are eligible for processing
    accepted_dates = (
        'Aug_26', 'Aug_27', 'Aug_28', 'Aug_29', 'Aug_30', 'Aug_31',
        'Sep_01', 'Sep_02', 'Sep_03', 'Sep_04', 'Sep_05', 'Sep_06',
        'Sep_07', 'Sep_08', 'Sep_09', 'Sep_10', 'Sep_11', 'Sep_12', 'Sep_13', 'Sep_14',
    )
    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    # Directory-derived camera names that are stored under a different name in the CSV
    renamed_cameras = {'ba_1_1': 'ba_3', 'ba_2_2': 'ba_4'}

    rows = []

    for camera_dir in sorted(root.glob("ba_*_organized")):
        if not camera_dir.is_dir():
            continue

        # Camera name is the directory name without its last '_'-separated part
        source_name = camera_dir.name.rsplit('_', 1)[0]
        camera_label = renamed_cameras.get(source_name, source_name)

        print(f"\n--- Processing Directory: {camera_dir.name} (Saving as Camera: {camera_label}) ---")

        for day_dir in camera_dir.iterdir():
            if not day_dir.is_dir() or day_dir.name not in accepted_dates:
                continue

            print(f"  Processing Date: {day_dir.name}")

            try:
                pictures = [
                    entry for entry in day_dir.iterdir()
                    if entry.suffix.lower() in image_suffixes
                ]

                if not pictures:
                    print(f"    No image files found in {day_dir}")
                    continue

                for picture in pictures:
                    level = get_image_brightness_cv2(picture)
                    stamp = parse_datetime_from_filename(picture.name)

                    # Images whose brightness could not be computed are left out
                    if level is None:
                        continue

                    rows.append({
                        'full_path': str(picture),
                        'camera': camera_label,
                        'date_dir': day_dir.name,
                        'image': picture.name,
                        'brightness': level,
                        'datetime': stamp
                    })

            except Exception as err:
                print(f"Error processing {day_dir}: {err}")

    if not rows:
        print("No results to save")
        return pd.DataFrame()

    # Order by camera, then chronologically, before writing the CSV
    table = pd.DataFrame(rows).sort_values(by=['camera', 'datetime'])
    destination = root / "image_brightness_and_metadata.csv"
    table.to_csv(destination, index=False)
    print(f"\nSaved results to {destination}")
    return table

# Entry point for this cell: scan the chosen base directory and keep the
# resulting table in df_results.
if __name__ == "__main__":
    # You can specify the base directory here.
    # For example, use "." to start from the current directory.
    df_results = batch_process_all_directories(base_directory=".")
    print("\nProcessing complete.")




--- Processing Directory: ba_1_1_organized (Saving as Camera: ba_3) ---
  Processing Date: Sep_10
  Processing Date: Sep_14
  Processing Date: Sep_12
  Processing Date: Sep_13
  Processing Date: Sep_04
    No image files found in ba_1_1_organized/Sep_04
  Processing Date: Sep_07
  Processing Date: Sep_11
  Processing Date: Sep_09
  Processing Date: Sep_05
  Processing Date: Sep_08
  Processing Date: Sep_06

--- Processing Directory: ba_1_organized (Saving as Camera: ba_1) ---
  Processing Date: Aug_31
  Processing Date: Sep_03
  Processing Date: Aug_30
  Processing Date: Sep_04
  Processing Date: Aug_27
  Processing Date: Aug_29
  Processing Date: Sep_01
  Processing Date: Sep_02
  Processing Date: Aug_26
  Processing Date: Aug_28

--- Processing Directory: ba_2_2_organized (Saving as Camera: ba_4) ---
  Processing Date: Sep_10
  Processing Date: Sep_14
  Processing Date: Sep_12
  Processing Date: Sep_13
  Processing Date: Sep_04
    No image files found in ba_2_2_organized/Sep_04
  P

# merge datasets

In [5]:
import numpy as np
import pandas as pd
# Load the NDVI/growth results and the per-image brightness table
df_ndvi = pd.read_csv('ndvi_all_results_growth.csv')
df_light = pd.read_csv('image_brightness_and_metadata.csv')

# The brightness table's own 'datetime' column is removed before merging
df_light = df_light.drop(columns=['datetime'])

# Join on camera + image name (default join type of merge)
df_merged = df_ndvi.merge(df_light, on=['camera', 'image'])

# Remove the path/folder columns that came from the brightness table
df_merged = df_merged.drop(columns=['date_dir', 'full_path'])

df_merged.to_csv('ndvi_light_growth_b.csv', index=False)

# filter datasets

In [6]:
import pandas as pd

def filter_all_cameras(input_filename="ndvi_light_growth_b.csv",
                         output_filename="ndvi_light_growth_filtered_all_b.csv",
                         threshold=0.0):
    """
    Keep only the rows of a CSV whose 'mean' value is at least threshold.

    The surviving rows are sorted by 'datetime' when that column exists,
    written to output_filename, and returned. If input_filename does not
    exist, a message is printed and an empty DataFrame is returned without
    writing anything.
    """
    # Read the full table; a missing input file ends the run early
    try:
        records = pd.read_csv(input_filename)
    except FileNotFoundError:
        print(f"Error: Input file '{input_filename}' not found.")
        return pd.DataFrame()
    print(f"Read {len(records)} total records from {input_filename}")

    # The threshold is applied to every row, regardless of camera
    meets_threshold = records['mean'] >= threshold
    kept = records.loc[meets_threshold].copy()
    print(f"After applying threshold >= {threshold}, {len(kept)} records remain.")

    # Chronological order, when timestamps are available
    if 'datetime' in kept.columns:
        kept['datetime'] = pd.to_datetime(kept['datetime'])
        kept = kept.sort_values('datetime')

    kept.to_csv(output_filename, index=False)
    print(f"Saved {len(kept)} records to {output_filename}. Filtering was applied to all cameras.")

    return kept

# Entry point for this cell: run the threshold filter with its default
# file names and show the start of the result.
if __name__ == "__main__":
    # This will run the function with the default settings
    # It will filter the entire dataset, keeping rows where the 'mean' NDVI is >= 0.0
    filtered_data = filter_all_cameras()
    
    print("\n--- First 5 rows of the final, filtered dataset ---")
    print(filtered_data.head())

Read 6494 total records from ndvi_light_growth_b.csv
After applying threshold >= 0.0, 5205 records remain.
Saved 5205 records to ndvi_light_growth_filtered_all_b.csv. Filtering was applied to all cameras.

--- First 5 rows of the final, filtered dataset ---
       mean    median       std       min       max  growth_metric  \
0  0.890559  0.999999  0.312189  0.000000  0.999999       324164.0   
1  0.416240  0.000000  0.497336 -0.999995  0.999999       131599.0   
2  0.835001  0.999999  0.371177  0.000000  0.999999       303941.0   
3  0.347919  0.000000  0.591006 -0.999991  0.999999       129489.0   
4  0.831821  0.999999  0.374021  0.000000  0.999999       302784.0   

             image camera date_folder            datetime  brightness  
0  26_08_16_37.jpg   ba_2      Aug_26 2025-08-26 16:37:00   33.952615  
1  26_08_16_37.jpg   ba_1      Aug_26 2025-08-26 16:37:00    8.686000  
2  26_08_16_40.jpg   ba_2      Aug_26 2025-08-26 16:40:00   31.128231  
3  26_08_16_40.jpg   ba_1      Au

In [7]:
import pandas as pd

def filter_by_rolling_zscore(input_filename="ndvi_light_growth_filtered_all_b.csv",
                               output="ndvi_light_growth_f_b.csv",
                               column_to_check='mean',
                               window_size=6, # The number of data points in the moving window
                               threshold=2):
    """
    Remove rows whose value is an outlier relative to a trailing rolling window.

    The CSV is sorted by 'datetime'. For each row, the mean and standard
    deviation of column_to_check over the last window_size rows (including
    the row itself) give a z-score; rows with |z| > threshold are removed.

    When the window's standard deviation is undefined (a single sample) or
    zero (all values in the window identical), the z-score is taken as 0 so
    the row is kept. Rows whose value is itself missing are still removed.

    Args:
        input_filename: CSV to read; must contain 'datetime' and column_to_check.
        output: CSV path the filtered rows are written to, with 'datetime'
            as a regular column.
        column_to_check: Name of the numeric column to screen.
        window_size: Number of rows in the rolling window.
        threshold: Maximum absolute z-score that is kept.

    Returns:
        The filtered DataFrame, indexed by 'datetime'. Errors raised by
        pandas while reading the input are not caught.
    """
    df = pd.read_csv(input_filename)
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').set_index('datetime')

    # NOTE(review): the window runs over all rows in datetime order. If the
    # file holds several cameras, their readings share one window — confirm
    # that this is intended rather than a per-camera window.
    values = df[column_to_check]
    window = values.rolling(window=window_size, min_periods=1)
    rolling_mean = window.mean()
    rolling_std = window.std()

    deviation = values - rolling_mean

    # Only a strictly positive spread yields a meaningful z-score. A NaN
    # spread (one sample) or a zero spread (constant window) would otherwise
    # produce NaN via 0/0 and the row would be wrongly dropped as an outlier.
    usable_std = rolling_std.where(rolling_std > 0)
    z_score = deviation / usable_std
    no_spread = usable_std.isna() & deviation.notna()
    z_score = z_score.mask(no_spread, 0.0)

    # Filter out the outliers (positional mask: the datetime index may repeat)
    original_count = len(df)
    within_limit = (z_score.abs() <= threshold).to_numpy()
    filtered_df = df[within_limit].copy()

    print(f"{original_count - len(filtered_df)} outliers removed. Rolling Z-score threshold: {threshold}.")
    print(f"{len(filtered_df)} data points remaining.")

    filtered_df.reset_index().to_csv(output, index=False)
    return filtered_df

# Entry point for this cell: run the rolling z-score filter with its defaults
# and show the start of the result.
if __name__ == "__main__":
    cleaned_data = filter_by_rolling_zscore()
    # Defensive check: every non-raising path of filter_by_rolling_zscore
    # visible in this file returns a DataFrame.
    if cleaned_data is not None:
        print(cleaned_data.head())

129 outliers removed. Rolling Z-score threshold: 2.
5076 data points remaining.
                         mean    median       std       min       max  \
datetime                                                                
2025-08-26 16:37:00  0.890559  0.999999  0.312189  0.000000  0.999999   
2025-08-26 16:37:00  0.416240  0.000000  0.497336 -0.999995  0.999999   
2025-08-26 16:40:00  0.835001  0.999999  0.371177  0.000000  0.999999   
2025-08-26 16:40:00  0.347919  0.000000  0.591006 -0.999991  0.999999   
2025-08-26 16:45:00  0.831821  0.999999  0.374021  0.000000  0.999999   

                     growth_metric            image camera date_folder  \
datetime                                                                 
2025-08-26 16:37:00       324164.0  26_08_16_37.jpg   ba_2      Aug_26   
2025-08-26 16:37:00       131599.0  26_08_16_37.jpg   ba_1      Aug_26   
2025-08-26 16:40:00       303941.0  26_08_16_40.jpg   ba_2      Aug_26   
2025-08-26 16:40:00       129489.0  26