In [None]:
# Mount Google Drive at /content/drive so the dataset folders used below are reachable.
# The google.colab package is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import glob
import pandas as pd

In [None]:
# Folders holding the per-year segmentation images that get merged together.
# Edit this list to point at the actual source folders.
source_folders = [
    '/content/drive/MyDrive/Image Dataset Curation/Segmentation/hourly-2021',
    '/content/drive/MyDrive/Image Dataset Curation/Segmentation/2022',
    '/content/drive/MyDrive/Image Dataset Curation/Segmentation/2023hourly_new',
]

# Single flat folder that receives a copy of every image from the folders above.
destination_folder = '/content/drive/MyDrive/Image Dataset Curation/Final_Images'

In [None]:
# Create the destination folder (and any missing parent folders) if needed.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test; it still raises FileExistsError when the
# path exists but is not a directory, instead of silently continuing.
os.makedirs(destination_folder, exist_ok=True)

In [None]:
# Copy every image from each source folder into the single destination folder.
# - Extensions are matched case-insensitively so files such as "IMG.JPG" are
#   not skipped.
# - Entries that are not regular files (e.g. a sub-folder whose name ends in an
#   image extension) are ignored rather than crashing shutil.copy.
# - All sources share one flat destination, so identical filenames coming from
#   different source folders overwrite each other; every such collision is
#   reported so the loss is visible.
image_extensions = ('.jpg', '.jpeg', '.png')  # Add more file extensions if needed
copied_from = {}  # filename -> source folder it was most recently copied from
for source_folder in source_folders:
    for filename in sorted(os.listdir(source_folder)):
        source_path = os.path.join(source_folder, filename)
        if not filename.lower().endswith(image_extensions):
            continue
        if not os.path.isfile(source_path):
            continue
        if filename in copied_from:
            print(
                f"Warning: '{filename}' from '{source_folder}' overwrites "
                f"the copy taken from '{copied_from[filename]}'"
            )
        copied_from[filename] = source_folder
        destination_path = os.path.join(destination_folder, filename)
        shutil.copy(source_path, destination_path)

In [None]:
# Collect the paths of the merged images in the destination folder.
# The copy step accepts .jpg, .jpeg and .png files, so the same extensions are
# matched here; a '*.jpg'-only glob silently leaves the other copied images out
# of the dataset. Matching is case-insensitive, and the result is sorted because
# glob returns paths in arbitrary order.
image_extensions = ('.jpg', '.jpeg', '.png')
image_paths = sorted(
    path
    for path in glob.glob(os.path.join(destination_folder, '*'))
    if path.lower().endswith(image_extensions)
)

In [None]:
# Create a DataFrame with a single 'image_path' column, one row per image.
# NOTE(review): the following cell rebuilds `df` from `image_paths` with an
# additional 'filename' column, so this assignment is overwritten before use —
# candidate for removal.
df = pd.DataFrame({'image_path': image_paths})

In [None]:
# Build the working frame: one row per image holding its full path and its
# bare file name (the timestamp columns are derived from the file name later).
image_filenames = list(map(os.path.basename, image_paths))
df = pd.DataFrame({'image_path': image_paths, 'filename': image_filenames})

In [None]:
# Extract year, month, day-of-month and time (HHMMSS) from the filename as strings.
# The filename is expected to contain a 14-digit YYYYMMDDHHMMSS stamp directly
# followed by an underscore. One pattern with named groups is used so that all
# four parts always come from the same match and the digit counts cannot drift
# apart (the former time pattern skipped 7 digits instead of 8 and only gave
# the right answer because the regex search is unanchored).
# Rows whose filename has no such stamp get NaN in all four columns.
timestamp_pattern = r'(?P<year>\d{4})(?P<month>\d{2})(?P<date>\d{2})(?P<time>\d{6})_'
timestamp_parts = df['filename'].str.extract(timestamp_pattern)
for part in ('year', 'month', 'date', 'time'):
    df[part] = timestamp_parts[part]

In [None]:
# Glue the string parts back into one YYYYMMDDHHMMSS value per row and parse it
# into a proper timestamp column.
timestamp_text = df['year'] + df['month'] + df['date'] + df['time']
df['datetime'] = pd.to_datetime(timestamp_text, format='%Y%m%d%H%M%S')

In [None]:
# Order the images chronologically by their parsed timestamp.
df = df.sort_values(by='datetime')

In [None]:
# Replace the shuffled index left by sorting with a fresh 0..n-1 index; the old
# index values are discarded rather than kept as a column.
df = df.reset_index(drop=True)

In [None]:
# Numerical dataset: load the meteorological measurements from Drive.
meteorological_csv = "/content/drive/MyDrive/Numerical Dataset Curation/Meteorological_Data.csv"
df2 = pd.read_csv(meteorological_csv)

In [None]:
# Parse the 'DATETIME' column into pandas timestamps so it can be matched
# against the image timestamps.
df2 = df2.assign(DATETIME=pd.to_datetime(df2['DATETIME']))

In [None]:
# Give the timestamp column the same name ('datetime') as in the image frame so
# both frames share a join key.
df2 = df2.rename(columns={'DATETIME': 'datetime'})

In [None]:
# Inner-join images with measurements on 'datetime': only timestamps present in
# both frames survive.
Image_Dataset = pd.merge(df, df2, on='datetime', how='inner')

In [None]:
# Display the column names of the merged frame to pick which ones to keep.
Image_Dataset.columns

Index(['image_path', 'filename', 'year', 'month', 'date', 'time', 'datetime',
       'Avg BRBG Total Cloud Cover [%]', 'Avg CDOC Total Cloud Cover [%]',
       'Avg CDOC Thick Cloud Cover [%]', 'Avg CDOC Thin Cloud Cover [%]',
       'Avg Sun Flag', 'Avg HCF Value', 'Avg Blue/Red_min', 'Avg Blue/Red_mid',
       'Avg Blue/Red_max', 'Avg File Extension',
       'Avg Global CMP22 (vent/cor) [W/m^2]', 'Avg Zenith Angle [degrees]',
       'Avg Azimuth Angle [degrees]', 'Avg Solar Eclipse Shading',
       'Avg Tower Dew Point Temp [deg C]', 'Avg Total Cloud Cover [%]',
       'Avg Opaque Cloud Cover [%]', 'Avg Avg Wind Speed @ 6ft [m/s]',
       'Avg Precipitation [mm]', 'Avg Moisture', 'Avg Albedo (CMP11)'],
      dtype='object')

In [None]:
# Keep only the image path, its timestamp and the global irradiance measurement.
columns_to_keep = [
    'image_path',
    'datetime',
    'Avg Global CMP22 (vent/cor) [W/m^2]',
]
Image_Dataset_Final = Image_Dataset.loc[:, columns_to_keep]

In [None]:
# Write the trimmed dataset to Drive as CSV, without the row index.
final_csv_path = '/content/drive/MyDrive/Image Dataset Curation/Image_Dataset_Final.csv'
Image_Dataset_Final.to_csv(final_csv_path, index=False)

In [None]:
# Write the full merged dataset (all columns) to Drive as CSV, without the row index.
raw_csv_path = '/content/drive/MyDrive/Image Dataset Curation/Raw_Image_Dataset.csv'
Image_Dataset.to_csv(raw_csv_path, index=False)