# **MODIS Data Preprocessing and Organization**

This section covers the process of **calculating monthly weighted averages for MODIS data** and **organizing the output into a structured folder hierarchy**. The purpose of this process is to create a time-series dataset where each file represents the monthly averaged MODIS data for specific environmental variables (e.g., LST Day, LST Night) over a given year.

The workflow consists of the following steps:

1. **Monthly Weighted Average Calculation**  
2. **Organizing MODIS Data into Monthly Folders**  
3. **Renaming Files for Consistency**

In [None]:
import os
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd
import json
import matplotlib.pyplot as plt

from rasterio.mask import mask
from netCDF4 import Dataset
from sklearn.preprocessing import MinMaxScaler
from shapely.geometry import shape, Point
from tqdm import tqdm
from datetime import datetime, timedelta
from rasterio.transform import from_origin, xy
from rasterio.crs import CRS

from pyproj import Transformer
import xarray as xr

import os
import re
from collections import defaultdict

import numpy as np
import rasterio

import os
import re
from datetime import datetime
from collections import defaultdict

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import rasterio


import os
import shutil

## **Step 1: Monthly Weighted Average Calculation**

This step involves generating **monthly weighted average TIF files** for each environmental variable (e.g., LST Day, LST Night) by applying appropriate weights to MODIS raster data.

In [None]:
# Folder that holds the MODIS vegetation (NDVI/EVI) GeoTIFFs.
folder_path = '/home/scsi/Desktop/Sehoon/crop_yield/MODIS/NDVI_EVI/'

# Keep only the .tif entries of that folder.
file_names = list(filter(lambda name: name.endswith('.tif'), os.listdir(folder_path)))

# The single capture group is the four-digit year of the date in the file name.
pattern = r"MODIS_Vegetation_Ukraine_(\d{4})-\d{2}-\d{2}_\d+\.tif"

# Maps year (as a string) -> number of files whose name matches the pattern.
year_count = defaultdict(int)

# Tally one hit per matching file; names that do not match are ignored.
for file_name in file_names:
    match = re.match(pattern, file_name)
    if match is None:
        continue
    year_count[match.group(1)] += 1

# Report the tallies in ascending year order.
for year in sorted(year_count):
    print(f"{year}: {year_count[year]} files")

### 예외처리: 2022-10-16

LST, FPAR 2022-10-16 데이터를 얻을 수 없기 때문에 10-08, 10-24 자료를 평균내어 계산함.

In [None]:
def create_average_tif(file1, file2, output_file):
    """Write the pixel-wise mean of two rasters to a new GeoTIFF.

    Every band of both inputs is averaged and written, so the output holds
    as many populated bands as the metadata copied from ``file1`` declares.
    The mean is computed in float64 and then cast back to the input dtype;
    this prevents integer rasters from wrapping around when two values are
    added.

    Args:
        file1: Path of the first input raster. Its metadata is reused for
            the output file.
        file2: Path of the second input raster.
        output_file: Path of the GeoTIFF to create.

    Raises:
        ValueError: If the two inputs differ in band count, height or width.
    """
    with rasterio.open(file1) as src1, rasterio.open(file2) as src2:
        # Read all bands at once: arrays are shaped (bands, rows, cols).
        stack1 = src1.read()
        stack2 = src2.read()

        # Ensure the dimensions match
        if stack1.shape != stack2.shape:
            raise ValueError("Input files must have the same dimensions")

        # Remember the original data type so the output matches the inputs.
        dtype = stack1.dtype

        # Sum in float64 so that e.g. two large uint16 values cannot overflow,
        # then cast back (the cast truncates for integer dtypes).
        # NOTE(review): nodata pixels are averaged like ordinary values;
        # confirm whether they should be masked out instead.
        average = (
            (stack1.astype(np.float64) + stack2.astype(np.float64)) / 2
        ).astype(dtype)

        # Metadata for the new file
        meta = src1.meta.copy()
        meta.update(dtype=dtype)

        # Write every averaged band to the new file
        with rasterio.open(output_file, 'w', **meta) as dst:
            dst.write(average)

# Example usage: build the 2022-10-16 LST file from the 2022-10-08 and
# 2022-10-24 files that sit on either side of it.
file1 = "/home/scsi/Desktop/Sehoon/crop_yield/MODIS/LST/MODIS_LST_Ukraine_2022-10-08_27.tif"
file2 = "/home/scsi/Desktop/Sehoon/crop_yield/MODIS/LST/MODIS_LST_Ukraine_2022-10-24_29.tif"
output_file = "/home/scsi/Desktop/Sehoon/crop_yield/MODIS/LST/MODIS_LST_Ukraine_2022-10-16_28.tif"

create_average_tif(file1=file1, file2=file2, output_file=output_file)
print(f"New file created: {output_file}")

### 파일 인덱스 정리

In [None]:
# Folder whose GeoTIFFs are re-indexed.
# NOTE(review): the folder is NDVI_EVI, the pattern below matches
# "MODIS_LST_Ukraine_" names, and files are renamed to
# "MODIS_Vegetation_Ukraine_" - confirm this combination is intended.
folder_path = '/home/scsi/Desktop/Sehoon/crop_yield/MODIS/NDVI_EVI/'

# Collect the .tif file names in the folder.
file_names = [f for f in os.listdir(folder_path) if f.endswith('.tif')]

# Capture groups: (1) year, (2) month-day, (3) the current numeric index.
pattern = r"MODIS_LST_Ukraine_(\d{4})-(\d{2}-\d{2})_(\d+)\.tif"

# year -> list of (file name, parsed date, old index) for matching files.
file_data_by_year = defaultdict(list)
for file_name in file_names:
    match = re.match(pattern, file_name)
    if not match:
        continue
    year, month_day, old_index = match.groups()
    date = datetime.strptime(f"{year}-{month_day}", '%Y-%m-%d')
    file_data_by_year[year].append((file_name, date, int(old_index)))

# Within each year, order the files by date and number them from 0.
for year, file_data in file_data_by_year.items():
    file_data.sort(key=lambda entry: entry[1])

    for i, (original_file_name, date, _) in enumerate(file_data):
        new_file_name = f"MODIS_Vegetation_Ukraine_{date.strftime('%Y-%m-%d')}_{i}.tif"
        os.rename(
            os.path.join(folder_path, original_file_name),
            os.path.join(folder_path, new_file_name),
        )
        print(f"Renamed: {original_file_name} -> {new_file_name}")

### MODIS: 월별 가중 평균 계산

In [None]:
def generate_raster_file_paths(year, folder_path, prefix="MODIS_Vegetation_Ukraine"):
    """Collect one year's rasters from a folder, ordered by date.

    A file is selected when its name starts with ``"{prefix}_{year}"`` and
    ends with ``".tif"``. The date is the ``YYYY-MM-DD`` token that follows
    the prefix in the file name.

    Args:
        year: Year to select; it is formatted into the file-name prefix.
        folder_path: Directory to scan (not recursive).
        prefix: File-name prefix in front of the date. Defaults to the
            vegetation product naming used by this cell.

    Returns:
        A tuple ``(paths, start_date, end_date)``: the matching file paths
        sorted by date, and the earliest and latest dates as ``datetime``.

    Raises:
        FileNotFoundError: If no file in ``folder_path`` matches.
        ValueError: If a matching file name carries an unparsable date.
    """
    wanted_start = f"{prefix}_{year}"
    dated_files = []

    for file_name in os.listdir(folder_path):
        if not (file_name.startswith(wanted_start) and file_name.endswith(".tif")):
            continue
        # Drop "<prefix>_" and keep the token up to the next underscore.
        date_str = file_name[len(prefix) + 1:].split('_')[0]
        file_date = datetime.strptime(date_str, "%Y-%m-%d")
        dated_files.append((file_date, os.path.join(folder_path, file_name)))

    # Fail with a readable message instead of an IndexError further down.
    if not dated_files:
        raise FileNotFoundError(
            f"No '{wanted_start}*.tif' files found in {folder_path}"
        )

    dated_files.sort()  # Sort by date (ties fall back to the path)
    start_date = dated_files[0][0]
    end_date = dated_files[-1][0]

    return [path for _, path in dated_files], start_date, end_date

def calculate_modis_weights(file_paths, interval_days):
    """Count, per raster, how many days of its window fall in each month.

    The date in each file name is treated as the LAST day of an
    ``interval_days``-long window. Days of that window which belong to a
    different calendar year than the file's own date are ignored, so a
    window reaching back into the previous December no longer inflates this
    year's December column.

    NOTE(review): confirm that the files are stamped with the end date of
    their compositing period rather than the start date.

    Args:
        file_paths: Raster paths; the date is the 4th ``_``-separated token
            of the base name, formatted ``YYYY-MM-DD``.
        interval_days: Length of each raster's window in days.

    Returns:
        An integer ``DataFrame`` with one row per raster (index 1..n, in
        the order of ``file_paths``) and one column per month (1..12).
    """
    date_format = "%Y-%m-%d"
    day_counts = np.zeros((len(file_paths), 12), dtype=np.int64)

    # enumerate gives each path its own row, even if a path is repeated.
    for row, file_path in enumerate(file_paths):
        # Extract the date from the file name
        date_str = os.path.basename(file_path).split('_')[3]
        end_date = datetime.strptime(date_str, date_format)

        # Walk backwards over the window, one day at a time.
        for offset in range(interval_days):
            day = end_date - timedelta(days=offset)
            if day.year != end_date.year:
                continue  # belongs to the previous year's months
            day_counts[row, day.month - 1] += 1

    return pd.DataFrame(
        day_counts,
        index=np.arange(1, len(file_paths) + 1),
        columns=np.arange(1, 13),
    )

def apply_weights(raster_files, weights, start_month=3, end_month=10):
    """Compute one weighted-mean raster (band 1) per month.

    For each month the rasters are combined with weights proportional to
    that month's column of ``weights``. Rasters whose weight is zero are
    not opened at all.

    NOTE(review): nodata pixels are averaged like ordinary values; confirm
    whether they should be masked.

    Args:
        raster_files: Raster paths, positionally aligned with the rows of
            ``weights``.
        weights: Table with one column per month number, e.g. the result of
            ``calculate_modis_weights``.
        start_month: First month to process (inclusive).
        end_month: Last month to process (inclusive).

    Returns:
        A list of float64 2-D arrays, one per month from ``start_month`` to
        ``end_month``. A month that no raster contributes to yields an
        all-NaN array (and a warning).

    Raises:
        ValueError: If ``raster_files`` is empty.
    """
    import warnings

    if len(raster_files) == 0:
        raise ValueError("raster_files is empty; nothing to average")

    weighted_means = []
    for month in range(start_month, end_month + 1):
        # Pair rasters with weights by position; zip drops any surplus rows.
        contributing = [
            (raster_path, float(weight))
            for raster_path, weight in zip(raster_files, weights[month].values)
            if weight > 0
        ]

        if not contributing:
            # Nothing covers this month: keep the output shape, flag the gap.
            warnings.warn(f"No raster contributes to month {month}; result is NaN")
            with rasterio.open(raster_files[0]) as src:
                weighted_means.append(np.full(src.shape, np.nan))
            continue

        total_weight = sum(weight for _, weight in contributing)
        weighted_sum = None
        for raster_path, weight in contributing:
            with rasterio.open(raster_path) as src:
                data = src.read(1).astype(np.float64)
            part = data * (weight / total_weight)
            weighted_sum = part if weighted_sum is None else weighted_sum + part

        # The normalised weights sum to 1, so the sum already is the mean.
        weighted_means.append(weighted_sum)
    return weighted_means


# Example usage: build the day-count weight table for one year of
# vegetation rasters, using a 16-day window per raster.
year = 2010
folder_path = "/home/scsi/Desktop/Sehoon/crop_yield/MODIS/NDVI_EVI/"
interval = 16

# Date-sorted raster paths for that year, plus the first and last date.
raster_files, start_date, end_date = generate_raster_file_paths(year, folder_path)

# Days contributed by each raster to each month.
weights = calculate_modis_weights(raster_files, interval_days=interval)

# # Apply weights to calculate weighted means
# weighted_means = apply_weights(raster_files, weights)

In [None]:
def generate_raster_file_paths(year, folder_path, prefix="MODIS_LST_Ukraine"):
    """Collect one year's rasters from a folder, ordered by date.

    A file is selected when its name starts with ``"{prefix}_{year}"`` and
    ends with ``".tif"``. The date is the ``YYYY-MM-DD`` token that follows
    the prefix in the file name.

    Args:
        year: Year to select; it is formatted into the file-name prefix.
        folder_path: Directory to scan (not recursive).
        prefix: File-name prefix in front of the date. Defaults to the LST
            product naming used by this cell.

    Returns:
        A tuple ``(paths, start_date, end_date)``: the matching file paths
        sorted by date, and the earliest and latest dates as ``datetime``.

    Raises:
        FileNotFoundError: If no file in ``folder_path`` matches.
        ValueError: If a matching file name carries an unparsable date.
    """
    wanted_start = f"{prefix}_{year}"
    dated_files = []

    for file_name in os.listdir(folder_path):
        if not (file_name.startswith(wanted_start) and file_name.endswith(".tif")):
            continue
        # Drop "<prefix>_" and keep the token up to the next underscore.
        date_str = file_name[len(prefix) + 1:].split('_')[0]
        file_date = datetime.strptime(date_str, "%Y-%m-%d")
        dated_files.append((file_date, os.path.join(folder_path, file_name)))

    # Fail with a readable message instead of an IndexError further down.
    if not dated_files:
        raise FileNotFoundError(
            f"No '{wanted_start}*.tif' files found in {folder_path}"
        )

    dated_files.sort()  # Sort by date (ties fall back to the path)
    start_date = dated_files[0][0]
    end_date = dated_files[-1][0]

    return [path for _, path in dated_files], start_date, end_date

def calculate_modis_weights(file_paths, interval_days):
    """Count, per raster, how many days of its window fall in each month.

    The date in each file name is treated as the LAST day of an
    ``interval_days``-long window. Days of that window which belong to a
    different calendar year than the file's own date are ignored, so a
    window reaching back into the previous December no longer inflates this
    year's December column.

    NOTE(review): confirm that the files are stamped with the end date of
    their compositing period rather than the start date.

    Args:
        file_paths: Raster paths; the date is the 4th ``_``-separated token
            of the base name, formatted ``YYYY-MM-DD``.
        interval_days: Length of each raster's window in days.

    Returns:
        An integer ``DataFrame`` with one row per raster (index 1..n, in
        the order of ``file_paths``) and one column per month (1..12).
    """
    date_format = "%Y-%m-%d"
    day_counts = np.zeros((len(file_paths), 12), dtype=np.int64)

    # enumerate gives each path its own row, even if a path is repeated.
    for row, file_path in enumerate(file_paths):
        date_str = os.path.basename(file_path).split('_')[3]
        end_date = datetime.strptime(date_str, date_format)

        # Walk backwards over the window, one day at a time.
        for offset in range(interval_days):
            day = end_date - timedelta(days=offset)
            if day.year != end_date.year:
                continue  # belongs to the previous year's months
            day_counts[row, day.month - 1] += 1

    return pd.DataFrame(
        day_counts,
        index=np.arange(1, len(file_paths) + 1),
        columns=np.arange(1, 13),
    )

def apply_weights(raster_files, weights, start_month=3, end_month=10):
    """Compute monthly weighted-mean rasters for every band.

    For each month the rasters are combined with weights proportional to
    that month's column of ``weights``. Each contributing raster is opened
    once per month and all bands are read together; rasters whose weight is
    zero are not opened at all. The band count is taken from the first
    raster.

    NOTE(review): nodata pixels are averaged like ordinary values; confirm
    whether they should be masked.

    Args:
        raster_files: Raster paths, positionally aligned with the rows of
            ``weights``.
        weights: Table with one column per month number, e.g. the result of
            ``calculate_modis_weights``.
        start_month: First month to process (inclusive).
        end_month: Last month to process (inclusive).

    Returns:
        A dict mapping the 1-based band number to a list of float64 2-D
        arrays, one per month from ``start_month`` to ``end_month``. A month
        that no raster contributes to yields all-NaN arrays (and a warning).

    Raises:
        ValueError: If ``raster_files`` is empty.
    """
    import warnings

    if len(raster_files) == 0:
        raise ValueError("raster_files is empty; nothing to average")

    with rasterio.open(raster_files[0]) as src:
        band_count = src.count
        raster_shape = src.shape  # (rows, cols)

    band_indexes = list(range(1, band_count + 1))
    weighted_means = {band: [] for band in band_indexes}

    for month in range(start_month, end_month + 1):
        # Pair rasters with weights by position; zip drops any surplus rows.
        contributing = [
            (raster_path, float(weight))
            for raster_path, weight in zip(raster_files, weights[month].values)
            if weight > 0
        ]

        if not contributing:
            # Nothing covers this month: keep the output shape, flag the gap.
            warnings.warn(f"No raster contributes to month {month}; result is NaN")
            for band in band_indexes:
                weighted_means[band].append(np.full(raster_shape, np.nan))
            continue

        total_weight = sum(weight for _, weight in contributing)
        weighted_stack = None
        for raster_path, weight in contributing:
            with rasterio.open(raster_path) as src:
                # A list of indexes returns a (bands, rows, cols) array.
                stack = src.read(band_indexes).astype(np.float64)
            part = stack * (weight / total_weight)
            weighted_stack = part if weighted_stack is None else weighted_stack + part

        # The normalised weights sum to 1, so the sum already is the mean.
        for band in band_indexes:
            weighted_means[band].append(weighted_stack[band - 1])
    return weighted_means

def save_as_tif(data, out_path, reference_file, band_name):
    """Write a 2-D array as a single-band, LZW-compressed float32 GeoTIFF.

    The profile of ``reference_file`` is reused, with dtype, band count and
    compression overridden.

    Args:
        data: 2-D array to write; it is cast to float32.
        out_path: Path of the GeoTIFF to create.
        reference_file: Existing raster whose profile is copied.
        band_name: Label stored as the description of band 1. Skipped when
            empty or ``None``.
    """
    with rasterio.open(reference_file) as src:
        profile = src.profile

    # The output always holds exactly one float32 band.
    profile.update(dtype=rasterio.float32, count=1, compress='lzw')

    with rasterio.open(out_path, 'w', **profile) as dst:
        dst.write(data.astype(rasterio.float32), 1)
        # Previously the band_name argument was accepted but never used.
        if band_name:
            dst.set_band_description(1, band_name)

def process_year(year, input_path, output_path, interval, start_month=3, end_month=10):
    """Create the monthly weighted-mean GeoTIFFs of one year.

    Finds the year's rasters in ``input_path``, derives per-month weights,
    averages every band per month and writes one file per band and month to
    ``<output_path>/<year>/`` as ``MODIS_<band>_<year>_<month>_.tif``.
    Bands 1 and 2 are labelled ``LST_Day`` and ``LST_Night``; any other band
    is labelled ``band<N>``.

    Args:
        year: Year to process.
        input_path: Folder holding the source rasters.
        output_path: Root folder for the results; a per-year subfolder is
            created if needed.
        interval: Length in days of the window each raster represents.
        start_month: First month to produce (inclusive).
        end_month: Last month to produce (inclusive).
    """
    raster_files, _, _ = generate_raster_file_paths(year, input_path)
    weights = calculate_modis_weights(raster_files, interval_days=interval)
    weighted_means = apply_weights(
        raster_files, weights, start_month=start_month, end_month=end_month
    )

    output_folder = os.path.join(output_path, str(year))
    os.makedirs(output_folder, exist_ok=True)

    band_names = {1: 'LST_Day', 2: 'LST_Night'}
    for band, means in weighted_means.items():
        band_name = band_names.get(band, f'band{band}')
        # Number the outputs from the same month the averaging started at,
        # so the month in the file name cannot drift from the data.
        for month, wm in enumerate(means, start=start_month):
            out_file = os.path.join(
                output_folder, f'MODIS_{band_name}_{year}_{month}_.tif'
            )
            save_as_tif(wm, out_file, raster_files[0], band_name)

# Example usage: run the monthly averaging for the LST rasters of every
# year from 2010 through 2023, using an 8-day window per raster.
input_path = "/home/scsi/Desktop/Sehoon/crop_yield/MODIS/LST/"
output_path = '/home/scsi/Desktop/Sehoon/crop_yield/preprocessed/MODIS/'
interval = 8

years = range(2010, 2024)
for year in tqdm(years, desc="Processing years"):
    process_year(year, input_path, output_path, interval)

## **Step 2: Organizing MODIS Data into Monthly Folders and Renaming Files**

### **Final Output**

1. **Monthly Weighted Average TIF Files**  
   - Each file represents the weighted average MODIS data for a specific variable (e.g., LST Day, LST Night) and month.
   - Example output files:
     ```
     MODIS_LST_Day_2010_03.tif
     MODIS_LST_Night_2010_04.tif
     ```

2. **Organized Folder Structure**  
   - The processed files are stored in a structured format:
     ```
     /preprocessed/MODIS/
     ├── 2010/
     │   ├── 03/
     │   ├── 04/
     │   └── ...
     ├── 2011/
     ├── 2012/
     └── ...
     ```

In [None]:
def move_files(base_dir, years, months):
    """Sort each year's files into per-month subfolders.

    For every year and month the folder ``<base_dir>/<year>/<MM>`` is
    created, and every file under ``<base_dir>/<year>`` whose name contains
    ``_<month>_`` (month without zero padding) is moved into it. The
    destination folder itself is skipped during the scan, so files that are
    already in place are not moved onto themselves or logged again.

    Args:
        base_dir: Root folder containing one subfolder per year.
        years: Iterable of years to process.
        months: Iterable of month numbers to process.
    """
    for year in years:
        # Source directory
        src_dir = os.path.join(base_dir, str(year))

        for month in months:
            # Destination directory
            dest_dir = os.path.join(src_dir, f"{month:02d}")
            os.makedirs(dest_dir, exist_ok=True)

            print(f"Processing year {year}, month {month:02d}")
            marker = f"_{month}_"
            for root, dirs, files in os.walk(src_dir):
                # The walk also reaches dest_dir; nothing there needs moving.
                if os.path.normpath(root) == os.path.normpath(dest_dir):
                    continue
                for file in files:
                    if marker not in file:
                        continue
                    src_path = os.path.join(root, file)
                    dest_path = os.path.join(dest_dir, file)
                    print(f"Moving {src_path} to {dest_path}")
                    shutil.move(src_path, dest_path)

# Root of the preprocessed output, with the years (2010-2023) and
# months (3-10) to organise.
base_dir = "/home/scsi/Desktop/Sehoon/crop_yield/preprocessed/MODIS"
years = range(2010, 2023 + 1)
months = range(3, 10 + 1)

# Sort the files into their month folders.
move_files(base_dir=base_dir, years=years, months=months)

In [None]:
def rename_files(base_dir):
    """Zero-pad the month token of every matching .tif below ``base_dir``.

    A name of the form ``<prefix>_<M>_<suffix>.tif`` (``M`` being one or two
    digits) becomes ``<prefix>_<MM><suffix>.tif``: the month is padded to
    two digits and the underscore after it is dropped. Files that do not
    match are left alone.

    Args:
        base_dir: Folder that is searched recursively.
    """
    # prefix / one-or-two-digit month / suffix, each separated by "_".
    pattern = re.compile(r"(.*)_(\d{1,2})_(.*)\.tif$")

    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if not file.endswith(".tif"):
                continue
            match = pattern.match(file)
            if match is None:
                continue

            prefix, month_text, suffix = match.groups()
            padded_month = f"{int(month_text):02d}"
            new_file_name = f"{prefix}_{padded_month}{suffix}.tif"

            src_path = os.path.join(root, file)
            dest_path = os.path.join(root, new_file_name)
            print(f"Renaming {src_path} to {dest_path}")
            shutil.move(src_path, dest_path)

# Root of the preprocessed output whose files get zero-padded month tokens.
base_dir = "/home/scsi/Desktop/Sehoon/crop_yield/preprocessed/MODIS"

# Apply the renaming to everything below that folder.
rename_files(base_dir=base_dir)