In [None]:
import cv2
import numpy as np
import pytesseract
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
import random
from transformers import ViTFeatureExtractor, ViTModel
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import itertools
import optuna

# 1. Downloading Data


*   Images are downloaded from the source: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/
*   Because this work focuses on the summer season, only the images from 16 July to 16 August of each year are downloaded.
*   Images are stored in a separate folder for each year.



In [None]:
import os
import requests
from bs4 import BeautifulSoup
from time import sleep

# Root URL of the daily image listing; year and month folders are appended to it.
base_url = "https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/"

# Local directory that receives one sub-folder of images per year.
root_save_dir = r"D:\polarData"
os.makedirs(root_save_dir, exist_ok=True)

# Years to scan. range() excludes its upper bound, so the last year is 2025.
years = list(range(1978, 2026))

# Dates of interest as zero-padded MMDD strings: 16-31 July and 1-16 August.
july_dates = [f"07{day:02d}" for day in range(16, 32)]
august_dates = [f"08{day:02d}" for day in range(1, 17)]

# Browser-like User-Agent sent with every HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

def get_image_urls(year, month_folder, date_list):
    """Return the URLs of the wanted daily images in one year/month listing.

    Args:
        year: Year used both in the listing URL and in the image file name.
        month_folder: Name of the month sub-folder on the server (e.g. "07_Jul").
        date_list: MMDD strings; an image is wanted when its name contains
            ``N_<year><MMDD>_extn_blmrbl_hires_v3.0.png``.

    Returns:
        A list of full image URLs (listing URL + link target). The list is
        empty when nothing matches or when the listing request fails.
    """
    year_url = f"{base_url}{year}/{month_folder}/"
    print(f"Accessing: {year_url}")

    try:
        listing = requests.get(year_url, headers=headers, timeout=10)
        listing.raise_for_status()

        page = BeautifulSoup(listing.text, "html.parser")

        # Keep every anchor that actually carries a non-empty href.
        all_links = []
        for anchor in page.find_all("a"):
            target = anchor.get("href")
            if target:
                all_links.append(target)
        print(f"Found {len(all_links)} files in {year}/{month_folder} folder.")

        images = []
        for href in all_links:
            if not href.endswith(".png"):
                continue
            for date in date_list:
                if f"N_{year}{date}_extn_blmrbl_hires_v3.0.png" in href:
                    images.append(year_url + href)
                    print(f" Found Image: {href}")

        if len(images) == 0:
            print(f" No matching images found for {year}/{month_folder}!")

        return images

    except requests.exceptions.RequestException as e:
        # Network/HTTP problems are reported and treated as "no images".
        print(f" Failed to access {year_url}: {e}")
        return []

def download_images(image_urls, year_folder):
    """Download each URL into `year_folder`, skipping files that already exist.

    Each image is first streamed into a temporary ``<name>.part`` file and only
    renamed to its final name once the download has completed. An interrupted
    download therefore never leaves a truncated file under the final name,
    which a later run would otherwise skip as "already exists".

    Args:
        image_urls: Iterable of image URLs; the last path component is used as
            the local file name.
        year_folder: Destination directory (created if missing).

    Returns:
        None. Request failures are printed and the loop moves on to the next URL.
    """
    os.makedirs(year_folder, exist_ok=True)

    for url in image_urls:
        filename = os.path.join(year_folder, os.path.basename(url))
        if os.path.exists(filename):
            print(f" Skipping {filename} (already exists)")
            continue

        print(f"Downloading {url}...")
        partial_path = filename + ".part"
        try:
            # The context manager releases the streamed connection in every case.
            with requests.get(url, headers=headers, stream=True, timeout=20) as response:
                response.raise_for_status()

                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)

            # Publish the file under its final name only after a complete download.
            os.replace(partial_path, filename)
            print(f" Saved: {filename}")
        except requests.exceptions.RequestException as e:
            print(f" Failed to download: {url} - {e}")
        finally:
            # After a successful rename nothing is left to remove; after any
            # failure this discards the incomplete file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

# Driver loop: for every configured year, look up the July and August listings
# and download the matching images into that year's folder.
for year in years:
    print(f"\n Fetching images for {year}...")

    # One sub-folder per year below the root save directory.
    year_folder = os.path.join(root_save_dir, f"{year}")

    # A month is only downloaded when its listing produced at least one URL.
    july_images = get_image_urls(year, "07_Jul", july_dates)
    if july_images:
        download_images(july_images, year_folder)

    august_images = get_image_urls(year, "08_Aug", august_dates)
    if august_images:
        download_images(august_images, year_folder)

    # Wait two seconds before moving on to the next year.
    sleep(2)

print("\n Download complete!")



 Fetching images for 1978...
Accessing: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/07_Jul/
 Failed to access https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/07_Jul/: 404 Client Error: Not Found for url: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/07_Jul/
Accessing: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/08_Aug/
 Failed to access https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/08_Aug/: 404 Client Error: Not Found for url: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1978/08_Aug/

 Fetching images for 1979...
Accessing: https://noaadata.apps.nsidc.org/NOAA/G02135/north/daily/images/1979/07_Jul/
Found 249 files in 1979/07_Jul folder.
 Found Image: N_19790716_extn_blmrbl_hires_v3.0.png
 Found Image: N_19790717_extn_blmrbl_hires_v3.0.png
 Found Image: N_19790718_extn_blmrbl_hires_v3.0.png
 Found Image: N_19790719_extn_blmrbl_hires_v3.0.png
 Found Image: N_

# 2. Handling Missing Data


*   In each year folder, the images that contain no data are identified and removed.




In [None]:
import os
import cv2
import pytesseract
import numpy as np
import matplotlib.pyplot as plt

# Tell pytesseract where the Tesseract binary lives. The setting is the
# `tesseract_cmd` attribute of the inner `pytesseract.pytesseract` module;
# assigning `pytesseract.pytesseract_cmd` only creates an unused attribute.
# The override is applied only when the binary exists at this location, so a
# Tesseract found on PATH keeps working otherwise.
TESSERACT_EXE = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# Root folder holding one sub-folder of images per year.
polar_data_dir = r"D:\polarData"

def extract_yellow_text_region(image):
    """Keep only the yellow-range pixels of a BGR image; the rest becomes black.

    The image is converted to HSV and every pixel whose (H, S, V) lies between
    (20, 100, 100) and (40, 255, 255) is kept.

    Returns:
        An image of the same shape as `image` with non-matching pixels zeroed.
    """
    yellow_low = np.array([20, 100, 100])
    yellow_high = np.array([40, 255, 255])

    in_yellow_range = cv2.inRange(
        cv2.cvtColor(image, cv2.COLOR_BGR2HSV), yellow_low, yellow_high
    )
    return cv2.bitwise_and(image, image, mask=in_yellow_range)

def enhance_text_for_ocr(text_region):
    """Turn a colour text region into a cleaned binary image for OCR.

    Steps: grayscale conversion, binary + Otsu thresholding, then two passes of
    morphological closing with a 5x5 kernel.

    Returns:
        The binarised and closed single-channel image.
    """
    grayscale = cv2.cvtColor(text_region, cv2.COLOR_BGR2GRAY)

    # NOTE(review): with THRESH_OTSU set, OpenCV is documented to choose the
    # threshold itself, so the 180 passed here should have no effect — confirm.
    binarised = cv2.threshold(
        grayscale, 180, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    closing_kernel = np.ones((5, 5), np.uint8)
    return cv2.morphologyEx(binarised, cv2.MORPH_CLOSE, closing_kernel, iterations=2)

def is_no_data_image(image_path, show_debug=False):
    """Detects 'NO DATA' in an image using OCR on enhanced yellow text.

    Args:
        image_path: Path of the image to inspect.
        show_debug: When True, plot the extracted yellow region next to the
            cleaned binary image that is handed to the OCR engine.

    Returns:
        True when the recognised text contains "NO" (case-insensitive), False
        otherwise, including when the image cannot be read.
    """
    img = cv2.imread(image_path)

    if img is None:
        print(f" Skipping {image_path}: Unable to read image.")
        return False

    yellow_text = extract_yellow_text_region(img)
    enhanced_text = enhance_text_for_ocr(yellow_text)

    # The whitelist is written without quotes or spaces. pytesseract splits the
    # config string shell-style, so a quoted value containing a space is not
    # guaranteed to reach Tesseract as one argument (it can leak the quote
    # character into the whitelist and turn the remainder into a stray argument).
    custom_oem_psm = "--oem 3 --psm 6 -c tessedit_char_whitelist=NODATA"

    text = pytesseract.image_to_string(enhanced_text, config=custom_oem_psm).strip()

    print(f" OCR Detected Text in {image_path}:\n{text}")

    if show_debug:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        # OpenCV images are BGR while matplotlib expects RGB; convert so the
        # yellow text is actually displayed as yellow.
        axes[0].imshow(cv2.cvtColor(yellow_text, cv2.COLOR_BGR2RGB))
        axes[0].set_title("Extracted Yellow Text Region")
        axes[0].axis("off")

        axes[1].imshow(enhanced_text, cmap="gray")
        axes[1].set_title("Final Cleaned Text for OCR")
        axes[1].axis("off")

        plt.show()

    if "NO" in text.upper():
        print(f" NO DATA detected: {image_path}")
        return True

    return False

def process_images(root_folder):
    """Delete every PNG below `root_folder` that is classified as "NO DATA".

    Each sub-directory of `root_folder` is visited in sorted order; entries that
    are not directories are ignored. Every ``.png`` file inside is passed to
    `is_no_data_image` and removed from disk when that returns True.
    """
    for year_folder in sorted(os.listdir(root_folder)):
        year_path = os.path.join(root_folder, year_folder)
        if os.path.isdir(year_path):
            print(f"\n Checking images in {year_folder}...")

            png_names = [name for name in os.listdir(year_path) if name.endswith(".png")]
            for filename in png_names:
                image_path = os.path.join(year_path, filename)

                if not is_no_data_image(image_path, show_debug=False):
                    print(f" Valid Image: {image_path}")
                    continue

                print(f" Deleting: {image_path}")
                os.remove(image_path)

# Scan every year folder below polar_data_dir and delete the images that the
# OCR check classifies as "NO DATA".
process_images(polar_data_dir)

print("\n Processing complete! All NO DATA images have been removed.")



 Checking images in 1979...
 OCR Detected Text in D:\polarData\1979\N_19790717_extn_blmrbl_hires_v3.0.png:

 Valid Image: D:\polarData\1979\N_19790717_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790719_extn_blmrbl_hires_v3.0.png:
'
 Valid Image: D:\polarData\1979\N_19790719_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790721_extn_blmrbl_hires_v3.0.png:

 Valid Image: D:\polarData\1979\N_19790721_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790723_extn_blmrbl_hires_v3.0.png:

 Valid Image: D:\polarData\1979\N_19790723_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790725_extn_blmrbl_hires_v3.0.png:
'
 Valid Image: D:\polarData\1979\N_19790725_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790727_extn_blmrbl_hires_v3.0.png:
'
 Valid Image: D:\polarData\1979\N_19790727_extn_blmrbl_hires_v3.0.png
 OCR Detected Text in D:\polarData\1979\N_19790729_extn_blmrbl_hires

# 3. Preprocessing Images and Extracting Features from Images


*   The extracted numerical features are stored in a CSV file for each year, inside that year's folder.



In [None]:
#folder wise csv generation
def resize_image(image, target_size=(512, 512)):
    """Return `image` resampled to `target_size` with bilinear interpolation."""
    resized = cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)
    return resized

def align_images(img1, img2):
    """Warp `img1` onto `img2` using ORB feature matches and a RANSAC homography.

    Args:
        img1: Image to be aligned.
        img2: Reference image; the result has its width and height.

    Returns:
        `img1` warped into the reference frame. When no homography can be
        estimated (no descriptors in either image, fewer than four matches, or
        the estimation itself fails) `img1` is returned without geometric
        correction, resized to the reference size if necessary.
    """
    output_size = (img2.shape[1], img2.shape[0])  # (width, height)

    def _unaligned(reason):
        # Fallback used when alignment is impossible: keep the pixels as they
        # are and only guarantee the output size.
        print(f" Warning: alignment skipped ({reason}).")
        if (img1.shape[1], img1.shape[0]) == output_size:
            return img1
        return cv2.resize(img1, output_size, interpolation=cv2.INTER_LINEAR)

    orb = cv2.ORB_create()
    keypoints1, descriptors1 = orb.detectAndCompute(img1, None)
    keypoints2, descriptors2 = orb.detectAndCompute(img2, None)

    # detectAndCompute yields None descriptors when an image has no keypoints.
    if descriptors1 is None or descriptors2 is None:
        return _unaligned("no ORB descriptors found")

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors1, descriptors2)
    matches = sorted(matches, key=lambda x: x.distance)

    # A homography needs at least four point correspondences.
    if len(matches) < 4:
        return _unaligned("fewer than 4 feature matches")

    src_pts = np.float32([keypoints1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([keypoints2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)

    H, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if H is None:
        return _unaligned("homography estimation failed")

    return cv2.warpPerspective(img1, H, output_size)

def denoise_image(image):
    """Return `image` smoothed with a 5x5 Gaussian blur."""
    blur_kernel = (5, 5)
    return cv2.GaussianBlur(image, blur_kernel, 0)

def grayscale_and_normalize(image):
    """Convert a BGR image to grayscale and min-max normalise it to 0-255."""
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    stretched = cv2.normalize(gray_image, None, 0, 255, cv2.NORM_MINMAX)
    return stretched

def extract_sea_ice(image):
    """Binarise a single-channel image with Otsu's threshold and return the mask."""
    return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

def preprocess_image(image_path, reference_image=None):
    """Load an image and reduce it to a binary mask.

    Pipeline: read -> resize -> (optional) align to `reference_image` ->
    Gaussian denoise -> grayscale + min-max normalise -> Otsu threshold.

    Args:
        image_path: Path of the image to load.
        reference_image: Optional image to align against; when None the
            alignment step is skipped.

    Returns:
        The binary mask produced by `extract_sea_ice`.

    Raises:
        ValueError: If the image cannot be read from `image_path`.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None; fail here with the
    # offending path instead of with an opaque error inside cv2.resize.
    if img is None:
        raise ValueError(f"Unable to read image: {image_path}")

    resized_img = resize_image(img)

    if reference_image is not None:
        aligned_img = align_images(resized_img, reference_image)
    else:
        aligned_img = resized_img

    denoised_img = denoise_image(aligned_img)
    grayscale_img = grayscale_and_normalize(denoised_img)
    ice_mask = extract_sea_ice(grayscale_img)

    return ice_mask

def detect_edges(image):
    """Return the Canny edge map of `image` (thresholds 50 and 150)."""
    lower_threshold, upper_threshold = 50, 150
    return cv2.Canny(image, threshold1=lower_threshold, threshold2=upper_threshold)

def apply_circular_mask(image):
    """Applies a precise circular mask to remove background and bottom ring artifacts.

    Pixels outside a filled circle are set to zero. The circle's centre and
    radius come from `detect_globe_boundary(image)`, and the radius is reduced
    by 10 pixels before the mask is drawn.

    NOTE(review): `detect_globe_boundary` is not defined in the part of the
    notebook visible here — confirm it exists before calling this function.
    """
    h, w = image.shape[:2]
    x, y, radius = detect_globe_boundary(image)

    # Single-channel mask: 255 inside the (shrunken) circle, 0 elsewhere.
    mask = np.zeros((h, w), dtype=np.uint8)
    cv2.circle(mask, (x, y), radius - 10, 255, -1)

    return cv2.bitwise_and(image, image, mask=mask)

def track_ice_retreat(image_old, image_new):
    """Return a binary mask of the pixels that differ between two images.

    A pixel is set to 255 when the absolute difference between `image_old` and
    `image_new` exceeds 50, and to 0 otherwise. The difference is absolute, so
    changes in either direction are marked.
    """
    change = cv2.absdiff(image_old, image_new)
    return cv2.threshold(change, 50, 255, cv2.THRESH_BINARY)[1]

def extract_features(ice_mask, reference_mask=None):
    """Compute coverage and change statistics for a binary mask.

    Args:
        ice_mask: Mask to analyse.
        reference_mask: Optional mask to compare against.

    Returns:
        A tuple ``(ice_coverage, retreat_area, edges, retreat_mask)`` where
        `ice_coverage` is the percentage of non-zero pixels in `ice_mask`,
        `retreat_area` is the percentage of pixels flagged by
        `track_ice_retreat` (0 when no reference is given), `edges` is the
        edge map of `ice_mask`, and `retreat_mask` is the change mask (all
        zeros when no reference is given).
    """
    def _percent_nonzero(mask):
        # Share of non-zero pixels relative to height * width, in percent.
        return np.sum(mask > 0) / (mask.shape[0] * mask.shape[1]) * 100

    edges = detect_edges(ice_mask)
    ice_coverage = _percent_nonzero(ice_mask)

    if reference_mask is None:
        retreat_mask = np.zeros_like(ice_mask)
        retreat_area = 0
    else:
        retreat_mask = track_ice_retreat(reference_mask, ice_mask)
        retreat_area = _percent_nonzero(retreat_mask)

    return ice_coverage, retreat_area, edges, retreat_mask

def get_first_image(folder_path):
    """Return the path of the alphabetically first image in `folder_path`.

    Files ending in .png, .jpg or .jpeg count as images. Returns None when the
    folder contains no such file.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    candidates = [name for name in os.listdir(folder_path) if name.endswith(image_suffixes)]
    if not candidates:
        return None
    # The smallest name is the first element of the sorted listing.
    return os.path.join(folder_path, min(candidates))

def process_folder(folder_path, static_1979_image_path, previous_year_image_path):
    """Extract ice features for every image in a folder and save them as CSV.

    Each image is preprocessed (aligned to the static 1979 image) and compared
    against two reference masks: the previous year's image and the static 1979
    image. The results are written to ``ice_extent_analysis.csv`` inside
    `folder_path`.

    Args:
        folder_path: Folder containing the images of one year.
        static_1979_image_path: Path of the fixed 1979 baseline image.
        previous_year_image_path: Path of the previous year's reference image.

    Returns:
        A DataFrame with one row per image and the columns "Image",
        "Ice Coverage (%)", "Retreat Area (Yearly) (%)" and
        "Retreat Area (Since 1979) (%)".

    Raises:
        ValueError: If the static 1979 image cannot be read.
    """
    columns = [
        "Image",
        "Ice Coverage (%)",
        "Retreat Area (Yearly) (%)",
        "Retreat Area (Since 1979) (%)",
    ]

    reference_img_static = cv2.imread(static_1979_image_path)
    if reference_img_static is None:
        raise ValueError(f"Unable to read reference image: {static_1979_image_path}")
    reference_img_static = resize_image(reference_img_static)

    image_names = [
        name for name in sorted(os.listdir(folder_path))
        if name.endswith(('.png', '.jpg', '.jpeg'))
    ]

    results = []

    if image_names:
        # Both reference masks depend only on the reference paths, not on the
        # image being processed, so they are computed once instead of once per
        # image.
        reference_mask_yearly = preprocess_image(previous_year_image_path, reference_img_static)
        reference_mask_static = preprocess_image(static_1979_image_path, reference_img_static)

    for filename in image_names:
        image_path = os.path.join(folder_path, filename)

        print(f"Processing {filename}...")

        processed_mask = preprocess_image(image_path, reference_img_static)

        ice_coverage, retreat_area_yearly, _, _ = extract_features(processed_mask, reference_mask_yearly)
        _, retreat_area_1979, _, _ = extract_features(processed_mask, reference_mask_static)

        results.append({
            "Image": filename,
            "Ice Coverage (%)": ice_coverage,
            "Retreat Area (Yearly) (%)": retreat_area_yearly,
            "Retreat Area (Since 1979) (%)": retreat_area_1979
        })

    output_csv = os.path.join(folder_path, "ice_extent_analysis.csv")

    # Passing the columns explicitly keeps the CSV header intact even when the
    # folder contained no images.
    df_results = pd.DataFrame(results, columns=columns)
    df_results.to_csv(output_csv, index=False)

    print(f"Analysis complete. Results saved to {output_csv}")
    return df_results

def process_all_folders(root_folder, static_1979_image_path):
    """Run `process_folder` on every year folder, using the preceding folder as reference.

    Sub-directories of `root_folder` are sorted by name; each one except the
    first is processed with the first image of the directory before it as the
    "previous year" reference. Plain files in `root_folder` are ignored when
    the pairs are built, so a stray file can no longer take the place of a
    year's predecessor and cause that year to be skipped.

    Args:
        root_folder: Folder containing one sub-folder per year.
        static_1979_image_path: Path of the fixed 1979 baseline image.
    """
    folders = [
        name for name in sorted(os.listdir(root_folder))
        if os.path.isdir(os.path.join(root_folder, name))
    ]

    for previous_name, current_name in zip(folders, folders[1:]):
        current_folder = os.path.join(root_folder, current_name)

        # Previous year image retrieval for calculating yearly retreat.
        previous_folder = os.path.join(root_folder, previous_name)
        previous_year_image_path = get_first_image(previous_folder)

        if previous_year_image_path is None:
            print(f" Skipping {current_name}: No previous year image found.")
            continue

        print(f"\n Processing folder: {current_name} (Previous Year: {previous_name})")

        process_folder(current_folder, static_1979_image_path, previous_year_image_path)


# Root folder holding one sub-folder of images per year.
polar_data_dir = r"D:\polarData"
# Fixed 1979 image used as the alignment target and long-term baseline.
# NOTE(review): this path is D:\1979\..., not D:\polarData\1979\... where the
# downloaded images are stored — confirm that this location is intentional.
static_1979_image_path = r"D:\1979\N_19790717_extn_blmrbl_hires_v3.0.png"

# Writes one ice_extent_analysis.csv into each processed year folder.
process_all_folders(polar_data_dir, static_1979_image_path)

print("\n CSV generated for all folders!")



 Processing folder: 1980 (Previous Year: 1979)
Processing N_19800717_extn_blmrbl_hires_v3.0.png...
Processing N_19800719_extn_blmrbl_hires_v3.0.png...
Processing N_19800721_extn_blmrbl_hires_v3.0.png...
Processing N_19800723_extn_blmrbl_hires_v3.0.png...
Processing N_19800725_extn_blmrbl_hires_v3.0.png...
Processing N_19800727_extn_blmrbl_hires_v3.0.png...
Processing N_19800729_extn_blmrbl_hires_v3.0.png...
Processing N_19800731_extn_blmrbl_hires_v3.0.png...
Processing N_19800802_extn_blmrbl_hires_v3.0.png...
Processing N_19800804_extn_blmrbl_hires_v3.0.png...
Processing N_19800806_extn_blmrbl_hires_v3.0.png...
Processing N_19800808_extn_blmrbl_hires_v3.0.png...
Processing N_19800810_extn_blmrbl_hires_v3.0.png...
Processing N_19800812_extn_blmrbl_hires_v3.0.png...
Processing N_19800814_extn_blmrbl_hires_v3.0.png...
Processing N_19800816_extn_blmrbl_hires_v3.0.png...
Analysis complete. Results saved to D:\polarData\1980\ice_extent_analysis.csv

 Processing folder: 1981 (Previous Year: 

# 4. ViT Embeddings for the Multimodal Experiment

In [None]:
import os
import torch
import cv2
import numpy as np
import pandas as pd
from transformers import ViTFeatureExtractor, ViTModel

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Using device: {device}")

# Load the preprocessor and the backbone of the google/vit-base-patch16-224
# checkpoint; the model is moved to the selected device.
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = ViTModel.from_pretrained("google/vit-base-patch16-224").to(device)

# Folder that contains the per-year sub-folders.
root_folder = r"D:\polarData"

def find_image_file(folder, base_name):
    """Return the path of the first entry in `folder` whose name starts with `base_name`.

    Entries are examined in the order `os.listdir` yields them. Returns None
    when no entry matches.
    """
    matching = (entry for entry in os.listdir(folder) if entry.startswith(base_name))
    first_match = next(matching, None)
    if first_match is None:
        return None
    return os.path.join(folder, first_match)

def extract_vit_features(image_path):
    """Return the ViT [CLS] embedding of one image.

    The image is read with OpenCV, converted from BGR to RGB, and passed
    through the checkpoint's own preprocessor (`feature_extractor`) so that it
    is resized, rescaled and normalised the way the pretrained model expects.
    Feeding BGR pixels that are merely scaled to 0-1 does not match that input
    distribution.

    Args:
        image_path: Path of the image to embed.

    Returns:
        A 1-D NumPy array holding the first token of the model's last hidden
        state, or None when the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f" Warning: Could not read {image_path}")
        return None

    # OpenCV decodes images as BGR; the pretrained model was trained on RGB.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The preprocessor handles resizing and mean/std normalisation.
    inputs = feature_extractor(images=image_rgb, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    with torch.no_grad():
        # Index 0 along the sequence axis is the [CLS] token.
        cls_embedding = model(pixel_values=pixel_values).last_hidden_state[:, 0, :]

    return cls_embedding.cpu().numpy().squeeze()

def _vit_features_up_to_date(updated_csv_path, df):
    """Tell whether an existing updated CSV already covers the images in `df`.

    True only when the file exists, contains the "Image" and "ViT_Feature_0"
    columns, lists exactly the same images as `df`, and has no missing value
    in "ViT_Feature_0".
    """
    if "Image" not in df.columns or not os.path.exists(updated_csv_path):
        return False
    try:
        existing = pd.read_csv(
            updated_csv_path,
            usecols=lambda column: column in ("Image", "ViT_Feature_0"),
        )
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        return False
    if not {"Image", "ViT_Feature_0"} <= set(existing.columns):
        return False
    same_images = existing["Image"].astype(str).tolist() == df["Image"].astype(str).tolist()
    return same_images and bool(existing["ViT_Feature_0"].notna().all())


def process_folders(root_folder):
    """Add ViT embeddings to the per-year feature CSVs.

    For every sub-folder of `root_folder` except "1979", reads
    ``ice_extent_analysis.csv``, computes a 768-value embedding for each listed
    image, and writes the original columns plus a "Year" column and
    "ViT_Feature_0" .. "ViT_Feature_767" to
    ``ice_extent_analysis_updated.csv`` in the same folder.

    A folder is skipped when its embeddings already exist. That check also
    looks at the updated CSV, because the input CSV never receives ViT columns;
    checking only the input would recompute every folder on each run.
    """
    vit_dim = 768

    for year_folder in sorted(os.listdir(root_folder)):
        if year_folder == "1979":
            print(f" Skipping reference year: {year_folder}")
            continue

        year_path = os.path.join(root_folder, year_folder)

        if not os.path.isdir(year_path):
            continue

        print(f" Processing folder: {year_folder}...")

        csv_path = os.path.join(year_path, "ice_extent_analysis.csv")
        if not os.path.exists(csv_path):
            print(f" No CSV found in {year_folder}, skipping.")
            continue

        df = pd.read_csv(csv_path)
        updated_csv_path = os.path.join(year_path, "ice_extent_analysis_updated.csv")

        if "ViT_Feature_0" in df.columns or _vit_features_up_to_date(updated_csv_path, df):
            print(f" ViT features already exist in {year_folder}, skipping.")
            continue

        vit_features_list = []
        images_processed = 0

        for image_name in df["Image"]:
            image_path = find_image_file(year_path, image_name)

            if image_path:
                vit_features = extract_vit_features(image_path)

                if vit_features is not None:
                    vit_features_list.append(vit_features.tolist())
                    images_processed += 1
                else:
                    print(f" Warning: No features extracted for {image_name}")
                    # Placeholder row keeps the features aligned with df.
                    vit_features_list.append([None] * vit_dim)
            else:
                print(f" Image {image_name} not found in {year_folder}, skipping.")
                vit_features_list.append([None] * vit_dim)

        if images_processed == 0:
            print(f" No images processed for {year_folder}!")
            continue

        vit_columns = [f"ViT_Feature_{i}" for i in range(vit_dim)]
        vit_df = pd.DataFrame(vit_features_list, columns=vit_columns)

        df.insert(0, "Year", int(year_folder))

        updated_df = pd.concat([df, vit_df], axis=1)
        updated_df.to_csv(updated_csv_path, index=False)

        print(f" Updated CSV saved: {updated_csv_path} ({images_processed} images processed)")

# Compute the ViT embeddings for every year folder below root_folder and write
# an ice_extent_analysis_updated.csv next to each input CSV.
process_folders(root_folder)

print("\n All folders processed! CSVs updated with ViT features.")


 Using device: cpu


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Skipping reference year: 1979
 Processing folder: 1980...
 Updated CSV saved: D:\polarData\1980\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1981...
 Updated CSV saved: D:\polarData\1981\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1982...
 Updated CSV saved: D:\polarData\1982\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1983...
 Updated CSV saved: D:\polarData\1983\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1984...
 Updated CSV saved: D:\polarData\1984\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1985...
 Updated CSV saved: D:\polarData\1985\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1986...
 Updated CSV saved: D:\polarData\1986\ice_extent_analysis_updated.csv (16 images processed)
 Processing folder: 1987...
 Updated CSV saved: D:\polarData\1987\ice_extent_analysis_updated.csv (16 images processed)
 Processi

# 5. Merging the CSV for Each Year into a Single CSV File for the Final Experiment

In [None]:
def merge_all_csvs(root_folder, first_year=1980, last_year=2024):
    """Concatenate the per-year updated CSVs into one dataset file.

    Every entry of `root_folder` whose name is a number between `first_year`
    and `last_year` (inclusive) is treated as a year folder. Its
    ``ice_extent_analysis_updated.csv`` is appended to the merged dataset,
    which is saved as ``ice_extent_full_dataset.csv`` in `root_folder`.

    Args:
        root_folder: Folder containing one sub-folder per year.
        first_year: First year to include (default 1980).
        last_year: Last year to include (default 2024).

    Returns:
        None. Nothing is written when no per-year CSV is found.
    """
    all_data = []

    for year_folder in sorted(os.listdir(root_folder)):
        in_range = year_folder.isdigit() and first_year <= int(year_folder) <= last_year
        if not in_range:
            print(f" Skipping folder: {year_folder}")
            continue

        year_path = os.path.join(root_folder, year_folder)
        updated_csv_path = os.path.join(year_path, "ice_extent_analysis_updated.csv")

        if os.path.exists(updated_csv_path):
            df = pd.read_csv(updated_csv_path)
            all_data.append(df)
            print(f" Merged data from {year_folder}")
        else:
            print(f" No updated CSV found in {year_folder}, skipping.")

    if not all_data:
        print(" No data found! Check if CSVs exist.")
        return

    final_df = pd.concat(all_data, ignore_index=True)

    # The extracted features and ViT embeddings for the multimodal approach
    # are stored in a single CSV.
    final_csv_path = os.path.join(root_folder, "ice_extent_full_dataset.csv")
    final_df.to_csv(final_csv_path, index=False)

    print(f" Merged dataset saved: {final_csv_path}")

# Build ice_extent_full_dataset.csv in root_folder from the per-year updated CSVs.
merge_all_csvs(root_folder)


 Skipping folder: 1979
 Merged data from 1980
 Merged data from 1981
 Merged data from 1982
 Merged data from 1983
 Merged data from 1984
 Merged data from 1985
 Merged data from 1986
 Merged data from 1987
 Merged data from 1988
 Merged data from 1989
 Merged data from 1990
 Merged data from 1991
 Merged data from 1992
 Merged data from 1993
 Merged data from 1994
 Merged data from 1995
 Merged data from 1996
 Merged data from 1997
 Merged data from 1998
 Merged data from 1999
 Merged data from 2000
 Merged data from 2001
 Merged data from 2002
 Merged data from 2003
 Merged data from 2004
 Merged data from 2005
 Merged data from 2006
 Merged data from 2007
 Merged data from 2008
 Merged data from 2009
 Merged data from 2010
 Merged data from 2011
 Merged data from 2012
 Merged data from 2013
 Merged data from 2014
 Merged data from 2015
 Merged data from 2016
 Merged data from 2017
 Merged data from 2018
 Merged data from 2019
 Merged data from 2020
 Merged data from 2021
 Merged dat