# Script to download data

In [1]:
import requests
from pathlib import Path
from tqdm import tqdm
import time

# Configuration
BASE_URL = "https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/"
START_YEAR = 1929
END_YEAR = 2025
OUTPUT_DIR = Path("noaa_gsod_data")

# Create output directory
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Download directory: {OUTPUT_DIR.absolute()}")
print(f"Years to download: {START_YEAR} - {END_YEAR}")
print(f"Total files: {END_YEAR - START_YEAR + 1}")

def download_file(url, output_path):
    """Download a file with progress bar."""
    try:
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        
        total_size = int(response.headers.get('content-length', 0))
        
        with open(output_path, 'wb') as f:
            if total_size == 0:
                f.write(response.content)
            else:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        
        return True, total_size
    except requests.exceptions.RequestException as e:
        return False, str(e)

def download_all_years():
    """Download all tar.gz files from START_YEAR to END_YEAR."""
    results = {"success": [], "failed": []}
    
    years = range(START_YEAR, END_YEAR + 1)
    
    for year in tqdm(years, desc="Downloading files"):
        filename = f"{year}.tar.gz"
        url = f"{BASE_URL}{filename}"
        output_path = OUTPUT_DIR / filename
        
        # Skip if file already exists
        if output_path.exists():
            file_size = output_path.stat().st_size
            print(f"⏭️  Skipping {filename} (already exists, {file_size:,} bytes)")
            results["success"].append((year, file_size))
            continue
        
        success, result = download_file(url, output_path)
        
        if success:
            print(f"✓ Downloaded {filename} ({result:,} bytes)")
            results["success"].append((year, result))
        else:
            print(f"✗ Failed to download {filename}: {result}")
            results["failed"].append((year, result))
        
        # Small delay to be respectful to the server
        time.sleep(0.5)
    
    return results

# Run the download
print("\nStarting download...\n")
results = download_all_years()

# Summary
print("\n" + "="*60)
print("DOWNLOAD SUMMARY")
print("="*60)
print(f"Successful downloads: {len(results['success'])}")
print(f"Failed downloads: {len(results['failed'])}")

if results['success']:
    total_size = sum(size for _, size in results['success'])
    print(f"Total downloaded size: {total_size / (1024**3):.2f} GB")

if results['failed']:
    print("\nFailed years:")
    for year, error in results['failed']:
        print(f"  - {year}: {error}")

Download directory: /home/alumno/Desktop/datos/Large-Scale Data Analytics/Final Project/noaa_gsod_data
Years to download: 1929 - 2025
Total files: 97

Starting download...



Downloading files:   0%|          | 0/97 [00:00<?, ?it/s]

✓ Downloaded 1929.tar.gz (48,705 bytes)


Downloading files:   1%|          | 1/97 [00:02<04:46,  2.98s/it]

✓ Downloaded 1930.tar.gz (159,645 bytes)


Downloading files:   2%|▏         | 2/97 [00:04<03:42,  2.34s/it]

✓ Downloaded 1931.tar.gz (228,365 bytes)


Downloading files:   3%|▎         | 3/97 [00:06<03:27,  2.21s/it]

✓ Downloaded 1932.tar.gz (250,415 bytes)


Downloading files:   4%|▍         | 4/97 [00:08<03:03,  1.97s/it]

✓ Downloaded 1933.tar.gz (405,611 bytes)


Downloading files:   5%|▌         | 5/97 [00:11<03:21,  2.19s/it]

✓ Downloaded 1934.tar.gz (465,420 bytes)


Downloading files:   6%|▌         | 6/97 [00:12<02:57,  1.95s/it]

✓ Downloaded 1935.tar.gz (625,337 bytes)


Downloading files:   7%|▋         | 7/97 [00:14<02:43,  1.81s/it]

✓ Downloaded 1936.tar.gz (1,172,616 bytes)


Downloading files:   8%|▊         | 8/97 [00:15<02:42,  1.83s/it]

✓ Downloaded 1937.tar.gz (1,863,665 bytes)


Downloading files:   9%|▉         | 9/97 [00:18<02:46,  1.89s/it]

✓ Downloaded 1938.tar.gz (1,242,350 bytes)


Downloading files:  10%|█         | 10/97 [00:19<02:41,  1.85s/it]

✓ Downloaded 1939.tar.gz (1,584,028 bytes)


Downloading files:  11%|█▏        | 11/97 [00:21<02:41,  1.87s/it]

✓ Downloaded 1940.tar.gz (2,205,300 bytes)


Downloading files:  12%|█▏        | 12/97 [00:24<02:54,  2.05s/it]

✓ Downloaded 1941.tar.gz (2,662,172 bytes)


Downloading files:  13%|█▎        | 13/97 [00:26<03:11,  2.28s/it]

✓ Downloaded 1942.tar.gz (3,389,849 bytes)


Downloading files:  14%|█▍        | 14/97 [00:29<03:10,  2.30s/it]

✓ Downloaded 1943.tar.gz (5,401,666 bytes)


Downloading files:  15%|█▌        | 15/97 [00:32<03:23,  2.48s/it]

✓ Downloaded 1944.tar.gz (5,782,279 bytes)


Downloading files:  16%|█▋        | 16/97 [00:35<03:43,  2.75s/it]

✓ Downloaded 1945.tar.gz (6,291,691 bytes)


Downloading files:  18%|█▊        | 17/97 [00:39<04:16,  3.21s/it]

✓ Downloaded 1946.tar.gz (3,664,229 bytes)


Downloading files:  19%|█▊        | 18/97 [00:42<03:53,  2.96s/it]

✓ Downloaded 1947.tar.gz (3,694,836 bytes)


Downloading files:  20%|█▉        | 19/97 [00:46<04:18,  3.32s/it]

✓ Downloaded 1948.tar.gz (7,642,246 bytes)


Downloading files:  21%|██        | 20/97 [00:50<04:26,  3.46s/it]

✓ Downloaded 1949.tar.gz (11,238,468 bytes)


Downloading files:  22%|██▏       | 21/97 [00:56<05:30,  4.35s/it]

✓ Downloaded 1950.tar.gz (12,583,411 bytes)


Downloading files:  23%|██▎       | 22/97 [01:03<06:18,  5.05s/it]

✓ Downloaded 1951.tar.gz (12,793,321 bytes)


Downloading files:  24%|██▎       | 23/97 [01:13<08:12,  6.66s/it]

✓ Downloaded 1952.tar.gz (14,238,115 bytes)


Downloading files:  25%|██▍       | 24/97 [01:29<11:20,  9.32s/it]

✓ Downloaded 1953.tar.gz (15,321,684 bytes)


Downloading files:  26%|██▌       | 25/97 [01:34<09:52,  8.23s/it]

✓ Downloaded 1954.tar.gz (16,238,778 bytes)


Downloading files:  27%|██▋       | 26/97 [01:50<12:30, 10.56s/it]

✓ Downloaded 1955.tar.gz (15,661,402 bytes)


Downloading files:  28%|██▊       | 27/97 [01:55<10:12,  8.75s/it]

✓ Downloaded 1956.tar.gz (16,098,968 bytes)


Downloading files:  29%|██▉       | 28/97 [02:04<10:04,  8.76s/it]

✓ Downloaded 1957.tar.gz (20,361,737 bytes)


Downloading files:  30%|██▉       | 29/97 [02:17<11:19,  9.99s/it]

✓ Downloaded 1958.tar.gz (21,753,447 bytes)


Downloading files:  31%|███       | 30/97 [02:30<12:16, 10.99s/it]

✓ Downloaded 1959.tar.gz (21,831,116 bytes)


Downloading files:  32%|███▏      | 31/97 [02:48<14:33, 13.23s/it]

✓ Downloaded 1960.tar.gz (22,436,888 bytes)


Downloading files:  33%|███▎      | 32/97 [03:11<17:14, 15.92s/it]

✓ Downloaded 1961.tar.gz (23,444,550 bytes)


Downloading files:  34%|███▍      | 33/97 [03:27<17:13, 16.14s/it]

✓ Downloaded 1962.tar.gz (23,930,752 bytes)


Downloading files:  35%|███▌      | 34/97 [03:41<16:16, 15.49s/it]

✓ Downloaded 1963.tar.gz (23,697,312 bytes)


Downloading files:  36%|███▌      | 35/97 [04:03<18:05, 17.50s/it]

✓ Downloaded 1964.tar.gz (20,483,736 bytes)


Downloading files:  37%|███▋      | 36/97 [04:12<14:55, 14.68s/it]

✓ Downloaded 1965.tar.gz (16,080,034 bytes)


Downloading files:  38%|███▊      | 37/97 [04:23<13:37, 13.63s/it]

✓ Downloaded 1966.tar.gz (16,607,093 bytes)


Downloading files:  39%|███▉      | 38/97 [04:34<12:42, 12.93s/it]

✓ Downloaded 1967.tar.gz (16,561,959 bytes)


Downloading files:  40%|████      | 39/97 [04:43<11:16, 11.66s/it]

✓ Downloaded 1968.tar.gz (15,241,122 bytes)


Downloading files:  41%|████      | 40/97 [04:55<11:13, 11.82s/it]

✓ Downloaded 1969.tar.gz (22,994,174 bytes)


Downloading files:  42%|████▏     | 41/97 [05:02<09:49, 10.52s/it]

✓ Downloaded 1970.tar.gz (22,454,179 bytes)


Downloading files:  43%|████▎     | 42/97 [05:21<11:50, 12.91s/it]

✓ Downloaded 1971.tar.gz (12,950,563 bytes)


Downloading files:  44%|████▍     | 43/97 [05:27<09:41, 10.77s/it]

✓ Downloaded 1972.tar.gz (5,107,257 bytes)


Downloading files:  45%|████▌     | 44/97 [05:30<07:30,  8.49s/it]

✓ Downloaded 1973.tar.gz (54,798,538 bytes)


Downloading files:  46%|████▋     | 45/97 [06:10<15:36, 18.02s/it]

✓ Downloaded 1974.tar.gz (55,493,864 bytes)


Downloading files:  47%|████▋     | 46/97 [06:49<20:39, 24.31s/it]

✓ Downloaded 1975.tar.gz (57,881,367 bytes)


Downloading files:  48%|████▊     | 47/97 [07:11<19:33, 23.46s/it]

✓ Downloaded 1976.tar.gz (59,605,652 bytes)


Downloading files:  49%|████▉     | 48/97 [07:51<23:21, 28.61s/it]

✓ Downloaded 1977.tar.gz (59,455,167 bytes)


Downloading files:  51%|█████     | 49/97 [08:05<19:22, 24.22s/it]

✓ Downloaded 1978.tar.gz (59,830,672 bytes)


Downloading files:  52%|█████▏    | 50/97 [08:38<21:01, 26.85s/it]

✓ Downloaded 1979.tar.gz (60,501,159 bytes)


Downloading files:  53%|█████▎    | 51/97 [09:06<20:46, 27.09s/it]

✓ Downloaded 1980.tar.gz (59,682,285 bytes)


Downloading files:  54%|█████▎    | 52/97 [09:30<19:36, 26.14s/it]

✓ Downloaded 1981.tar.gz (60,480,538 bytes)


Downloading files:  55%|█████▍    | 53/97 [09:53<18:36, 25.38s/it]

✓ Downloaded 1982.tar.gz (60,495,023 bytes)


Downloading files:  56%|█████▌    | 54/97 [10:25<19:32, 27.27s/it]

✓ Downloaded 1983.tar.gz (62,763,284 bytes)


Downloading files:  57%|█████▋    | 55/97 [11:16<24:10, 34.53s/it]

✓ Downloaded 1984.tar.gz (65,105,200 bytes)


Downloading files:  58%|█████▊    | 56/97 [11:30<19:20, 28.30s/it]

✓ Downloaded 1985.tar.gz (66,624,606 bytes)


Downloading files:  59%|█████▉    | 57/97 [11:47<16:35, 24.90s/it]

✓ Downloaded 1986.tar.gz (67,703,695 bytes)


Downloading files:  60%|█████▉    | 58/97 [12:04<14:34, 22.43s/it]

✓ Downloaded 1987.tar.gz (69,970,080 bytes)


Downloading files:  61%|██████    | 59/97 [12:26<14:06, 22.28s/it]

✓ Downloaded 1988.tar.gz (71,519,264 bytes)


Downloading files:  62%|██████▏   | 60/97 [13:08<17:30, 28.39s/it]

✓ Downloaded 1989.tar.gz (71,766,358 bytes)


Downloading files:  63%|██████▎   | 61/97 [13:46<18:38, 31.07s/it]

✓ Downloaded 1990.tar.gz (74,271,561 bytes)


Downloading files:  64%|██████▍   | 62/97 [14:36<21:29, 36.85s/it]

✓ Downloaded 1991.tar.gz (72,663,782 bytes)


Downloading files:  65%|██████▍   | 63/97 [14:55<17:50, 31.48s/it]

✓ Downloaded 1992.tar.gz (71,497,082 bytes)


Downloading files:  66%|██████▌   | 64/97 [15:52<21:31, 39.14s/it]

✓ Downloaded 1993.tar.gz (72,339,659 bytes)


Downloading files:  67%|██████▋   | 65/97 [16:12<17:47, 33.36s/it]

✓ Downloaded 1994.tar.gz (73,075,863 bytes)


Downloading files:  68%|██████▊   | 66/97 [16:32<15:12, 29.43s/it]

✓ Downloaded 1995.tar.gz (71,265,290 bytes)


Downloading files:  69%|██████▉   | 67/97 [17:06<15:20, 30.68s/it]

✓ Downloaded 1996.tar.gz (70,480,498 bytes)


Downloading files:  70%|███████   | 68/97 [17:33<14:18, 29.59s/it]

✓ Downloaded 1997.tar.gz (70,866,194 bytes)


Downloading files:  71%|███████   | 69/97 [18:14<15:25, 33.07s/it]

✓ Downloaded 1998.tar.gz (70,838,312 bytes)


Downloading files:  72%|███████▏  | 70/97 [18:36<13:21, 29.68s/it]

✓ Downloaded 1999.tar.gz (71,138,768 bytes)


Downloading files:  73%|███████▎  | 71/97 [19:05<12:45, 29.43s/it]

✓ Downloaded 2000.tar.gz (69,525,720 bytes)


Downloading files:  74%|███████▍  | 72/97 [19:54<14:42, 35.29s/it]

✓ Downloaded 2001.tar.gz (73,027,824 bytes)


Downloading files:  75%|███████▌  | 73/97 [20:12<12:06, 30.26s/it]

✓ Downloaded 2002.tar.gz (76,706,165 bytes)


Downloading files:  76%|███████▋  | 74/97 [20:33<10:28, 27.31s/it]

✓ Downloaded 2003.tar.gz (77,548,308 bytes)


Downloading files:  77%|███████▋  | 75/97 [21:03<10:20, 28.22s/it]

✓ Downloaded 2004.tar.gz (80,442,642 bytes)


Downloading files:  78%|███████▊  | 76/97 [21:21<08:51, 25.29s/it]

✓ Downloaded 2005.tar.gz (86,083,238 bytes)


Downloading files:  79%|███████▉  | 77/97 [21:43<08:01, 24.09s/it]

✓ Downloaded 2006.tar.gz (83,307,684 bytes)


Downloading files:  80%|████████  | 78/97 [22:02<07:10, 22.68s/it]

✓ Downloaded 2007.tar.gz (84,873,681 bytes)


Downloading files:  81%|████████▏ | 79/97 [22:54<09:27, 31.50s/it]

✓ Downloaded 2008.tar.gz (90,018,526 bytes)


Downloading files:  82%|████████▏ | 80/97 [23:19<08:23, 29.64s/it]

✓ Downloaded 2009.tar.gz (92,844,436 bytes)


Downloading files:  84%|████████▎ | 81/97 [23:40<07:12, 27.03s/it]

✓ Downloaded 2010.tar.gz (95,097,389 bytes)


Downloading files:  85%|████████▍ | 82/97 [24:13<07:09, 28.60s/it]

✓ Downloaded 2011.tar.gz (95,913,480 bytes)


Downloading files:  86%|████████▌ | 83/97 [24:36<06:19, 27.09s/it]

✓ Downloaded 2012.tar.gz (100,550,463 bytes)


Downloading files:  87%|████████▋ | 84/97 [25:53<09:05, 41.93s/it]

✓ Downloaded 2013.tar.gz (101,641,860 bytes)


Downloading files:  88%|████████▊ | 85/97 [26:22<07:37, 38.15s/it]

✓ Downloaded 2014.tar.gz (103,817,011 bytes)


Downloading files:  89%|████████▊ | 86/97 [26:52<06:32, 35.64s/it]

✓ Downloaded 2015.tar.gz (106,565,202 bytes)


Downloading files:  90%|████████▉ | 87/97 [28:06<07:51, 47.16s/it]

✓ Downloaded 2016.tar.gz (107,377,575 bytes)


Downloading files:  91%|█████████ | 88/97 [29:09<07:48, 52.00s/it]

✓ Downloaded 2017.tar.gz (110,934,955 bytes)


Downloading files:  92%|█████████▏| 89/97 [30:50<08:52, 66.50s/it]

✓ Downloaded 2018.tar.gz (110,326,569 bytes)


Downloading files:  93%|█████████▎| 90/97 [31:34<06:58, 59.76s/it]

✓ Downloaded 2019.tar.gz (110,816,319 bytes)


Downloading files:  94%|█████████▍| 91/97 [32:07<05:10, 51.79s/it]

✓ Downloaded 2020.tar.gz (110,079,278 bytes)


Downloading files:  95%|█████████▍| 92/97 [33:30<05:06, 61.20s/it]

✓ Downloaded 2021.tar.gz (108,057,900 bytes)


Downloading files:  96%|█████████▌| 93/97 [35:16<04:59, 74.78s/it]

✓ Downloaded 2022.tar.gz (107,099,789 bytes)


Downloading files:  97%|█████████▋| 94/97 [35:54<03:11, 63.73s/it]

✓ Downloaded 2023.tar.gz (108,776,234 bytes)


Downloading files:  98%|█████████▊| 95/97 [37:17<02:18, 69.43s/it]

✓ Downloaded 2024.tar.gz (106,289,853 bytes)


Downloading files:  99%|█████████▉| 96/97 [38:16<01:06, 66.24s/it]

✓ Downloaded 2025.tar.gz (70,035,141 bytes)


Downloading files: 100%|██████████| 97/97 [38:50<00:00, 24.03s/it]


DOWNLOAD SUMMARY
Successful downloads: 97
Failed downloads: 0
Total downloaded size: 4.39 GB





In [2]:
import tarfile
from pathlib import Path
from tqdm import tqdm
import shutil

INPUT_DIR = Path("noaa_gsod_data")
OUTPUT_DIR = Path("noaa_gsod_extracted")

OUTPUT_DIR.mkdir(exist_ok=True)

def extract_all_targz():
    """Extract all tar.gz files."""
    tar_files = list(INPUT_DIR.glob("*.tar.gz"))
    
    for tar_path in tqdm(tar_files, desc="Extracting archives"):
        year = tar_path.stem  # Gets filename without .tar.gz
        year_dir = OUTPUT_DIR / year
        
        # Skip if already extracted
        if year_dir.exists() and any(year_dir.iterdir()):
            print(f"⏭️  Skipping {year} (already extracted)")
            continue
        
        year_dir.mkdir(exist_ok=True)
        
        try:
            with tarfile.open(tar_path, 'r:gz') as tar:
                tar.extractall(path=year_dir)
            print(f"✓ Extracted {year}")
        except Exception as e:
            print(f"✗ Failed to extract {year}: {e}")

# Extract all files
extract_all_targz()

print("\nExtraction complete!")
print(f"All CSV files are now in: {OUTPUT_DIR.absolute()}")

  tar.extractall(path=year_dir)
Extracting archives:   3%|▎         | 3/97 [00:00<00:03, 29.78it/s]

✓ Extracted 1929.tar
✓ Extracted 1930.tar
✓ Extracted 1931.tar
✓ Extracted 1932.tar


Extracting archives:   6%|▌         | 6/97 [00:00<00:07, 12.65it/s]

✓ Extracted 1933.tar
✓ Extracted 1934.tar
✓ Extracted 1935.tar


Extracting archives:   8%|▊         | 8/97 [00:01<00:15,  5.92it/s]

✓ Extracted 1936.tar
✓ Extracted 1937.tar


Extracting archives:  10%|█         | 10/97 [00:01<00:20,  4.30it/s]

✓ Extracted 1938.tar


Extracting archives:  11%|█▏        | 11/97 [00:02<00:22,  3.77it/s]

✓ Extracted 1939.tar


Extracting archives:  12%|█▏        | 12/97 [00:02<00:27,  3.14it/s]

✓ Extracted 1940.tar


Extracting archives:  13%|█▎        | 13/97 [00:03<00:31,  2.71it/s]

✓ Extracted 1941.tar


Extracting archives:  14%|█▍        | 14/97 [00:04<00:40,  2.04it/s]

✓ Extracted 1942.tar


Extracting archives:  15%|█▌        | 15/97 [00:05<00:54,  1.50it/s]

✓ Extracted 1943.tar


Extracting archives:  16%|█▋        | 16/97 [00:06<01:07,  1.20it/s]

✓ Extracted 1944.tar


Extracting archives:  18%|█▊        | 17/97 [00:08<01:21,  1.02s/it]

✓ Extracted 1945.tar


Extracting archives:  19%|█▊        | 18/97 [00:08<01:17,  1.02it/s]

✓ Extracted 1946.tar


Extracting archives:  20%|█▉        | 19/97 [00:09<01:11,  1.09it/s]

✓ Extracted 1947.tar


Extracting archives:  21%|██        | 20/97 [00:11<01:30,  1.18s/it]

✓ Extracted 1948.tar


Extracting archives:  22%|██▏       | 21/97 [00:14<02:00,  1.59s/it]

✓ Extracted 1949.tar


Extracting archives:  23%|██▎       | 22/97 [00:16<02:28,  1.98s/it]

✓ Extracted 1950.tar


Extracting archives:  24%|██▎       | 23/97 [00:19<02:40,  2.17s/it]

✓ Extracted 1951.tar


Extracting archives:  25%|██▍       | 24/97 [00:22<02:55,  2.40s/it]

✓ Extracted 1952.tar


Extracting archives:  26%|██▌       | 25/97 [00:26<03:21,  2.81s/it]

✓ Extracted 1953.tar


Extracting archives:  27%|██▋       | 26/97 [00:29<03:32,  2.99s/it]

✓ Extracted 1954.tar


Extracting archives:  28%|██▊       | 27/97 [00:33<03:39,  3.13s/it]

✓ Extracted 1955.tar


Extracting archives:  29%|██▉       | 28/97 [00:36<03:48,  3.31s/it]

✓ Extracted 1956.tar


Extracting archives:  30%|██▉       | 29/97 [00:41<04:11,  3.70s/it]

✓ Extracted 1957.tar


Extracting archives:  31%|███       | 30/97 [00:46<04:33,  4.08s/it]

✓ Extracted 1958.tar


Extracting archives:  32%|███▏      | 31/97 [00:51<04:51,  4.42s/it]

✓ Extracted 1959.tar


Extracting archives:  33%|███▎      | 32/97 [00:56<05:00,  4.62s/it]

✓ Extracted 1960.tar


Extracting archives:  34%|███▍      | 33/97 [01:02<05:11,  4.87s/it]

✓ Extracted 1961.tar


Extracting archives:  35%|███▌      | 34/97 [01:07<05:19,  5.07s/it]

✓ Extracted 1962.tar


Extracting archives:  36%|███▌      | 35/97 [01:12<05:12,  5.04s/it]

✓ Extracted 1963.tar


Extracting archives:  37%|███▋      | 36/97 [01:16<04:46,  4.70s/it]

✓ Extracted 1964.tar


Extracting archives:  38%|███▊      | 37/97 [01:19<04:16,  4.27s/it]

✓ Extracted 1965.tar


Extracting archives:  39%|███▉      | 38/97 [01:23<03:54,  3.97s/it]

✓ Extracted 1966.tar


Extracting archives:  40%|████      | 39/97 [01:26<03:35,  3.72s/it]

✓ Extracted 1967.tar


Extracting archives:  41%|████      | 40/97 [01:29<03:19,  3.50s/it]

✓ Extracted 1968.tar


Extracting archives:  42%|████▏     | 41/97 [01:33<03:29,  3.74s/it]

✓ Extracted 1969.tar


Extracting archives:  43%|████▎     | 42/97 [01:37<03:34,  3.90s/it]

✓ Extracted 1970.tar


Extracting archives:  44%|████▍     | 43/97 [01:41<03:21,  3.74s/it]

✓ Extracted 1971.tar


Extracting archives:  45%|████▌     | 44/97 [01:42<02:33,  2.89s/it]

✓ Extracted 1972.tar


Extracting archives:  46%|████▋     | 45/97 [01:53<04:42,  5.44s/it]

✓ Extracted 1973.tar


Extracting archives:  47%|████▋     | 46/97 [02:04<06:07,  7.21s/it]

✓ Extracted 1974.tar


Extracting archives:  48%|████▊     | 47/97 [02:16<07:09,  8.58s/it]

✓ Extracted 1975.tar


Extracting archives:  49%|████▉     | 48/97 [02:29<07:59,  9.78s/it]

✓ Extracted 1976.tar


Extracting archives:  51%|█████     | 49/97 [02:43<08:54, 11.14s/it]

✓ Extracted 1977.tar


Extracting archives:  52%|█████▏    | 50/97 [02:56<09:08, 11.67s/it]

✓ Extracted 1978.tar


Extracting archives:  53%|█████▎    | 51/97 [03:09<09:17, 12.12s/it]

✓ Extracted 1979.tar


Extracting archives:  54%|█████▎    | 52/97 [03:22<09:13, 12.31s/it]

✓ Extracted 1980.tar


Extracting archives:  55%|█████▍    | 53/97 [03:35<09:06, 12.41s/it]

✓ Extracted 1981.tar


Extracting archives:  56%|█████▌    | 54/97 [03:47<08:49, 12.30s/it]

✓ Extracted 1982.tar


Extracting archives:  57%|█████▋    | 55/97 [03:59<08:42, 12.45s/it]

✓ Extracted 1983.tar


Extracting archives:  58%|█████▊    | 56/97 [04:12<08:32, 12.51s/it]

✓ Extracted 1984.tar


Extracting archives:  59%|█████▉    | 57/97 [04:25<08:32, 12.80s/it]

✓ Extracted 1985.tar


Extracting archives:  60%|█████▉    | 58/97 [04:39<08:25, 12.97s/it]

✓ Extracted 1986.tar


Extracting archives:  61%|██████    | 59/97 [04:53<08:22, 13.23s/it]

✓ Extracted 1987.tar


Extracting archives:  62%|██████▏   | 60/97 [05:06<08:13, 13.33s/it]

✓ Extracted 1988.tar


Extracting archives:  63%|██████▎   | 61/97 [05:22<08:22, 13.97s/it]

✓ Extracted 1989.tar


Extracting archives:  64%|██████▍   | 62/97 [05:36<08:16, 14.19s/it]

✓ Extracted 1990.tar


Extracting archives:  65%|██████▍   | 63/97 [05:50<07:58, 14.08s/it]

✓ Extracted 1991.tar


Extracting archives:  66%|██████▌   | 64/97 [06:04<07:39, 13.93s/it]

✓ Extracted 1992.tar


Extracting archives:  67%|██████▋   | 65/97 [06:17<07:23, 13.86s/it]

✓ Extracted 1993.tar


Extracting archives:  68%|██████▊   | 66/97 [06:31<07:05, 13.72s/it]

✓ Extracted 1994.tar


Extracting archives:  69%|██████▉   | 67/97 [06:44<06:46, 13.56s/it]

✓ Extracted 1995.tar


Extracting archives:  70%|███████   | 68/97 [06:57<06:31, 13.51s/it]

✓ Extracted 1996.tar


Extracting archives:  71%|███████   | 69/97 [07:11<06:19, 13.55s/it]

✓ Extracted 1997.tar


Extracting archives:  72%|███████▏  | 70/97 [07:24<06:02, 13.41s/it]

✓ Extracted 1998.tar


Extracting archives:  73%|███████▎  | 71/97 [07:38<05:49, 13.44s/it]

✓ Extracted 1999.tar


Extracting archives:  74%|███████▍  | 72/97 [07:51<05:35, 13.42s/it]

✓ Extracted 2000.tar


Extracting archives:  75%|███████▌  | 73/97 [08:05<05:25, 13.55s/it]

✓ Extracted 2001.tar


Extracting archives:  76%|███████▋  | 74/97 [08:19<05:16, 13.74s/it]

✓ Extracted 2002.tar


Extracting archives:  77%|███████▋  | 75/97 [08:33<05:04, 13.86s/it]

✓ Extracted 2003.tar


Extracting archives:  78%|███████▊  | 76/97 [08:48<04:57, 14.16s/it]

✓ Extracted 2004.tar


Extracting archives:  79%|███████▉  | 77/97 [09:04<04:54, 14.75s/it]

✓ Extracted 2005.tar


Extracting archives:  80%|████████  | 78/97 [09:20<04:47, 15.11s/it]

✓ Extracted 2006.tar


Extracting archives:  81%|████████▏ | 79/97 [09:36<04:34, 15.23s/it]

✓ Extracted 2007.tar


Extracting archives:  82%|████████▏ | 80/97 [09:53<04:27, 15.75s/it]

✓ Extracted 2008.tar


Extracting archives:  84%|████████▎ | 81/97 [10:11<04:24, 16.52s/it]

✓ Extracted 2009.tar


Extracting archives:  85%|████████▍ | 82/97 [10:29<04:15, 17.02s/it]

✓ Extracted 2010.tar


Extracting archives:  86%|████████▌ | 83/97 [10:48<04:04, 17.49s/it]

✓ Extracted 2011.tar


Extracting archives:  87%|████████▋ | 84/97 [11:08<03:57, 18.24s/it]

✓ Extracted 2012.tar


Extracting archives:  88%|████████▊ | 85/97 [11:28<03:44, 18.73s/it]

✓ Extracted 2013.tar


Extracting archives:  89%|████████▊ | 86/97 [11:49<03:34, 19.52s/it]

✓ Extracted 2014.tar


Extracting archives:  90%|████████▉ | 87/97 [12:11<03:21, 20.17s/it]

✓ Extracted 2015.tar


Extracting archives:  91%|█████████ | 88/97 [12:31<03:02, 20.31s/it]

✓ Extracted 2016.tar


Extracting archives:  92%|█████████▏| 89/97 [12:47<02:30, 18.78s/it]

✗ Failed to extract 2017.tar: [Errno 28] No space left on device


Extracting archives:  95%|█████████▍| 92/97 [12:47<00:35,  7.20s/it]

✗ Failed to extract 2018.tar: [Errno 28] No space left on device
✗ Failed to extract 2019.tar: [Errno 28] No space left on device
✗ Failed to extract 2020.tar: [Errno 28] No space left on device


Extracting archives:  97%|█████████▋| 94/97 [12:47<00:24,  8.17s/it]


✗ Failed to extract 2021.tar: [Errno 28] No space left on device
✗ Failed to extract 2022.tar: [Errno 28] No space left on device


OSError: [Errno 28] No space left on device: 'noaa_gsod_extracted/2023.tar'

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder \
    .appName("NOAA GSOD") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Read all CSVs (Spark will auto-parallelize)
df = spark.read.csv(
    "noaa_gsod_extracted/*/*.csv",
    header=True,
    inferSchema=True
)

# Save as Parquet (much faster for analysis)
df.write.parquet(
    "noaa_gsod_parquet",
    mode="overwrite",
    compression="snappy"
)

print(f"Total records: {df.count()}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/11 12:10:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/11 12:10:20 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: noaa_gsod_extracted/*/*.csv.
java.io.FileNotFoundException: File noaa_gsod_extracted/*/*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.F

Py4JError: An error occurred while calling o32.csv

25/12/11 12:12:29 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.