In [1]:
import os
import shutil
import glob
import json
import hashlib
from pathlib import Path
from collections import defaultdict
import re

def collect_notebooks():
    """
    First collect all notebooks from subdirectories into a single collection folder
    """
    new_folder = "all_notebooks"
    os.makedirs(new_folder, exist_ok=True)
    
    # Search all .ipynb files recursively from current directory
    notebooks = glob.glob("**/*.ipynb", recursive=True)
    
    print(f"Collecting {len(notebooks)} notebooks into '{new_folder}' folder...")
    
    collected_notebooks = []
    
    for i, notebook_path in enumerate(notebooks, 1):
        filename = os.path.basename(notebook_path)
        folder_name = os.path.basename(os.path.dirname(notebook_path))
        
        # Skip if already in collection folder
        if new_folder in notebook_path:
            continue
            
        if folder_name:
            unique_filename = f"{folder_name}_{filename}"
        else:
            unique_filename = filename
        destination = os.path.join(new_folder, unique_filename)
        
        try:
            shutil.copy2(notebook_path, destination)
            print(f"   {i:3d}: {unique_filename}")
            collected_notebooks.append(unique_filename)
        except Exception as e:
            print(f"   Error copying {notebook_path}: {e}")
    
    print(f"\nAll notebooks collected to: {os.path.abspath(new_folder)}")
    print(f"Total notebooks collected: {len(collected_notebooks)}")
    
    return new_folder, len(collected_notebooks)

def get_notebook_content_hash(notebook_path):
    try:
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook_data = json.load(f)
        
        content_to_hash = {
            'cells': [],
            'metadata': notebook_data.get('metadata', {}).get('kernelspec', {})
        }
        
        for cell in notebook_data.get('cells', []):
            cell_content = {
                'cell_type': cell.get('cell_type'),
                'source': cell.get('source', [])
            }
            content_to_hash['cells'].append(cell_content)
        
        content_str = json.dumps(content_to_hash, sort_keys=True)
        return hashlib.md5(content_str.encode()).hexdigest()
    
    except Exception as e:
        print(f"Warning: Could not read {notebook_path}: {e}")
        return None

def clean_notebook_name(filename):
    name = filename.replace('.ipynb', '')
    
    prefixes_to_remove = [
        r'^[a-z]_',
        r'^\d+_',
        r'^\d+[a-z]_',
        r'^[a-z]\d+_',
        r'^docs_',
        r'^intro_',
    ]
    
    for prefix_pattern in prefixes_to_remove:
        name = re.sub(prefix_pattern, '', name)
    
    return name

def categorize_notebook(filename):
    """
    Categorize notebook based on filename keywords following PyGIS.io structure
    """
    name_lower = filename.lower()
    
    categories = {
        0: {
            "name": "0_Get_Started_in_Spatial_Python",
            "keywords": ["a_intro", "intro", "introduction", "getting_started", "setup", "install", 
                        "environment", "welcome", "basic", "begin", "start", "first", "tutorial", 
                        "hello", "about_py", "b_about_py", "b_intro_py", "learn_more", 
                        "python_by_example", "get_started"]
        },
        1: {
            "name": "1_Spatial_Data_Types_in_Python",
            "keywords": ["c_features", "c_vectors", "c_new_vectors", "c_store", "spatial_data", 
                        "vector", "data_types", "shapefile", "geojson", "format", "geopandas", 
                        "dataframe", "geometry", "point", "line", "polygon", "geodata",
                        "features", "store_features", "geometries", "spatial_objects"]
        },
        2: {
            "name": "2_Nature_of_Coordinate_Systems_in_Python",
            "keywords": ["d_crs", "d_understand_crs", "d_affine", "coordinate", "crs", 
                        "projection", "transform", "reproject", "epsg", "proj4", "utm", 
                        "geographic", "projected", "datum", "affine", "nature_of_coordinate"]
        },
        3: {
            "name": "3_Vector_Operations_in_Python",
            "keywords": ["e_attributes", "e_extraction", "e_vector_merge", "e_dissolve", 
                        "e_summarize", "e_interpolation", "buffer", "overlay", "join", 
                        "spatial_join", "dissolve", "merge", "clip", "intersection", "union", 
                        "difference", "proximity", "nearest", "neighbor", "density", 
                        "interpolation", "attribute", "select", "query", "extraction", 
                        "vector_operations", "attributes", "indexing"]
        },
        4: {
            "name": "4_Raster_Operations_in_Python",
            "keywords": ["e_new_rasters", "rasterio", "gdal", "raster", "band", "pixel", 
                        "resample", "reproject_raster", "band_math", "rasterize", "window", 
                        "tiff", "geotiff", "dem", "elevation", "slope", "aspect", 
                        "raster_operations", "new_rasters"]
        },
        5: {
            "name": "5_Accessing_OSM_Census_Data_in_Python",
            "keywords": ["d_access_osm", "osm", "openstreetmap", "census", "acs", "api", 
                        "download", "fetch", "overpass", "osmnx", "census_data", "demographic", 
                        "population", "accessing_osm", "accessing_census"]
        },
        6: {
            "name": "6_Remote_Sensing_in_Python",
            "keywords": ["f_rs_", "rs_", "remote_sensing", "satellite", "landsat", "sentinel", 
                        "modis", "ndvi", "imagery", "spectral", "band_ratio", "vegetation", 
                        "index", "classification", "machine_learning", "ml", "prediction", 
                        "supervised", "unsupervised", "geowombat", "rs_io", "rs_config", 
                        "rs_edit", "rs_plot", "rs_crs", "rs_extraction", "rs_mosaic", 
                        "rs_task", "rs_common_task", "rs_band_math", "rs_ml_predict"]
        }
    }
    
    scores = {}
    for cat_id, cat_info in categories.items():
        score = 0
        matched_keywords = []
        
        for keyword in cat_info["keywords"]:
            if keyword in name_lower:
                if name_lower.startswith(keyword) or f"_{keyword}" in name_lower:
                    score += 3
                else:
                    score += 1
                matched_keywords.append(keyword)
        
        scores[cat_id] = {
            "score": score,
            "name": cat_info["name"],
            "matched_keywords": matched_keywords
        }
    
    best_category = max(scores.items(), key=lambda x: x[1]["score"])
    
    if best_category[1]["score"] > 0:
        return best_category[0], best_category[1]["name"], best_category[1]["score"], best_category[1]["matched_keywords"]
    else:
        return None, "Uncategorized", 0, []

def create_folder_structure():
    """Create the main folder structure based on PyGIS.io organization"""
    folders = [
        "0_Get_Started_in_Spatial_Python",
        "1_Spatial_Data_Types_in_Python", 
        "2_Nature_of_Coordinate_Systems_in_Python",
        "3_Vector_Operations_in_Python",
        "4_Raster_Operations_in_Python",
        "5_Accessing_OSM_Census_Data_in_Python",
        "6_Remote_Sensing_in_Python"
    ]
    
    for folder in folders:
        os.makedirs(folder, exist_ok=True)
        print(f"Created folder: {folder}")
    
    os.makedirs("Uncategorized", exist_ok=True)
    print("Created folder: Uncategorized")
    
    return folders

def organize_notebooks_by_content(source_folder="all_notebooks"):
    """
    Organize notebooks by content analysis and remove duplicates
    """
    if not os.path.exists(source_folder):
        print(f"Error: Source folder '{source_folder}' not found!")
        return
    
    folders = create_folder_structure()
    notebooks = glob.glob(os.path.join(source_folder, "*.ipynb"))
    
    print(f"\nAnalyzing {len(notebooks)} notebooks for content and duplicates...")
    
    # Group notebooks by content hash
    hash_groups = defaultdict(list)
    notebook_info = []
    
    for notebook_path in notebooks:
        filename = os.path.basename(notebook_path)
        content_hash = get_notebook_content_hash(notebook_path)
        
        info = {
            'filename': filename,
            'path': notebook_path,
            'content_hash': content_hash,
            'size': os.path.getsize(notebook_path)
        }
        
        notebook_info.append(info)
        if content_hash:
            hash_groups[content_hash].append(info)
    
    # Select best notebooks (remove duplicates)
    selected_notebooks = []
    duplicate_count = 0
    
    for content_hash, notebooks_with_same_content in hash_groups.items():
        if len(notebooks_with_same_content) > 1:
            # Multiple notebooks with same content - select the best one
            best_notebook = max(notebooks_with_same_content, key=lambda x: x['size'])
            selected_notebooks.append(best_notebook)
            duplicate_count += len(notebooks_with_same_content) - 1
            
            print(f"Found {len(notebooks_with_same_content)} duplicates, selected: {best_notebook['filename']}")
            for nb in notebooks_with_same_content:
                if nb != best_notebook:
                    print(f"  Skipping duplicate: {nb['filename']}")
        else:
            selected_notebooks.append(notebooks_with_same_content[0])
    
    print(f"\nSelected {len(selected_notebooks)} unique notebooks (removed {duplicate_count} duplicates)")
    
    # Categorize and organize selected notebooks
    categorization_results = {
        0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], "uncategorized": []
    }
    
    for notebook_info in selected_notebooks:
        filename = notebook_info['filename']
        cat_id, cat_folder, score, matched_keywords = categorize_notebook(filename)
        
        if cat_id is not None:
            destination_folder = cat_folder
            categorization_results[cat_id].append({
                "filename": filename,
                "score": score,
                "keywords": matched_keywords
            })
        else:
            destination_folder = "Uncategorized"
            categorization_results["uncategorized"].append({
                "filename": filename,
                "score": score,
                "keywords": matched_keywords
            })
        
        destination_path = os.path.join(destination_folder, filename)
        
        try:
            shutil.copy2(notebook_info['path'], destination_path)
            print(f"{filename} -> {destination_folder}")
            if matched_keywords:
                print(f"  Matched keywords: {', '.join(matched_keywords)}")
        except Exception as e:
            print(f"Error copying {filename}: {e}")
    
    # Print summary
    print(f"\nORGANIZATION SUMMARY")
    print("=" * 50)
    
    folder_descriptions = [
        "0_Get_Started_in_Spatial_Python: Introduction, setup, and Python basics",
        "1_Spatial_Data_Types_in_Python: Vector data, geometries, and spatial data structures", 
        "2_Nature_of_Coordinate_Systems_in_Python: CRS, projections, and coordinate transformations",
        "3_Vector_Operations_in_Python: Vector analysis, spatial joins, and geoprocessing",
        "4_Raster_Operations_in_Python: Raster data manipulation and analysis",
        "5_Accessing_OSM_Census_Data_in_Python: OpenStreetMap and Census data access",
        "6_Remote_Sensing_in_Python: Satellite imagery and remote sensing workflows"
    ]
    
    for i, description in enumerate(folder_descriptions):
        count = len(categorization_results[i])
        print(f"{description}: {count} notebooks")
    
    uncategorized_count = len(categorization_results["uncategorized"])
    print(f"Uncategorized: {uncategorized_count} notebooks")
    
    if uncategorized_count > 0:
        print(f"\nUNCATEGORIZED NOTEBOOKS:")
        for item in categorization_results["uncategorized"]:
            print(f"  {item['filename']}")

def create_index_file(output_dir="all_notebooks"):
    """Create an index file listing all notebooks by category"""
    folders = [
        "0_Get_Started_in_Spatial_Python",
        "1_Spatial_Data_Types_in_Python", 
        "2_Nature_of_Coordinate_Systems_in_Python",
        "3_Vector_Operations_in_Python",
        "4_Raster_Operations_in_Python",
        "5_Accessing_OSM_Census_Data_in_Python",
        "6_Remote_Sensing_in_Python",
        "Uncategorized"
    ]
    
    index_file = "NOTEBOOK_INDEX.md"
    
    with open(index_file, 'w', encoding='utf-8') as f:
        f.write("# PyGIS Notebook Collection Index\n\n")
        f.write("This directory contains organized Jupyter notebooks from the pyGIS repository.\n\n")
        
        total_notebooks = 0
        for folder in folders:
            if os.path.exists(folder):
                notebooks = glob.glob(os.path.join(folder, "*.ipynb"))
                if notebooks:
                    f.write(f"## {folder}\n\n")
                    for notebook in sorted(notebooks):
                        notebook_name = os.path.basename(notebook)
                        f.write(f"- [{notebook_name}](./{folder}/{notebook_name})\n")
                        total_notebooks += 1
                    f.write("\n")
        
        f.write(f"\n**Total notebooks: {total_notebooks}**\n")
        f.write("\n---\n")
        f.write("*Generated by PyGIS Notebook Collection Manager*\n")
    
    print(f"Created index file: {index_file}")

def main():
    print("PyGIS Notebook Collection Manager")
    print("=" * 50)
    
    print("\nSTEP 1: COLLECTING NOTEBOOKS")
    print("=" * 50)
    source_folder, notebook_count = collect_notebooks()
    
    if notebook_count == 0:
        print("No notebooks found to organize!")
        return
        
    print(f"\nSTEP 2: ORGANIZING {notebook_count} NOTEBOOKS")
    print("=" * 50)
    organize_notebooks_by_content(source_folder)
    
    print(f"\nSTEP 3: CREATING INDEX")
    print("=" * 50)
    create_index_file()
    
    print(f"\nDone! Check the organized folders and NOTEBOOK_INDEX.md file.")
    print(f"Original collection folder '{source_folder}' is preserved for backup.")

if __name__ == "__main__":
    main()

PyGIS Notebook Collection Manager

STEP 1: COLLECTING NOTEBOOKS
Collecting 193 notebooks into 'all_notebooks' folder...
     1: folder_management.ipynb
     2: docs_g_xr_demo.ipynb
     3: docs_test.ipynb
     4: docs_a_intro.ipynb
     5: docs_b_about_py.ipynb
     6: docs_b_conda_started.ipynb
     7: docs_b_conda_started2.ipynb
     8: docs_b_getting_started.ipynb
     9: docs_b_intro_py.ipynb
    10: docs_b_learn_more.ipynb
    11: docs_b_python_by_example.ipynb
    12: docs_c_attributes.ipynb
    13: docs_c_features.ipynb
    14: docs_c_intro_structures.ipynb
    15: docs_c_new_vectors.ipynb
    16: docs_c_rasters.ipynb
    17: docs_c_store_features.ipynb
    18: docs_c_vectors.ipynb
    19: docs_d_access_census.ipynb
    20: docs_d_access_osm.ipynb
    21: docs_d_affine.ipynb
    22: docs_d_crs_what_is_it.ipynb
    23: docs_d_exercises.ipynb
    24: docs_d_raster_crs_intro.ipynb
    25: docs_d_understand_crs_codes.ipynb
    26: docs_d_vector_crs_intro.ipynb
    27: docs_e_attribu