In [1]:
# Attach Google Drive to this Colab runtime so files can be read from and saved to Drive.
from google.colab import drive

# Directory inside the Colab filesystem where Drive is mounted
DRIVE_MOUNT_POINT = '/content/gdrive/'

drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


In [4]:
# Importing necessary libraries for geospatial data processing and manipulation:

import geopandas as gpd           # For handling geospatial data structures and operations.
import pandas as pd               # For data manipulation and analysis.
from shapely.geometry import Point # For creating point geometries from coordinate data.
import os                         # For interacting with the operating system, e.g., managing file paths.
import requests
import zipfile

In [5]:
# Display the current working directory (a later cell changes it with os.chdir)
os.getcwd()

'/content'

In [15]:
# Directory to work from; every relative path used below is resolved against it.
drive_dir = "/content/gdrive/MyDrive"  # Change to your desired directory path
os.chdir(drive_dir)

# Data folder, relative to the working directory set above.
# Keep the trailing slash: file names are appended to this string by plain
# concatenation (e.g. path + file + ".csv").
path = "GEO-AI Challenge for Cropland Mapping by ITU/"  # Change to your desired directory path

In [19]:
# Direct download link for the World Bank "World Country Polygons - Very High Definition"
# shapefile archive. If the request fails with an HTTP error, copy the current link from
# the World Bank data catalog and replace this URL.
url = "https://datacatalogfiles.worldbank.org/ddh-published/0038272/DR0046659/wb_countries_admin0_10m.zip"

# Temporary location of the downloaded archive (deleted again below)
zip_filename = os.path.join(path, "world_country_polygons_vhd.zip")

try:
    # Stream the archive to disk in 8 KiB chunks instead of loading it into memory.
    # - timeout=(connect, read) in seconds keeps the cell from hanging forever on a
    #   stalled connection.
    # - the `with` block releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=(10, 120)) as response:
        # Raise on 4xx/5xx before anything is written to disk
        response.raise_for_status()

        with open(zip_filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    # Unzip the downloaded archive into the data folder
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(path)
finally:
    # Remove the archive whether or not the steps above succeeded, so a failed
    # run does not leave a partial or corrupt zip behind
    if os.path.exists(zip_filename):
        os.remove(zip_filename)

# Only reached when download and extraction both completed without raising
print("Shapefile downloaded and extracted successfully!")

Shapefile downloaded and extracted successfully!


In [21]:
# Dataset splits to process; each name is read from "<name>.csv" inside `path`
files = ["Train", "Test"]

# Load world countries shapefile from World Bank dataset:
# https://datacatalog.worldbank.org/search/dataset/0038272/World-Bank-Official-Boundaries
# (layer: World Country Polygons - Very High Definition)
# The archive must already be unzipped into `path`.
wb_countri_shp = gpd.read_file(path + "WB_countries_Admin0_10m/WB_countries_Admin0_10m.shp")


def _country_of(point):
    """Return the English name of the country polygon that contains a point.

    Args:
        point: Shapely geometry tested against every polygon in `wb_countri_shp`.

    Returns:
        The `NAME_EN` value of the first row of `wb_countri_shp` whose geometry
        contains `point`, or None when no polygon contains it.
    """
    # Run the containment test once per point and reuse the resulting mask
    matches = wb_countri_shp.loc[wb_countri_shp.geometry.contains(point), "NAME_EN"]
    return matches.iloc[0] if len(matches) else None


# Loop through the train and test files
for file in files:

    # Read CSV data for the current file (either Train or Test)
    file_csv = pd.read_csv(path + file + ".csv")

    # Build point geometries from the longitude/latitude columns (x = Lon, y = Lat)
    geometry = gpd.points_from_xy(file_csv["Lon"], file_csv["Lat"])

    # Convert the DataFrame to a GeoDataFrame and declare its CRS at construction;
    # the coordinates are treated as EPSG:4326 longitude/latitude
    gfile = gpd.GeoDataFrame(file_csv, geometry=geometry, crs="EPSG:4326")

    # Assign the corresponding country (or None) to each point.
    # NOTE(review): this is still one pass over all country polygons per point;
    # if it becomes too slow, a spatial join (gpd.sjoin) is the usual replacement.
    gfile["COUNTRY"] = gfile.geometry.apply(_country_of)

    # If processing the Train file, rearrange columns so 'Target' is last
    if file == "Train":
        cols = [col for col in gfile if col != 'Target'] + ['Target']
        gfile = gfile[cols]

    # Split the points into two groups: Iran + Sudan together, Afghanistan on its own
    iran_sudan_gdf = gfile[gfile['COUNTRY'].isin(['Iran', 'Sudan'])]
    afghanistan_gdf = gfile[gfile['COUNTRY'] == 'Afghanistan']

    # Save each group as a shapefile named after the current split
    iran_sudan_gdf.to_file(path + file + "_Iran_Sudan.shp")
    afghanistan_gdf.to_file(path + file + "_Afghanistan.shp")
