In [2]:
import zipfile

import matplotlib.pyplot as plt

from hda import Client

import geopandas as gpd
import pandas as pd
#from geocube.api.core import make_geocube
%run helpers.ipynb



# 1. Downloading data

In [2]:
data = {
    "datasetId": "EO:CLMS:DAT:CORINE",
    "stringChoiceValues": [
        {
            "name": "product_type",
            "value": "Corine Land Cover 2018"
        },
        {
            "name": "format",
            "value": "GeoPackage"
        }
    ]
}

In [3]:
c = Client(debug=False)

matches = c.search(data)
matches.download()

Exception: Missing/incomplete configuration file: /home/jovyan/.hdarc

In [3]:
zip_ref = zipfile.ZipFile("u2018_clc2018_v2020_20u1_geoPackage.zip")
zip_ref.extractall("./data")
zip_ref.close() # close file

# 2. Data Understanding

## 2.1 Work with Geopandas

As an alternative to the tiff format the CLC dataset is also available as in a geopackage format. 
The geopackage has the data not as a raster, but defined with polygons.
This has the advantage of not being limited by the raster resolution, but is also slower when processing.
An easy way to load the Data is with the GeoPandas library.

In [4]:
gdf = gpd.read_file("data/u2018_clc2018_v2020_20u1_geoPackage/DATA/U2018_CLC2018_V2020_20u1.gpkg")
gdf.head()

Unnamed: 0,Code_18,Remark,Area_Ha,ID,geometry
0,111,,130.863654,EU_1,"MULTIPOLYGON (((1917182.160 943608.860, 191714..."
1,111,,53.744524,EU_2,"MULTIPOLYGON (((1953122.840 950507.440, 195311..."
2,111,,30.719104,EU_3,"MULTIPOLYGON (((1956709.150 951094.550, 195669..."
3,111,,50.201782,EU_4,"MULTIPOLYGON (((1805587.500 950821.540, 180555..."
4,111,,481.848803,EU_5,"MULTIPOLYGON (((1792547.840 952643.380, 179251..."


GeoPandas stores the data in a GeoDataFrame, which is a subclass of a Pandas Dataframe, with additional methods for geospatial operations like coordinate transformations.
Each line in the dataframe contains an area defined as a shapely multipolygon, with the label code.

In [9]:
ledgend = pd.read_csv("data/u2018_clc2018_v2020_20u1_geoPackage/Legend/CLC_legend.csv", delimiter = ";")
ledgend["RGB"] = ledgend["RGB"].fillna("000-000-000")
ledgend["RGB"] = ledgend["RGB"].apply(lambda c : '#%02x%02x%02x'%tuple(int(x) for x in c.split("-"))) # convert RGB to hex
code_color = pd.Series(ledgend.RGB.values,index=ledgend.CLC_CODE).to_dict()
gdf["color"] = gdf["Code_18"].apply(lambda code: code_color[int(code)])



Plotting the complete geopackage is computationally intensive, but with GeoPandas we can easily cut a section from the data.

In [10]:
#load a GeoDataframe containg outlines of countries
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

#polygon with the outline of spain
spain_outline = world[world["name"] == "Spain"]
 
#convert the polygon to the same coordinate reference system(CRS) as the CLC data
spain_outline = spain_outline.to_crs(gdf.crs)
#Clip the CLC data with the outline of Spain
spain = gpd.clip(gdf,spain_outline)

#load colors from the legend into the GeoDataframe
spai["color"] = spain["Code_18"].apply(lambda code: code_color[int(code)])

spain.plot( color=spain['color'] )

NameError: name 'spai' is not defined

In [None]:
import matplotlib.pyplot as plt
plt.savefig("spain.svg")


In [None]:
gdf.crs

# 3. Data Preparation