In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
import s3fs
import hvac
import os

# Fetch the project's S3 credentials from Vault and publish them through the
# AWS environment variables that the S3 filesystem cell reads.
client = hvac.Client(
    url='https://vault.lab.sspcloud.fr',
    token=os.environ['VAULT_TOKEN'],
)

# The secret is addressed as "<mount>/<path>"; the KV call below takes the
# mount point and the path inside the mount as two separate arguments.
secret = ''.join([os.environ['VAULT_MOUNT'], os.environ['VAULT_TOP_DIR'], '/s3'])
mount_point, secret_path = secret.split('/', 1)
secret_dict = client.secrets.kv.read_secret_version(
    path=secret_path,
    mount_point=mount_point,
)

os.environ["AWS_ACCESS_KEY_ID"] = secret_dict['data']['data']['ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_dict['data']['data']['SECRET_ACCESS_KEY']
# Remove any session token from the environment; a missing variable is fine.
os.environ.pop('AWS_SESSION_TOKEN', None)

In [4]:
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://'+'minio.lab.sspcloud.fr'},
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"]
)
fs.ls('projet-slums-detection/Donnees/RIL/')

## Petites communes

In [5]:
with fs.open('projet-slums-detection/Donnees/RIL/ril_pc_dom.csv', 'r') as f:
    df_pc = pd.read_csv(f)

Data types to specify

In [6]:
df_pc.info()

In [7]:
pd.set_option('display.max_columns', None)
df_pc.head()

### Dictionnaire des variables

In [8]:
df_pc.grp_rotation.value_counts()

- id: Identifiant du logement
- depcom: Code commune
- id_ea: Autre id
- principal: Oui ou Non, quasiment uniquement des Oui.
- numero: Numéro de la voie
- repetition: Beaucoup de NaN, signification ?
- complement: Complément d'adresse
- type_voie: Type de voie
- libelle_voie: Nom de la voie
- rivoli: Signification ?
- ea_type: HABIT, HOTEL ou COMMU
- hab_hot_type: Signification ? Quelques modalités: 002D, 002C, 002B, etc.
- nb_log: Nombre de logements
- etat: Variable entière 0 -> 6. Signification ?
- pc_num: Numéro de permis de construire
- commentaire: Commentaire sur le logement
- hab_construction_an: Année de construction
- hab_nb_imm: Variable entière 0 -> 4. Signification ?
- hab_niv: Signification ?
- hot_sous_type: Type d'hôtel
- hot_enseigne: Enseigne d'hôtel
- lien_cmt: Signification ?
- x: Coordinate
- y: Coordinate
- code_epsg: Système de projection: 4559, 2975, 4471, 2972
- type_localisation: CIC, MAN, CENT. Signification ?
- numero_parcelle: Numéro de parcelle
- code_ilot: Code d'ilot
- code_iris: Iris
- quartier_prioritaire: QP
- canton: Signification ?
- liste_insee: Que des N
- id_rp: Id qui sert à quoi ?
- dernier_tirage: 2019 6103 fois
- echantillon: Que des N.
- grp_rotation: 2, 3, 4 ou 5.
- date_maj_ea: Quasi vide
- com_anc_idft: quasi vide
- com_type: Quasi vide
- com_statut: Quasi vide
- com_nb_log_fonction: Quasi vide
- com_nom: Quasi vide
- com_date_creation_base: Quasi vide
- com_date_fermeture: Quasi vide
- com_capacite_theorique: Quasi vide
- millesime: 2020 -> 2023
- date_livraison: Date de livraison. A explorer.
- ref_ea: Signification ?
- id_rca_adresse: Signification ?
- reperage: Indication pour repérer le batiment. Intéressant
- id_ea_regroup: Quasi vide

In [9]:
# Keep only the columns explored below. .copy() makes df_pc an independent
# frame, so the column assignments in later cells (e.g. df_pc["dep"] = ...)
# do not raise SettingWithCopyWarning.
df_pc = df_pc[["id", "depcom", "ea_type", "nb_log", "hab_construction_an", "x", "y", "code_epsg", "code_ilot", "grp_rotation", "millesime", "date_livraison", "reperage"]].copy()

In [10]:
df_pc.head()

Un peu plus en détail:

In [11]:
df_pc["dep"] = df_pc["depcom"].astype(str).str[:3]

In [12]:
df_pc.dep.value_counts()

In [13]:
df_pc[df_pc.dep == "977"].depcom.value_counts()

Le code 977 ne contient que Saint-Barthélemy.

In [14]:
dep_mapping = {
    "971": "Guadeloupe",
    "972": "Martinique",
    "973": "Guyane",
    "974": "Réunion",
    "976": "Mayotte",
    "977": "Saint-Barthélémy",
}
df_pc["dep"] = df_pc["dep"].apply(lambda x: dep_mapping[x])
df_pc.head()

In [15]:
df_pc.ea_type.value_counts()

In [16]:
from matplotlib import pyplot as plt

plt.hist(df_pc["hab_construction_an"], bins=100)
plt.show()

Variable pas exploitable.

In [17]:
df_pc.code_epsg.value_counts()

In [18]:
df_pc.groupby(["dep", "code_epsg"])["id"].count()

In [19]:
dep_to_epsg = {
    "Guadeloupe": 4559,
    "Guyane": 2972,
    "Martinique": 4559,
    "Mayotte": 4471,
    "Réunion": 2975,
    "Saint-Barthélémy": 4559
}

In [20]:
df_pc.code_ilot.value_counts()

In [21]:
df_pc.grp_rotation.value_counts()

In [22]:
df_pc.groupby(["dep", "grp_rotation"])["id"].count()

Le GR 1 a été enquêté à l’EAR 2019 donc son enquête carto a eu lieu de mai à août 2018, ..., GR5 -> EAR 2023 donc enquête carto de mai à août 2022. Avec l’annulation de l’EAR 2021 sauf à Mayotte, les PC (sauf Mayotte) ont été décalées d’un GR, ainsi par exemple les PC du GR3 (EAR 2021 et enquête carto 2020) sont passées dans le GR4 et ont 2 RIL consécutifs (millesimes 2021 et 2022). Certaines PC n’ont pas de RIL. 
J’ai également mis le fichier des contours d’îlots tels qu’ils existent actuellement mais attention les contours d’îlots ont pu changer ces dernières années surtout en Guyane.

In [23]:
df_pc.millesime.value_counts()

In [24]:
df_pc[df_pc.millesime == 2020].groupby(["depcom", "grp_rotation"])["id"].count()

Comprendre pourquoi ?

In [25]:
df_pc[df_pc.millesime == 2021].groupby(["depcom", "grp_rotation"])["id"].count()

In [26]:
df_pc[df_pc.millesime == 2022].groupby(["depcom", "grp_rotation"])["id"].count()

In [27]:
df_pc[df_pc.millesime == 2023].groupby(["depcom", "grp_rotation"])["id"].count()

In [28]:
depcom_to_gr_pre_2021 = {
    97119: 2, 
    97130: 2, 
    97131: 2, 
    97132: 2, 
    97211: 2, 
    97216: 2, 
    97218: 2,
    97223: 2, 
    97313: 2, 
    97357: 2, 
    97360: 2, 
    97403: 2, 
    97417: 2, 
    97419: 5, 
    97421: 2, 
    97423: 2,
    97424: 5,
    97108: 3,
    97109: 3,
    97112: 3, 
    97126: 3, 
    97202: 3, 
    97205: 3, 
    97215: 3, 
    97217: 3,
    97219: 3,
    97303: 3,
    97361: 3,
    97406: 3,
    97613: 3
}

depcom_to_gr_post_2021 = {
    97108: 4,
    97109: 4,
    97112: 4,
    97126: 4,
    97202: 4,
    97205: 4,
    97215: 4,
    97217: 4,
    97219: 4,
    97303: 4,
    97361: 4,
    97406: 4,
    97605: 4,
    97609: 4,
    97102: 5,
    97106: 5,
    97133: 5,
    97203: 5,
    97204: 5,
    97206: 5,
    97231: 5,
    97233: 5,
    97301: 5,
    97308: 5,
    97314: 5,
    97352: 5,
    97356: 5,
    97419: 5,
    97424: 5,
    97612: 5,
    97701: 5
}

In [29]:
gr_to_validity = {
    2: ("2019-05", "2019-08"),
    3: ("2020-05", "2020-08"),
    4: ("2021-05", "2021-08"),
    5: ("2022-05", "2022-08")
}

millesime_to_validity = {
    2020: ("2019-05", "2019-08"),
    2021: ("2020-05", "2020-08"),
    2022: ("2021-05", "2021-08"),
    2023: ("2022-05", "2022-08")
}

Il faut choisir ces dates de validité

In [30]:
df_pc.code_ilot.value_counts()

In [31]:
df_pc[df_pc.millesime == 2020].groupby(["depcom", "code_ilot"])["id"].count()

Utilité des ilots pour les PC ?

## Grandes communes

In [32]:
with fs.open('projet-slums-detection/Donnees/RIL/ril_gc_dom.csv', 'r') as f:
    df_gc = pd.read_csv(f)

Data types to specify

In [33]:
df_gc.info()

In [34]:
df_gc.head()

In [35]:
df_gc["dep"] = df_gc["depcom"].astype(str).str[:3]

In [36]:
df_gc.dep.value_counts()

In [37]:
df_gc[df_gc.dep == "978"].depcom.value_counts()

Le code 978 ne contient que Saint-Martin.

In [38]:
dep_mapping = {
    "971": "Guadeloupe",
    "972": "Martinique",
    "973": "Guyane",
    "974": "Réunion",
    "976": "Mayotte",
    "978": "Saint-Martin",
}
df_gc["dep"] = df_gc["dep"].apply(lambda x: dep_mapping[x])
df_gc.head()

In [39]:
df_gc.ea_type.value_counts()

In [40]:
from matplotlib import pyplot as plt

plt.hist(df_gc["hab_construction_an"], bins=100)
plt.show()

Variable pas exploitable.

In [41]:
df_gc.code_epsg.value_counts()

In [42]:
df_gc.groupby(["dep", "code_epsg"])["id"].count()

In [43]:
dep_to_epsg = {
    "Guadeloupe": 4559,
    "Guyane": 2972,
    "Martinique": 4559,
    "Mayotte": 4471,
    "Réunion": 2975,
    "Saint-Martin": 4559
}

In [44]:
df_gc.code_ilot.value_counts()

In [45]:
df_gc.grp_rotation.value_counts()

In [46]:
df_gc.groupby(["dep", "grp_rotation"])["id"].count()

In [47]:
df_gc.millesime.value_counts()

In [48]:
df_gc[df_gc.millesime == 2020].groupby(["depcom", "grp_rotation"])["id"].count()

Pour les GC on a pour 1 millesime plusieurs groupes de rotation dans la base.

In [49]:
df_gc[df_gc.millesime == 2021].groupby(["depcom", "grp_rotation"])["id"].count()

In [50]:
df_gc[df_gc.millesime == 2022].groupby(["depcom", "grp_rotation"])["id"].count()

In [51]:
df_gc[df_gc.millesime == 2023].groupby(["depcom", "grp_rotation"])["id"].count()

Ilots ? normalement 1 groupe de rotation par ilot ? 

In [52]:
df_gc[df_gc.millesime == 2020].groupby(["code_ilot", "grp_rotation"])["id"].count()

Par ilot et commune ? 

In [53]:
df_gc[df_gc.millesime == 2020].groupby(["depcom", "code_ilot", "grp_rotation"])["id"].count()

Nombre de groupes de rotations différents par commune x ilot

In [54]:
nb_grp_rotation = df_gc[df_gc.millesime == 2020].groupby(["depcom", "code_ilot", "grp_rotation"])["id"].count().reset_index().groupby(["depcom", "code_ilot"])["grp_rotation"].count()

In [55]:
plt.hist(nb_grp_rotation)
plt.show()

In [56]:
nb_grp_rotation.value_counts()

On est très souvent à 1 groupe de rotation par commune x ilot

In [57]:
df_gc.millesime.value_counts()

In [58]:
millesime_to_validity = {
    2020: ("2019-05", "2019-08"),
    2021: ("2020-05", "2020-08"),
    2022: ("2021-05", "2021-08"),
    2023: ("2022-05", "2022-08")
}

Il faut choisir ces dates de validité

## Géométrie ilots

In [5]:
!pip install pyreadr

In [6]:
import pyreadr

fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
result = pyreadr.read_r('rca_geometry_ilots.rds')

In [7]:
result.keys()

In [8]:
df_geom = result[None] 

In [9]:
type(df_geom)

In [10]:
df_geom.info()

In [11]:
df_geom.head()

In [12]:
df_geom.depcom.str[:3].value_counts()

In [13]:
df_geom.srid.value_counts()

In [14]:
from shapely import wkt

df_geom['wkt'] = df_geom['wkt'].apply(wkt.loads)

In [15]:
import geopandas as gpd

gdf = gpd.GeoDataFrame(df_geom, geometry="wkt", crs="EPSG:4326")
gdf

In [16]:
gdf.geometry

In [17]:
gdf.groupby(["depcom", "code_ilot"])["srid"].count()

On a bien que des depcom x ilot distincts

## Save as a partitioned Parquet dataset

In [18]:
with fs.open('projet-slums-detection/Donnees/RIL/ril_pc_dom.csv', 'r') as f:
    df_pc = pd.read_csv(f, dtype={
        "id": str,
        "depcom": str,
        "ea_type": str,
        "nb_log": pd.Int64Dtype(),
        "hab_construction_an": pd.Int64Dtype(),
        "x": float,
        "y": float,
        "code_epsg": str,
        "code_ilot": str,
        "grp_rotation": pd.Int64Dtype(),
        "millesime": pd.Int64Dtype(),
        "date_livraison": object,
        "reperage": str
    })

In [19]:
# Keep only the columns that go into the Parquet dataset. .copy() makes df_pc
# an independent frame, so adding "type_com" in a later cell does not raise
# SettingWithCopyWarning.
df_pc = df_pc[["id", "depcom", "ea_type", "nb_log", "hab_construction_an", "x", "y", "code_epsg", "code_ilot", "grp_rotation", "millesime", "date_livraison", "reperage"]].copy()

In [20]:
df_pc.head()

In [21]:
with fs.open('projet-slums-detection/Donnees/RIL/ril_gc_dom.csv', 'r') as f:
    df_gc = pd.read_csv(f, dtype={
        "id": str,
        "depcom": str,
        "ea_type": str,
        "nb_log": pd.Int64Dtype(),
        "hab_construction_an": pd.Int64Dtype(),
        "x": float,
        "y": float,
        "code_epsg": str,
        "code_ilot": str,
        "grp_rotation": pd.Int64Dtype(),
        "millesime": pd.Int64Dtype(),
        "date_livraison": object,
        "reperage": str
    })

In [22]:
# Keep only the columns that go into the Parquet dataset. .copy() makes df_gc
# an independent frame, so adding "type_com" in a later cell does not raise
# SettingWithCopyWarning.
df_gc = df_gc[["id", "depcom", "ea_type", "nb_log", "hab_construction_an", "x", "y", "code_epsg", "code_ilot", "grp_rotation", "millesime", "date_livraison", "reperage"]].copy()

In [23]:
df_pc["type_com"] = "pc"
df_gc["type_com"] = "gc"

In [24]:
# Stack large and small communes. ignore_index=True gives the result a fresh
# 0..n-1 index: both inputs carry their own row numbers, so a plain concat
# would produce duplicated index labels (and a non-default index that
# pa.Table.from_pandas would then store as an extra column).
df_ril = pd.concat([df_gc, df_pc], ignore_index=True)
df_ril.head()

In [25]:
df_ril.shape

In [26]:
df_ril["dep"] = df_ril["depcom"].str[:3]
df_ril.head()

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa

def write_parquet_as_partitioned_dataset(fs, table, endpoint, bucket_name, path, partition_cols=None, compression="SNAPPY"):
    """
    Write a pyarrow table as a Parquet dataset, optionally partitioned.

    Args:
        fs: Filesystem object forwarded to pyarrow as `filesystem`.
        table: Table to write.
        endpoint: Not used by the function; kept so existing calls keep working.
        bucket_name (str): Bucket that receives the dataset.
        path (str): Root path of the dataset inside the bucket.
        partition_cols (list, optional): Columns to partition the dataset by.
        compression (str): Compression codec forwarded to pyarrow.

    Returns:
        None
    """
    file_uri = f"{bucket_name}/{path}"
    pq.write_to_dataset(table, root_path=file_uri, partition_cols=partition_cols, filesystem=fs, compression=compression)

In [None]:
endpoint = os.environ['AWS_S3_ENDPOINT']
bucket_name = "projet-slums-detection"
path = "Donnees/RIL/ril_data.parquet"

In [None]:
table_ril = pa.Table.from_pandas(df_ril)

In [None]:
write_parquet_as_partitioned_dataset(fs, table_ril, endpoint, bucket_name, path, partition_cols=["dep", "millesime"])

## Function to get a RIL file

In [27]:
!pip install affine
!pip install rasterio

In [28]:
import pyarrow.parquet as pq
import pyarrow as pa

In [29]:
dep_to_crs = {
    "971": "4559",
    "973": "2972",
    "972": "4559",
    "976": "4471",
    "974": "2975",
    "977": "4559",
    "978": "4559"
}

In [30]:
import sys
sys.path.append("../src/")
from utils import get_environment, get_file_system
from typing import Literal
from datetime import datetime

def load_ril(millesime: Literal["2020", "2021", "2022", "2023"], dep: Literal["971", "972", "973", "974", "976", "977", "978"]) -> gpd.GeoDataFrame:
    """
    Load the RIL of one departement for one millesime as a GeoDataFrame.

    Reads the "dep=<dep>/millesime=<millesime>" partition of the RIL Parquet
    dataset and turns the x / y columns into point geometries expressed in
    the CRS registered for the departement in `dep_to_crs`.

    Args:
        millesime (Literal): RIL vintage (year), as a string.
        dep (Literal): Departement code, as a string.

    Returns:
        gpd.GeoDataFrame: RIL rows with a point geometry.

    Raises:
        KeyError: If `dep` has no entry in `dep_to_crs`.
    """
    # Local import: object-store keys always use "/" as separator, which
    # os.path.join only guarantees on POSIX hosts.
    import posixpath

    environment = get_environment()
    fs = get_file_system()

    partition_path = posixpath.join(
        environment["bucket"],
        environment["sources"]["RIL"],
        "dep=" + dep,
        "millesime=" + millesime,
    )
    dataset = pq.ParquetDataset(partition_path, filesystem=fs)

    df = dataset.read().to_pandas()
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.x, df.y)
    )
    # x / y are stored in the local projection of the departement.
    crs = dep_to_crs[dep]
    gdf = gdf.set_crs("epsg:" + crs)

    return gdf

In [31]:
a = load_ril("2020", "971")

In [32]:
a

## Nécessaire pour les fonctions de filtre

In [33]:
fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
result = pyreadr.read_r('rca_geometry_ilots.rds')

In [34]:
result.keys()

In [35]:
df_geom = result[None] 

In [36]:
type(df_geom)

In [37]:
df_geom['wkt'] = df_geom['wkt'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(df_geom, geometry="wkt", crs="EPSG:4326")
gdf

In [38]:
gdf.iloc[-3:, :].dissolve(by="gr")

Warning, les ilots changent en Guyane, là on a des ilots statiques

In [39]:
gdf.wkt.iloc[-3]

In [40]:
gdf.wkt.iloc[-2]

In [41]:
gdf.wkt.iloc[-1]

In [42]:
gdf.iloc[-3:, :].dissolve(by="gr").wkt[0]

In [43]:
gdf.iloc[-3:, :].dissolve(by="gr").wkt[1]

In [44]:
gdf.iloc[-100:, :].dissolve(by="gr").wkt[0]

In [45]:
gdf.iloc[-736:, :].dissolve(by="gr")

In [46]:
gdf.iloc[-737, :].wkt.buffer(0)

In [47]:
gdf.iloc[-737, :].wkt

In [48]:
gdf.iloc[-737, :].loc["wkt"] = gdf.iloc[-737, :].wkt.buffer(0)

In [49]:
gdf.iloc[-737, :].loc["wkt"].buffer(0)

In [50]:
a = gdf.iloc[-737:-735, :].copy()

In [51]:
a

In [52]:
b = a.copy()

In [53]:
b["wkt"] = [geom.buffer(0) for geom in b["wkt"]]

In [54]:
b

In [55]:
gdf["wkt"] = [geom.buffer(0) for geom in gdf["wkt"]]

In [56]:
gdf.dissolve(by="gr")

Compute intersection between image and polygon

En fait là on a un problème, déjà il faut séparer en fonction du département.

In [57]:
def compute_gr_geometry(dep):
    """
    Build one dissolved geometry per rotation group ("gr") for a departement.

    Downloads the ilot geometries from S3 into the working directory, keeps
    the ilots whose `depcom` starts with `dep`, merges them by rotation group
    and reprojects the result to the CRS listed in `dep_to_crs`.

    Args:
        dep (str): Departement code, e.g. "971".

    Returns:
        gpd.GeoDataFrame: One row per rotation group, indexed by "gr".
    """
    local_rds = 'rca_geometry_ilots.rds'
    fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', local_rds)
    ilots = pyreadr.read_r(local_rds)[None]

    # Geometries are stored as WKT text; parse them into shapely objects.
    ilots['wkt'] = ilots['wkt'].apply(wkt.loads)
    ilots_gdf = gpd.GeoDataFrame(ilots, geometry="wkt", crs="EPSG:4326")

    # The departement is the first three characters of the commune code.
    ilots_gdf["dep"] = ilots_gdf.depcom.str[:3]
    dep_ilots = ilots_gdf[ilots_gdf.dep == dep]

    dissolved = dep_ilots.dissolve(by="gr")
    return dissolved.to_crs("epsg:" + dep_to_crs[dep])

In [58]:
guad_geoms = compute_gr_geometry("971")

In [59]:
guad_geoms.crs

In [60]:
guad_geoms

Intersection with an image.

In [61]:
!pip install py7zr
!pip install torch

In [62]:
import sys
sys.path.append('../src')
import shutil
import yaml
import py7zr
import os
import s3fs
from satellite_image import SatelliteImage
from labeler import RILLabeler
from labeler import BDTOPOLabeler
from labeled_satellite_image import SegmentationLabeledSatelliteImage
from utils import *
from filter import *
import matplotlib.pyplot as plt
import numpy as np
import rasterio.plot as rp
import re
from datetime import datetime
from tqdm import tqdm

## Téléchargement des données Pléiades

In [63]:
environment = get_environment()

bucket = environment["bucket"]
path_s3_pleiades_data = environment["sources"]["PLEIADES"]
path_s3_bdtopo_data = environment["sources"]["BDTOPO"][2022]["guyane"]
path_local_pleiades_data = environment["local-path"]["PLEIADES"]
path_local_bdtopo_data = environment["local-path"]["BDTOPO"][2022]["guyane"]

fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
                       key=os.environ["AWS_ACCESS_KEY_ID"],
                       secret=os.environ["AWS_SECRET_ACCESS_KEY"])

In [64]:
fs.download(
    rpath=f"{bucket}/{path_s3_pleiades_data}",
    lpath=f"../{path_local_pleiades_data}",
    recursive=True
)

## Découpage et filtrage des images

In [65]:
import re
from tqdm import tqdm

In [66]:
images_paths = [f"../{path_local_pleiades_data}/16bits/ORT_2022072050325085_U22N/" + p for p in os.listdir(f"../{path_local_pleiades_data}/16bits/ORT_2022072050325085_U22N/")]

In [67]:
date = datetime.strptime(re.search(r'ORT_(\d{8})', images_paths[0]).group(1), '%Y%m%d')

On importe les `SatelliteImage` dans une liste.

In [68]:
list_images = [
     SatelliteImage.from_raster(
        filename,
        dep = "973",
        date = date, 
        n_bands = 4
    ) for filename in tqdm(images_paths)]

In [69]:
list_images[0]

In [70]:
list_images[0].__dict__

In [71]:
list_images[0].bounds

In [72]:
xmin, ymin, xmax, ymax = list_images[0].bounds
xmin

In [73]:
def compute_gr_geometry(dep):
    """
    Build one dissolved geometry per rotation group ("gr") for a departement.

    Downloads the ilot geometries from S3 into the working directory, keeps
    the ilots whose `depcom` starts with `dep`, cleans each polygon with a
    zero-width buffer, merges them by rotation group and reprojects the
    result to the CRS listed in `dep_to_crs`.

    Args:
        dep (str): Departement code, e.g. "973".

    Returns:
        gpd.GeoDataFrame: One row per rotation group, indexed by "gr", with
            the geometry in column "wkt".
    """
    fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
    result = pyreadr.read_r('rca_geometry_ilots.rds')
    df_geom = result[None]
    # Geometries are stored as WKT text; parse them into shapely objects.
    df_geom['wkt'] = df_geom['wkt'].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df_geom, geometry="wkt", crs="EPSG:4326")
    gdf["dep"] = gdf.depcom.str[:3]
    # .copy(): the filtered frame is modified on the next line; working on an
    # explicit copy avoids SettingWithCopyWarning.
    gdf = gdf[gdf.dep == dep].copy()
    # buffer(0) is applied to every geometry before dissolving (the usual
    # way to clean invalid polygons).
    gdf["wkt"] = gdf["wkt"].buffer(0)

    return gdf.dissolve(by="gr").to_crs("epsg:" + dep_to_crs[dep])

In [74]:
dep_to_crs

In [75]:
guy_geoms = compute_gr_geometry("973")
guy_geoms

In [76]:
guy_geoms.wkt[0]

In [77]:
guy_geoms.wkt[3]

In [78]:
guy_geoms.cx[xmin:xmax, ymin:ymax]

In [79]:
xmin, ymin, xmax, ymax = list_images[1].bounds
xmin

In [80]:
guy_geoms.cx[xmin:xmax, ymin:ymax]

In [81]:
guy_geoms.wkt

In [82]:
guy_geoms.cx[xmin:xmax, ymin:ymax].wkt.area

In [83]:
list_images[1].bounds

In [84]:
from shapely.geometry import box

geom = box(*list_images[1].bounds)
geom

In [85]:
gpd.GeoDataFrame(gpd.GeoSeries(geom), columns=['geometry']).set_crs("epsg:2972")

In [86]:
gpd.overlay(guy_geoms, gpd.GeoDataFrame(gpd.GeoSeries(geom), columns=['geometry']).set_crs("epsg:2972"), how='intersection')

In [87]:
gpd.overlay(guy_geoms, gpd.GeoDataFrame(gpd.GeoSeries(geom), columns=['geometry']).set_crs("epsg:2972"), how='intersection').geometry.area

In [88]:
type(guy_geoms.wkt[1])

In [89]:
type(geom)

In [90]:
guy_geoms.wkt[1]

In [91]:
geom

In [92]:
guy_geoms.wkt[1].intersection(geom)

In [93]:
print(geom)

In [94]:
guy_geoms.wkt[1].intersection(geom).area

In [95]:
geom.area

In [96]:
gc_list = df_ril[df_ril.type_com == "gc"].depcom.unique()

In [97]:
pc_list = df_ril[df_ril.type_com == "pc"].depcom.unique()

In [98]:
set(gc_list).intersection(pc_list)

Ok pas d'intersection

In [99]:
fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
result = pyreadr.read_r('rca_geometry_ilots.rds')
os.remove('rca_geometry_ilots.rds')
df_geom = result[None]

In [100]:
depcom_ilots = df_geom.depcom.unique()

In [101]:
depcom_ril = np.concatenate([gc_list, pc_list])

In [102]:
len(np.intersect1d(depcom_ilots, depcom_ril))

In [103]:
len(depcom_ril)

In [104]:
len(depcom_ilots)

In [105]:
list(set(depcom_ilots) - set(depcom_ril))

Ok toutes les communes dans le RIL sont dans le df des ilots

In [154]:
def compute_recent_gr_geometries(dep):
    """
    Build one dissolved geometry per rotation group ("gr") for a departement,
    using the rotation groups exactly as stored in the ilot geometry file.

    The RDS file is downloaded from S3 into the working directory and removed
    again once it has been read.

    Args:
        dep (str): Departement code, e.g. "973".

    Returns:
        gpd.GeoDataFrame: One row per rotation group, indexed by "gr", with
            the geometry in column "wkt", in the CRS listed in `dep_to_crs`.
    """
    # Get data
    fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
    result = pyreadr.read_r('rca_geometry_ilots.rds')
    os.remove('rca_geometry_ilots.rds')
    df_geom = result[None]
    # Geometries are stored as WKT text; parse them into shapely objects.
    df_geom['wkt'] = df_geom['wkt'].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df_geom, geometry="wkt", crs="EPSG:4326")

    # Filtering departement
    gdf["dep"] = gdf.depcom.str[:3]
    # .copy(): the filtered frame is modified on the next line; working on an
    # explicit copy avoids SettingWithCopyWarning.
    gdf = gdf[gdf.dep == dep].copy()
    # buffer(0) is applied to every geometry before dissolving (the usual
    # way to clean invalid polygons).
    gdf["wkt"] = gdf["wkt"].buffer(0)

    # Return aggregated geometries
    return gdf.dissolve(by="gr").to_crs("epsg:" + dep_to_crs[dep])

On a récupéré d'autres listes de communes

In [107]:
with fs.open('projet-slums-detection/Donnees/RIL/DOM_GC_PC.csv', 'r') as f:
    df_corresp = pd.read_csv(f)

df_corresp

In [121]:
pc_list_corresp = df_corresp[df_corresp.statut == "PC"]["depcom"].astype(str)

In [122]:
pc_list_corresp

In [123]:
pc_list

In [124]:
len(pc_list)

In [125]:
set(pc_list_corresp) == set(pc_list)

In [126]:
set(pc_list).issubset(set(pc_list_corresp))

In [129]:
gc_list_corresp = df_corresp[df_corresp.statut == "GC"]["depcom"].astype(str)

In [130]:
set(gc_list).issubset(set(gc_list_corresp))

In [131]:
df_corresp.depcom

In [132]:
len(set(gc_list_corresp)) + len(set(pc_list_corresp))

In [133]:
len(set(df_corresp.depcom))

In [136]:
set(depcom_ilots).issubset(set(df_corresp.depcom.astype(str)))

Ok on a bien toutes les communes dans les nouvelles listes.

In [138]:
def compute_old_gr_geometries(dep):
    """
    Build one dissolved geometry per rotation group ("gr") for a departement,
    with the rotation groups of small communes (PC) shifted back by one.

    For departement "976" the groups are used as stored. For every other
    departement, ilots of communes flagged "PC" in `df_corresp` get their
    group decremented by one (1 wraps to 5), while ilots of "GC" communes keep
    theirs. Ilots whose commune is absent from `df_corresp` are left out.

    The RDS file is downloaded from S3 into the working directory and removed
    again once it has been read.

    Args:
        dep (str): Departement code, e.g. "971".

    Returns:
        gpd.GeoDataFrame: One row per rotation group, indexed by "gr", with
            the geometry in column "wkt", in the CRS listed in `dep_to_crs`.
    """
    # Get data
    fs.get('projet-slums-detection/Donnees/RIL/rca_geometry_ilots.rds', 'rca_geometry_ilots.rds')
    result = pyreadr.read_r('rca_geometry_ilots.rds')
    os.remove('rca_geometry_ilots.rds')
    df_geom = result[None]
    # Geometries are stored as WKT text; parse them into shapely objects.
    df_geom['wkt'] = df_geom['wkt'].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df_geom, geometry="wkt", crs="EPSG:4326")

    # Filtering departement
    gdf["dep"] = gdf.depcom.str[:3]
    # .copy(): the filtered frame is modified on the next line; working on an
    # explicit copy avoids SettingWithCopyWarning.
    gdf = gdf[gdf.dep == dep].copy()
    gdf["wkt"] = gdf["wkt"].buffer(0)

    if dep == "976":
        # Return aggregated geometries
        return gdf.dissolve(by="gr").to_crs("epsg:" + dep_to_crs[dep])

    depcom_codes = df_corresp["depcom"].astype(str)
    gdf_pc = gdf[gdf.depcom.isin(depcom_codes[df_corresp.statut == "PC"])].copy()
    gdf_gc = gdf[gdf.depcom.isin(depcom_codes[df_corresp.statut == "GC"])]
    # Missing cities in these lists...

    # Changing rotation group for small cities: old group = current group - 1,
    # wrapping 1 -> 5. The shifted frame is the one that must be concatenated;
    # the previous code shifted a copy and then concatenated the unshifted
    # frame, so the result was identical to the recent geometries.
    gdf_pc["gr"] = ((gdf_pc["gr"].astype(int) - 2) % 5 + 1).astype(str)
    gdf_old = pd.concat([gdf_pc, gdf_gc])
    return gdf_old.dissolve(by="gr").to_crs("epsg:" + dep_to_crs[dep])

In [139]:
gdf[gdf.depcom.isin(pc_list)].shape

In [140]:
gdf[gdf.depcom.isin(pc_list)]

In [141]:
gdf[gdf.depcom.isin(df_corresp[df_corresp.statut == "PC"]["depcom"].astype(str))].shape

In [143]:
gdf[gdf.depcom.isin(df_corresp[df_corresp.statut == "GC"]["depcom"].astype(str))].shape

In [144]:
4069 + 16197

In [145]:
compute_old_gr_geometries("971")

## Fonction pour filtrer les images labellisées RIL

In [146]:
from typing import Union

def has_non_valid_label(
    labeled_image,
    delta_threshold: int = 160,
    area_pct_threshold: float = 0.6
) -> bool:
    """
    Determines if image label is valid or not, dispatching on the label source.

    Args:
        labeled_image (Union[SegmentationLabeledSatelliteImage, DetectionLabeledSatelliteImage]): Labeled satellite image.
        delta_threshold (int): Max number of days between labeling date and image date.
        area_pct_threshold (float): Minimum share of the image area that must lie
            inside the rotation group. Only forwarded to the RIL check.

    Returns:
        bool: True if the image label is not valid, False otherwise.

    Raises:
        ValueError: If the label source is neither "RIL" nor "BDTOPO".
    """
    source = labeled_image.source
    if source == "RIL":
        # The RIL check requires the area threshold as third argument.
        return has_non_valid_ril_label(labeled_image, delta_threshold, area_pct_threshold)
    if source == "BDTOPO":
        return has_non_valid_bdtopo_label(labeled_image, delta_threshold)
    # An unknown source used to fall through and return None, which reads as
    # "label is valid"; fail loudly instead.
    raise ValueError(f"Unknown label source: {source!r}")
    

def has_non_valid_ril_label(
    labeled_image,
    delta_threshold: int,
    area_pct_threshold: float
) -> bool:
    """
    Determines if RIL image label is valid or not.

    The label is rejected when the labeling date is too far from the image
    date, or when too small a share of the image footprint lies inside the
    rotation group matching the labeling year.

    Args:
        labeled_image (Union[SegmentationLabeledSatelliteImage, DetectionLabeledSatelliteImage]): Labeled satellite image.
        delta_threshold (int): Max number of days between labeling date and image date.
        area_pct_threshold (float): Minimum share of the image area that must be contained in the rotation group.

    Returns:
        bool: True if the image label is not valid, False otherwise.
    """
    labeling_date = labeled_image.labeling_date
    image_date = labeled_image.satellite_image.date

    # Filter image if labeling date is too far from image date
    if abs((labeling_date - image_date).days) > delta_threshold:
        return True

    # Determine rotation group of labeling_year (2018 -> "1", ..., 2022 -> "5")
    labeling_year = labeling_date.year
    gr = str(((labeling_year - 2018) % 5) + 1)

    # Rotation group geometry
    dep = labeled_image.satellite_image.dep
    if labeling_year > 2020:
        gr_geometries = compute_recent_gr_geometries(dep)
    else:
        gr_geometries = compute_old_gr_geometries(dep)
    gr_geometries = gr_geometries.reset_index()
    gr_geometry = gr_geometries[gr_geometries.gr == gr]["wkt"]

    geom = box(*labeled_image.satellite_image.bounds)
    # Sum the per-row areas: this yields 0.0 when the departement has no
    # geometry for this rotation group (float() on an empty Series raises)
    # and avoids converting a one-element Series to float.
    intersection_area = float(gr_geometry.intersection(geom).area.sum())

    # Not valid when too little of the image lies inside the rotation group
    return intersection_area / geom.area < area_pct_threshold
    
def has_non_valid_bdtopo_label(
    labeled_image,
    delta_threshold: int
) -> bool:
    """
    Validity check for BDTOPO labels; currently a stub that accepts every image.

    Args:
        labeled_image (Union[SegmentationLabeledSatelliteImage, DetectionLabeledSatelliteImage]): Labeled satellite image (not inspected).
        delta_threshold (int): Not used by this check.

    Returns:
        bool: Always False, i.e. the label is never reported as non-valid.
    """
    is_non_valid = False
    return is_non_valid


In [147]:
list_images[1]

In [148]:
filename = '../data/Cayenne/16bits/ORT_2022072050325085_U22N/ORT_2022072050325085_0353_0545_U22N_16Bits.jp2'

In [149]:
date = datetime.strptime(re.search(r'ORT_(\d{8})', filename).group(1), '%Y%m%d')
date

In [150]:
labeler = RILLabeler(date, dep="973")
mask = labeler.create_segmentation_label(list_images[1])

In [151]:
mask

In [152]:
labeled_image = SegmentationLabeledSatelliteImage(list_images[1], mask, "RIL", datetime.strptime(re.search(r'ORT_(\d{8})', filename).group(1), '%Y%m%d'))

In [155]:
has_non_valid_ril_label(labeled_image, 100000, 0.6)

## Exporting GR geometries

In [156]:
type(compute_recent_gr_geometries("971").reset_index()[["gr", "wkt"]])

In [157]:
compute_recent_gr_geometries("971").reset_index()[["gr", "wkt"]].rename(columns={"wkt": "geometry"})

In [158]:
type(compute_recent_gr_geometries("971").reset_index()[["gr", "wkt"]])

In [159]:
to_save = compute_recent_gr_geometries("971").reset_index()[["gr", "wkt"]]

In [160]:
to_save["wkt"] = to_save["wkt"]

In [161]:
type(to_save)

In [None]:
to_save.to_file("recent_gr_971.shp")

In [None]:
test_971 = gpd.read_file("recent_gr_971.shp")

In [None]:
test_971

In [None]:
test_971.geometry

In [None]:
to_save.to_file("recent_gr_971.geojson", driver='GeoJSON')

In [None]:
test_971 = gpd.read_file("recent_gr_971.geojson")

In [None]:
test_971

In [None]:
test_971.geometry

In [165]:
# Make sure the local export directory exists before writing into it.
os.makedirs("to_s3", exist_ok=True)

# One GeoJSON per departement with the current rotation-group geometries.
for dep in ["971", "972", "973", "974", "976", "977", "978"]:
    to_save = compute_recent_gr_geometries(dep).reset_index()[["gr", "wkt"]]
    to_save.to_file("to_s3/new_gr_" + dep + ".geojson", driver='GeoJSON')

In [166]:
# Make sure the local export directory exists before writing into it.
os.makedirs("to_s3", exist_ok=True)

# One GeoJSON per departement with the former rotation-group geometries.
for dep in ["971", "972", "973", "974", "976", "977", "978"]:
    to_save = compute_old_gr_geometries(dep).reset_index()[["gr", "wkt"]]
    to_save.to_file("to_s3/old_gr_" + dep + ".geojson", driver='GeoJSON')

## Test lecture depuis s3

In [167]:
with fs.open("projet-slums-detection/Donnees/RIL/old_gr/old_gr_973.geojson") as f:
    test = gpd.read_file(f)
test

## Class structure

In [168]:
class RILFilter():
    """
    Decides whether a RIL-labeled satellite image can be kept.

    An image is kept when its labeling date is close enough to the image date
    and when a large enough share of its footprint lies inside the rotation
    group matching the labeling year.
    """

    def __init__(
        self,
        dep: str,
        delta_threshold: int,
        area_pct_threshold: float
    ):
        """
        Store the thresholds and load the rotation-group geometries.

        Args:
            dep (str): Departement code used to pick the geometry files.
            delta_threshold (int): Max number of days between labeling date and image date.
            area_pct_threshold (float): Minimum share of the image area that must lie
                inside the rotation group.
        """
        # Local import: object-store keys always use "/" as separator, which
        # os.path.join only guarantees on POSIX hosts.
        import posixpath

        self.dep = dep
        self.delta_threshold = delta_threshold
        self.area_pct_threshold = area_pct_threshold

        environment = get_environment()
        fs = get_file_system()

        old_gr_path = posixpath.join(
            environment["bucket"],
            environment["sources"]["old_gr"][self.dep]
        )
        new_gr_path = posixpath.join(
            environment["bucket"],
            environment["sources"]["new_gr"][self.dep]
        )
        # Both geometry sets are read once here, so validating an image does
        # not trigger any download.
        with fs.open(old_gr_path, 'r') as f:
            self.old_gr_geometries = gpd.read_file(f)
        with fs.open(new_gr_path, 'r') as f:
            self.new_gr_geometries = gpd.read_file(f)

    def validate(self, labeled_image):
        """
        Run both checks on a labeled image.

        Args:
            labeled_image: Labeled satellite image.

        Returns:
            bool: True if the image passes the date check and the
                rotation-group check, False otherwise.
        """
        # The date check runs first; the geometry check is skipped when it fails.
        if not self.validate_labeling_date(labeled_image):
            return False
        return self.validate_rotation_group(labeled_image)

    def validate_labeling_date(self, labeled_image):
        """
        Check that labeling date and image date are close enough.

        Args:
            labeled_image: Labeled satellite image.

        Returns:
            bool: True if the two dates are at most `delta_threshold` days apart.
        """
        labeling_date = labeled_image.labeling_date
        image_date = labeled_image.satellite_image.date

        # Filter image if labeling date is too far from image date
        delta = abs((labeling_date - image_date).days)
        return delta <= self.delta_threshold

    def validate_rotation_group(self, labeled_image):
        """
        Check that enough of the image lies inside the rotation group of the
        labeling year.

        Args:
            labeled_image: Labeled satellite image.

        Returns:
            bool: True if the intersection covers at least `area_pct_threshold`
                of the image area.
        """
        # Determine rotation group of labeling year (2018 -> "1", ..., 2022 -> "5")
        labeling_year = labeled_image.labeling_date.year
        gr = str(((labeling_year - 2018) % 5) + 1)

        # Rotation group geometry
        if labeling_year > 2020:
            gr_geometries = self.new_gr_geometries
        else:
            gr_geometries = self.old_gr_geometries
        gr_geometry = gr_geometries[gr_geometries.gr == gr].geometry

        # Intersection between rotation group geometry and
        # satellite image geometry
        geom = box(*labeled_image.satellite_image.bounds)
        # Sum the per-row areas: this yields 0.0 when the file has no geometry
        # for this rotation group (float() on an empty Series raises) and
        # avoids converting a one-element Series to float.
        intersection_area = float(gr_geometry.intersection(geom).area.sum())

        # Validate if intersection area is large enough
        return not (intersection_area / geom.area < self.area_pct_threshold)

In [169]:
ril_filter = RILFilter("973", 100000, 0.6)

In [170]:
labeled_image

In [171]:
ril_filter.validate(labeled_image)

## Est-ce que tout le territoire est couvert sur 5 ans ??

Naturellement non car les PC ont changé de groupe de rotation, mais on veut voir

In [172]:
images_paths = [f"../{path_local_pleiades_data}/16bits/ORT_2022072050325085_U22N/" + p for p in os.listdir(f"../{path_local_pleiades_data}/16bits/ORT_2022072050325085_U22N/")]

In [173]:
date = datetime.strptime(re.search(r'ORT_(\d{8})', images_paths[0]).group(1), '%Y%m%d')

On importe les `SatelliteImage` dans une liste.

In [174]:
list_images = [
     SatelliteImage.from_raster(
        filename,
        dep = "973",
        date = date, 
        n_bands = 4
    ) for filename in tqdm(images_paths)]

On découpe les images, initialement de dimension 2000, en plusieurs images de dimension `dim`. On ne garde ensuite que les images pertinentes à l'aide de la fonction `is_too_black` : une image contenant moins de 50% de pixels non noirs n'est pas retenue.

In [175]:
dim = 250

In [176]:
# Split every image into tiles of size `dim` (set in the previous cell, instead
# of repeating the literal 250) and drop the tiles flagged by is_too_black.
splitted_list_images = [tile for image in tqdm(list_images) for tile in image.split(dim) if not is_too_black(tile)]

In [177]:
len(splitted_list_images)

In [178]:
from labeler import RILLabeler

In [179]:
labeler_RIL = RILLabeler(date, dep="973", buffer_size=6, cap_style=3)

À l'aide de notre `labeler`, on crée une `SegmentationLabeledSatelliteImage`.

In [180]:
date

In [181]:
from dateutil.relativedelta import relativedelta
date - relativedelta(years=1)

In [182]:
from filter import RILFilter

In [183]:
ril_filter = RILFilter("973", 100000, 0.6)

In [184]:
# create_segmentation_label only receives the tile, not the labeling date, so
# the masks are computed once instead of once per simulated year.
segmentation_labels = [
    labeler_RIL.create_segmentation_label(sat_im) for sat_im in tqdm(splitted_list_images)
]

# Count, for each of the last 5 labeling years, the tiles accepted by the filter.
for i in range(5):
    labeling_date = date - relativedelta(years=i)

    list_labeled_images = [
        SegmentationLabeledSatelliteImage(sat_im, label, "RIL", labeling_date)
        for sat_im, label in zip(splitted_list_images, segmentation_labels)
    ]

    filtered_list = [labeled_image for labeled_image in tqdm(list_labeled_images) if ril_filter.validate(labeled_image)]

    print(len(filtered_list))

In [185]:
1268 + 1892 + 1801 + 2047 + 1480

In [186]:
8488 / 13254

Sur 5 ans on rate quand même un peu.. On vire peut-être beaucoup de tuiles avec le 0.6 ?

In [187]:
ril_filter = RILFilter("973", 100000, 0.1)

In [188]:
# create_segmentation_label only receives the tile, not the labeling date, so
# the masks are computed once instead of once per simulated year.
segmentation_labels = [
    labeler_RIL.create_segmentation_label(sat_im) for sat_im in tqdm(splitted_list_images)
]

# Same count as above, with the filter rebuilt using the looser area threshold.
for i in range(5):
    labeling_date = date - relativedelta(years=i)

    list_labeled_images = [
        SegmentationLabeledSatelliteImage(sat_im, label, "RIL", labeling_date)
        for sat_im, label in zip(splitted_list_images, segmentation_labels)
    ]

    filtered_list = [labeled_image for labeled_image in tqdm(list_labeled_images) if ril_filter.validate(labeled_image)]

    print(len(filtered_list))

In [189]:
1895 + 2636 + 2492 + 2771 + 2221

In [190]:
12015 / 13254

Check les gr_geometries

In [191]:
environment = get_environment()

In [192]:
old_gr_path = os.path.join(
    environment["bucket"], environment["sources"]["old_gr"]["973"]
)
new_gr_path = os.path.join(
    environment["bucket"], environment["sources"]["new_gr"]["973"]
)
with fs.open(old_gr_path, "r") as f:
    old_gr_geometries = gpd.read_file(f)
with fs.open(new_gr_path, "r") as f:
    new_gr_geometries = gpd.read_file(f)

In [193]:
old_gr_geometries

In [194]:
new_gr_geometries

In [195]:
old_gr_geometries.dissolve().geometry[0]

In [196]:
new_gr_geometries.dissolve().geometry[0]

In [197]:
list_labeled_images[0].satellite_image.bounds

In [198]:
box(*list_labeled_images[0].satellite_image.bounds)

In [199]:
gpd.GeoDataFrame(gpd.GeoSeries([box(*im.satellite_image.bounds) for im in list_labeled_images]), columns=['geometry']).dissolve().geometry[0]

In [200]:
# Footprint, per labeling year, of the tiles accepted by the filter.
shapes = []

# create_segmentation_label only receives the tile, not the labeling date, so
# the masks are computed once instead of once per simulated year.
segmentation_labels = [
    labeler_RIL.create_segmentation_label(sat_im) for sat_im in tqdm(splitted_list_images)
]

for i in range(5):
    labeling_date = date - relativedelta(years=i)

    list_labeled_images = [
        SegmentationLabeledSatelliteImage(sat_im, label, "RIL", labeling_date)
        for sat_im, label in zip(splitted_list_images, segmentation_labels)
    ]

    filtered_list = [labeled_image for labeled_image in tqdm(list_labeled_images) if ril_filter.validate(labeled_image)]
    # Merge the bounding boxes of the accepted tiles into a single geometry.
    shapes.append(
        gpd.GeoDataFrame(gpd.GeoSeries([box(*im.satellite_image.bounds) for im in filtered_list]), columns=['geometry']).dissolve().geometry[0]
    )

In [201]:
shapes

In [202]:
shapes[0]

In [203]:
shapes[1]

In [204]:
shapes[2]

In [205]:
shapes[3]

In [206]:
shapes[4]

In [207]:
gpd.GeoDataFrame(gpd.GeoSeries([shape for shape in shapes]), columns=['geometry']).dissolve().geometry[0]

In [208]:
len(splitted_list_images)

In [209]:
len(list_labeled_images)

Ok on dirait qu'on a bien tout.. Cool