# Quick check of the usability of agricultural datasets for the Le Wagon ML project

- availability of feature data: parcels, soil types, crop types, weather, sequential data, etc.
- availability of target data: production (yearly)
- data quality: missing values (NaN), length of time series

**TODO**

- list datasets
- links between datasets
- size of datasets
- other datasets?
- objectives

## Objectives

- predict annual crop production per year and per region/parcel
- compare predicted annual production to annual crop consumption per region/parcel
- forecast underproduction and overproduction
- infer the transport requirements for satisfying demand
- update crop production forecast with weather data
- update crop production with satellite images
- imports vs. exports

## Possible extensions

- get market data and market forecasts to compute the expected value

<https://agreste.agriculture.gouv.fr/agreste-web/>

In [None]:
# ML libs
import pandas as pd
import numpy as np
# Utils
import os

# Graphic
import matplotlib.pyplot as plt

# Root directory containing the downloaded agriculture datasets.
# Set the AGRO_DATASETS_DIR environment variable to point elsewhere on another
# machine; the original hard-coded location is kept as the default.
datasets_dir = os.environ.get("AGRO_DATASETS_DIR", "/home/ken/Datasets/Agriculture")

In [None]:
# Workbook whose file name refers to definitive 2021-2022 whole-France results.
# sheet_name=None makes read_excel load every sheet and return a dict of
# DataFrames keyed by sheet name.
file_name = "cd2023-19_ResultatsDéfinifs-FranceEntiere2021-2022_maj22nov2023.xlsx"
file_path = os.path.join(datasets_dir,file_name)
data = pd.read_excel(file_path, sheet_name=None)

In [None]:
def format_ssa_sommaire_df(ssa_excel_dict):
    """Clean the 'Sommaire' sheet of a workbook dict in place.

    The cleaned frame replaces ``ssa_excel_dict['Sommaire']``: fully empty
    rows and every column containing a missing value are dropped, the first
    two remaining columns are renamed 'Tableaux' and 'Onglets', '#'
    characters are stripped from both ends of the 'Onglets' values, the row
    labelled 2 is removed and 'Tableaux' becomes the index.

    Nothing is returned; only the 'Sommaire' entry of the dict is rebound.
    """
    sommaire = ssa_excel_dict['Sommaire'].dropna(how="all").dropna(axis=1)
    # Store the new frame first so the dict entry is the object mutated below.
    ssa_excel_dict['Sommaire'] = sommaire

    new_names = ['Tableaux', 'Onglets']
    sommaire.rename(columns=dict(zip(sommaire.columns, new_names)), inplace=True)
    sommaire['Onglets'] = sommaire['Onglets'].str.strip('#')
    sommaire.drop(2, inplace=True)
    sommaire.set_index("Tableaux", inplace=True)

In [None]:
# Work on a shallow copy of the sheet dict: format_ssa_sommaire_df rebinds the
# 'Sommaire' entry of the dict it receives, so `data` keeps the raw summary
# sheet. The other sheets are shared between both dicts.
ssa2023_dict = data.copy()
format_ssa_sommaire_df(ssa2023_dict)

In [None]:
# The 'Onglets' value of the first row of the formatted summary is used as a
# sheet name; fetch that sheet from the workbook dict and display it.
COP = ssa2023_dict[ssa2023_dict['Sommaire'].iloc[0]['Onglets']]
COP

In [None]:
# Number of rows in the COP sheet
len(COP)

In [None]:
# Regional SAA workbook (per its file name: 2010-2023, provisional figures).
# All sheets are loaded into a dict of DataFrames keyed by sheet name.
file_name = "SAA_2010-2023_provisoires_donnees_regionales.xlsx"
file_path = os.path.join(datasets_dir, file_name)
SAA_regionales = pd.read_excel(file_path, sheet_name=None)

In [None]:
# Display the 'COP' sheet of the regional workbook
SAA_regionales['COP']

In [None]:
# Departmental SAA workbook (per its file name: 2010-2023, provisional figures).
# All sheets are loaded into a dict of DataFrames keyed by sheet name.
file_name = "SAA_2010-2023_provisoires_donnees_departementales.xlsx"
file_path = os.path.join(datasets_dir, file_name)
SAA_departementales = pd.read_excel(file_path, sheet_name=None)

In [None]:
# Sheet names available in the departmental workbook
SAA_departementales.keys()

In [None]:
# Summary statistics of the departmental 'COP' sheet after dropping the rows
# labelled 0 to 3 (presumably non-data header rows; TODO confirm).
SAA_departementales['COP'].drop([0,1,2,3]).describe()

In [None]:
# List the entries of the RPG cartography project directory
dir_name = "RPG/Projet_carto_RPG_2"
dir_path = os.path.join(datasets_dir, dir_name)
files_list = os.listdir(dir_path)
files_list

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Parse the .qgs project file as XML with ElementTree
xml_file = os.path.join(dir_path, "Projet_carto_RPG_2-1_FXX.qgs")
tree = ET.parse(xml_file)


In [None]:
# Root element of the parsed XML tree
root = tree.getroot()

In [None]:
from bs4 import BeautifulSoup

# Parse the same file a second time, with BeautifulSoup and the 'lxml' parser.
# NOTE(review): 'lxml' is BeautifulSoup's HTML parser; confirm that HTML-style
# parsing is acceptable for this XML file.
with open(xml_file) as fp:
    soup = BeautifulSoup(fp,'lxml')

In [None]:
# All <field> elements found in the parsed document
soup.find_all('field')

## Pays Basque sample of the RPG dataset in JSON format

In [None]:
# Load the RPG 2020 sample export into `js`.
json_name = "RPG/rpg-2020-registre-parcellaire-graphique.json"
json_path = os.path.join(datasets_dir, json_name)
import json
# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding, so the file reads the same way on every machine.
with open(json_path, "r", encoding="utf-8") as file:
    js = json.load(file)

In [None]:
# Keep the first 100 records for quick inspection and display them
js_small = js[:100]
js_small

In [None]:

# Draw every polygon of every parcel on a single large figure.
fig, ax = plt.subplots(figsize=(14, 14))
#plt.xlim(-5.15,9.57)
#plt.ylim(41.32,51.10)
# Collect failures for the whole run: the dict is created once, before the
# loop, so an entry recorded for one parcel is not wiped by the next one.
errors = {}
for i, parcel in enumerate(js):
    try:
        polygons = parcel['geo_shape']['geometry']['coordinates']
        for polygon in polygons:
            # Convert once per polygon; column 0 is used as x, column 1 as y.
            coords = np.array(polygon)
            ax.fill(coords[:, 0], coords[:, 1], label=parcel['code_cultu'])
    except TypeError as e:
        # Best effort: remember the failing parcel index and keep drawing.
        errors[i] = e
# Maps parcel index -> TypeError raised while drawing that parcel.
print(errors)
#ax.plot(np.array(polygon)[:,:,0][0], np.array(polygon)[:,:,1][0], linewidth=3, color="w")

In [None]:
# Inspect the record at index 48 of the sample
js_small[48]

In [None]:
# Take the first element of the first sample record's geometry coordinates and
# draw it as a filled polygon (column 0 -> x, column 1 -> y).
polygon = js_small[0]['geo_shape']['geometry']['coordinates'][0]
plt.fill(np.array(polygon)[:,0], np.array(polygon)[:,1])
#plt.plot(np.array(polygon)[:,:,0][0], np.array(polygon)[:,:,1][0], linewidth=3, color="w")

In [None]:
# First column of the polygon's coordinate array (the values plotted as x)
np.array(polygon)[:,0]

In [None]:
# Count the distinct 'code_cultu' values across all records.
# `js` is a list of dicts, so it cannot be indexed by key, and NumPy arrays
# have no `nunique`; the values are gathered into a pandas Series instead.
# Series.nunique() ignores missing values by default.
pd.Series([parcel['code_cultu'] for parcel in js]).nunique()

In [None]:
# Distinct 'code_cultu' values across all records, built directly as a set.
codes_cultu = {parcel['code_cultu'] for parcel in js}

In [None]:
# Number of distinct 'code_cultu' values
len(codes_cultu)