In [1]:
import pandas as pd
import numpy as np
import time

### Présentation du fichier
* __689301__ observations (index)
* Une colonne `Longitude` et une colonne `Latitude` exprimées en `CRS WGS84` qu'il faudra convertir en `Mercator`
* __165__ colonnes correspondant aux années dont la valeur est la valeur de sortie du modèle (ou probabilité de présence) comprise entre 0 et 1, avec 1 étant le maximum (i.e. l'espèce rencontre toutes les conditions environnementales requises pour être présente; 0 = l'espèce ne peut pas être présente).

In [2]:
# Start the timer
#################

start_sec = time.time()  # wall-clock start; subtracted from the end time in the last cell

In [3]:
# Load the data
###############

# The input file uses ';' as its field separator.
df = pd.read_csv(
    'dataset_sprat_light_v2.csv',
    sep=';',
)

In [4]:
# Check the number of observations (rows) right after loading
#############################################################

df.shape[0]

689301

In [5]:
# Handle missing values (NaN)
#############################

# Discard every row that contains at least one NaN.
df = df.dropna(axis=0, how='any')

In [6]:
# Re-check the number of observations (rows) after dropping NaN rows
####################################################################

df.shape[0]

689301

In [7]:
# Split the geographic data from the modelled values
####################################################

# `df_map` : geographic data (the first two columns of `df`)
# `df_val` : modelled values (every remaining column of `df`)

df_map, df_val = df.iloc[:, :2], df.iloc[:, 2:]

In [8]:
# Rescaling (= division by 100)
###############################

# Derive the scaled values from `df` (every column after the first two) rather than
# from `df_val` itself, so that re-running this cell alone does not divide the
# values by 100 a second time. On a top-to-bottom run the result is unchanged.
df_val = df.iloc[:, 2:] / 100

In [9]:
# Count the non-zero observations
#################################

# To narrow the dataset down to the non-zero observations, count the rows
# whose sum over the yearly columns is different from zero.
# This gives '43701' non-zero observations, which are stored in 'df_data'.

df_val.sum(axis=1).ne(0).sum()

43701

In [10]:
# Extract the non-zero observations into 'df_data'
##################################################

# Keep only the rows whose sum over all value columns is non-zero.
df_data = df_val.loc[df_val.sum(axis=1).ne(0)]
df_data.head()

Unnamed: 0,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
1322,0.041093,0.018304,0.017149,0.02112,0.065412,0.015823,0.01642,0.015823,0.015823,0.015823,...,0.160507,0.016118,0.214745,0.161338,0.370544,0.132713,0.015823,0.063486,0.354522,0.156443
1323,0.03996,0.017808,0.016884,0.020613,0.063224,0.015823,0.0163,0.015823,0.015823,0.015823,...,0.155331,0.016059,0.211703,0.155974,0.355953,0.132422,0.015823,0.061177,0.345742,0.152188
1324,0.03314,0.015823,0.015823,0.018022,0.052629,0.015823,0.015823,0.015823,0.015823,0.015823,...,0.14741,0.015823,0.209675,0.147886,0.346,0.127796,0.015823,0.050573,0.333046,0.145079
1325,0.024041,0.015823,0.015823,0.016453,0.042319,0.015823,0.015823,0.015823,0.015823,0.015823,...,0.139507,0.015823,0.193969,0.139935,0.331154,0.115514,0.015823,0.040504,0.317952,0.137416
1329,0.019685,0.015823,0.015823,0.015911,0.033397,0.015823,0.015823,0.015823,0.015823,0.015823,...,0.134469,0.015823,0.188689,0.135005,0.319891,0.105846,0.015823,0.031972,0.309082,0.131846


In [11]:
# Count the zero observations
#############################

# A row of zeros corresponds to a zero probability of presence of the species.
# That is meaningful information and must be kept, in particular if values
# are later aggregated over a geographic area.

# This gives '645600' zero observations, which are isolated and for which
# only the 'Longitude' and 'Latitude' columns are kept.

df_val.sum(axis=1).eq(0).sum()

645600

In [12]:
# Boolean selection: keep the rows of 'df_map' whose index is NOT in the index of 'df_data'.
# The zero values themselves are dropped, but the original index is preserved,
# so these points can still be visualised later and no scientific information is lost.

df_null = df_map.loc[~df_map.index.isin(df_data.index)]

### Fusion avec les données géographiques
On est donc en mesure de produire (2) datasets finaux : 
* `df_data` composé de __43701__ observations,
* `df_null` composé de __645600__ observations nulles mais nécessaires dans notre démarche scientifique.

In [13]:
# Attach the location data
##########################

# Index-on-index inner join: each non-zero row gets the coordinate columns from 'df_map'.
# NOTE: re-running this cell on its own would merge the coordinate columns a second time.
df_data = pd.merge(
    df_data,
    df_map,
    how='inner',
    left_index=True,
    right_index=True,
)

In [14]:
# Reorder the columns
#####################

# Put the geographic columns first, picking them by name (taken from 'df_map')
# instead of assuming they are the last two columns. A positional rotation
# (last two columns moved to the front) would shift two more columns on every
# re-run of this cell; selecting by name keeps the cell idempotent.
geo_cols = list(df_map.columns)
cols = geo_cols + [col for col in df_data.columns if col not in geo_cols]
df_data = df_data[cols]

df_data.head()

Unnamed: 0,Longitude,Latitude,1850,1851,1852,1853,1854,1855,1856,1857,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
1322,-89.8,47.0,0.041093,0.018304,0.017149,0.02112,0.065412,0.015823,0.01642,0.015823,...,0.160507,0.016118,0.214745,0.161338,0.370544,0.132713,0.015823,0.063486,0.354522,0.156443
1323,-89.8,47.1,0.03996,0.017808,0.016884,0.020613,0.063224,0.015823,0.0163,0.015823,...,0.155331,0.016059,0.211703,0.155974,0.355953,0.132422,0.015823,0.061177,0.345742,0.152188
1324,-89.8,47.2,0.03314,0.015823,0.015823,0.018022,0.052629,0.015823,0.015823,0.015823,...,0.14741,0.015823,0.209675,0.147886,0.346,0.127796,0.015823,0.050573,0.333046,0.145079
1325,-89.8,47.3,0.024041,0.015823,0.015823,0.016453,0.042319,0.015823,0.015823,0.015823,...,0.139507,0.015823,0.193969,0.139935,0.331154,0.115514,0.015823,0.040504,0.317952,0.137416
1329,-89.8,47.7,0.019685,0.015823,0.015823,0.015911,0.033397,0.015823,0.015823,0.015823,...,0.134469,0.015823,0.188689,0.135005,0.319891,0.105846,0.015823,0.031972,0.309082,0.131846


In [15]:
# Preview the first rows of the zero-observation coordinates.
df_null.head(n=5)

Unnamed: 0,Longitude,Latitude
0,-90.0,25.0
1,-90.0,25.1
2,-90.0,25.2
3,-90.0,25.3
4,-90.0,25.4


### Création des exports CSV finaux
On exporte enfin notre travail sous la forme de (2) fichiers __csv__ et __compressés sans perte d'information__ :
* `export_data_sprat_data.csv` (69.5 Mo)
* `export_data_sprat_null.csv` (6.5 Mo)

soit un total de __76.0Mo__, soit une compression significative de __72%__

In [16]:
# Write both final datasets to CSV; the index is written too (index=True).
df_data.to_csv(path_or_buf='export_data_sprat_data.csv', index=True)
df_null.to_csv(path_or_buf='export_data_sprat_null.csv', index=True)

In [17]:
# Stop the timer
################

end_sec = time.time()

# Elapsed wall-clock time since 'start_sec', rounded to two decimals.
elapsed_sec = round(end_sec - start_sec, 2)

# The printed message is French for "Computation done in <n> seconds".
print('Calculs effectués en %s secondes' % elapsed_sec)

Calculs effectués en 160.71 secondes
