In [19]:
'''
-----------------------------
STATIONS - PESTICIDES - STATS
-----------------------------

GOAL : notebook python functions to add at root (app initialization on run.py) 
create panda objects / implement query functions / export to JSON 
for data analysis and visualization

- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES
- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)
- CLEAN AND MERGE DATA
- QUERY FUNCTIONS
- EXPORT FUNCTIONS (JSON)

AUTHOR : Julien Paris
DATE   : 24/12/2016

TO DO : 
- 
'''

'\n-----------------------------\nSTATIONS - PESTICIDES - STATS\n-----------------------------\n\nGOAL : notebook python functions to add at root (app initialization on run.py) \ncreate panda objects / implement query functions / export to JSON \nfor data analysis and visualization\n\n- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES\n- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)\n- CLEAN AND MERGE DATA\n- QUERY FUNCTIONS\n- EXPORT FUNCTIONS (JSON)\n\nAUTHOR : Julien Paris\nDATE   : 24/12/2016\n\nTO DO : \n- \n'

In [20]:
### import standard libraries
import os
import itertools
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

# pyproj settings to convert coordinates
from pyproj import Proj, transform
inProj  = Proj(init='epsg:2154') # proj in  : Lambert 93
outProj = Proj(init='epsg:4326') # proj out : WSG 84

In [21]:
### basic folders addresses and names
cwd = os.getcwd()

data_folder = "app/static/data"
stats_folder = "stats"

stats_path = os.path.join(cwd, data_folder, stats_folder)

print "-- cwd :", cwd
print "-- stats path : ", stats_path

for file in os.listdir(stats_path):
    if file.endswith(".csv") or file.endswith(".xlsx"):
        print "--- dataset in '/data' : ", file
        #print cwd+datas_folder+"/"+file

-- cwd : /Users/jpy/Dropbox/_FLASK/concours_pesticides
-- stats path :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2007.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2008.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2009.csv
--- dataset in '/data' :  moy_tot_quantif_2007.csv
--- dataset in '/data' :  moy_tot_quantif_2008.csv
--- dataset in '/data' :  moy_tot_quantif_2009.xlsx
--- dataset in '/data' :  moy_tot_quantif_2010.csv
--- dataset in '/data' :  moy_tot_quantif_2011.csv
--- dataset in '/data' :  moy_tot_quantif_2012.csv
--- dataset in '/data' :  pesticides.csv
--- dataset in '/data' :  stations.csv


In [22]:
### panda dataframes for every db + settings

# common code for all pesticcides
all_pesticides_code = "XXXXXX"

# set encoding of .csv (keep accents)
csv_encoding = "latin-1"

#np.array = time_frame

# root strings for datas names
root_mct = "df_mct_"
root_ma  = "df_ma_"

# list of datas filenames
datas_stations   = "stations.csv"
datas_pesticides = "pesticides.csv"

datas_MCT = [
    "moy_tot_quantif_2007.csv",
    "moy_tot_quantif_2008.csv",
    "moy_tot_quantif_2009.xlsx", #### 
    "moy_tot_quantif_2010.csv",
    "moy_tot_quantif_2011.csv",
    "moy_tot_quantif_2012.csv",
]

datas_MCT = [
    "moy_tot_quantif_2007.xlsx",
    "moy_tot_quantif_2008.xlsx",
    "moy_tot_quantif_2009.xlsx",  
    "moy_tot_quantif_2010.xlsx",
    "moy_tot_quantif_2011.xlsx",
    "moy_tot_quantif_2012.xlsx",
]

datas_MA = [
    "ma_qp_fm_ttres_pesteso_2007.csv",
    "ma_qp_fm_ttres_pesteso_2008.csv",
    "ma_qp_fm_ttres_pesteso_2009.csv",
    "ma_qp_fm_rcsrco_pesteso_2010.csv",
    "ma_qp_fm_rcsrco_pesteso_2011.csv",
    "ma_qp_fm_rcsrco_pesteso_2012.csv",
]

datas_MA_excel = [
    "ma_qp_fm_ttres_pesteso_2007.xlsx",
    "ma_qp_fm_ttres_pesteso_2008.xlsx",
    "ma_qp_fm_ttres_pesteso_2009.xlsx",
    "ma_qp_fm_rcsrco_pesteso_2010.xlsx",
    "ma_qp_fm_rcsrco_pesteso_2011.xlsx",
    "ma_qp_fm_rcsrco_pesteso_2012.xlsx",
]



In [23]:
# set time frame
years   = {"ANNEE" : [2007, 2008, 2009, 2010, 2011, 2012 ] }



In [24]:
### functions : cleaning operations on dataframes

idx = pd.IndexSlice

def stat_file_path(filename):
    path = os.path.join(stats_path, filename)
    return path 


def checkDTypes (df) :
    # check data type
    
    for index in df.index.names :
        print "---- index : ", index

    for col in df.columns :
        #label = col.values
        dtype = df[col].dtype
        
        print "---- dtypes col : ", col, "/", dtype
        

In [25]:
def comas2points(df, list_col_names="all_col"): 
    # convert all weird "," to "." and then to float values
    
    if list_col_names == "all_col" : 
        df.loc[:, :] = df.replace(to_replace=',', value='.', regex=True)
    else : 
        df.loc[:, list_col_names ] = df.loc[:,list_col_names].replace(to_replace=',', value='.', regex=True)
    return df


def ints2floats(df, list_col_names, to="float") :
    
    if to == "float":
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(float)
    elif to == "int" :
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(int)        
    return df


In [26]:
def dfCleanNa(df_list): 
    # clean from NaN values if entire row is NaN
    
    df_list_clean = []
    for df in df_list :
        df_cleaned_01 = df.dropna(how="all") # on empty rows
        df_cleaned_02 = df_cleaned_01.dropna( axis=1, how="all") # on empty columns
        df_list_clean.append(df_cleaned_02)
    
    return df_list_clean


In [27]:

#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#
#   -- DATAS TO DATA FRAMES --                         #
#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#


In [28]:

########################################################
########################################################
########################################################
### -- DF_PESTICIDES --
########################################################
########################################################
########################################################


In [29]:
functions_cols= ["CODE_FONCTION","LIBELLE CODE_FONCTION"]

functions_split = {
    "A"   : "A",
    "B"   : "B",
    "BF"  : "B,F",
    "F"   : "F",
    "FA"  : "F,A",
    "FHM" : "F,H,M",
    "FN"  : "F,N",
    "H"   : "H",
    "I"   : "I",
    "IA"  : "I,A",
    "IAFH": "I,A,F,H",
    "IAM" : "I,A,M",
    "IAN" : "I,A,N",
    "IM"  : "I,M",
    "IN"  : "I,N",
    "Ireg": "I,Reg",
    "N"   : "N",
    "R"   : "R",
    "Reg" : "Reg",
    "RepO": "RepO",
    "Ro"  : "Ro",
    "HFNI": "H,F,N,I",
    "HG"  : "H,G"
}

functions_full = {
    "A"    : "Acaricide",
    "B"    : "Biocide",
    "BF"   : "Biocide, Fongicide",
    "F"    : "Fongicide",
    "FA"   : "Fongicide, Acaricide",
    "FHM"  : "Fongicide, Herbicide, Mollusticide",
    "FN"   : "Fongicide, Nématicide",
    "H"    : "Herbicide",
    "I"    : "Insecticide",
    "IA"   : "Insecticide, Acaricide",
    "IAFH" : "Insecticide, Acaricide, Fongicide, Herbicide",
    "IAM"  : "Insecticide, Acaricide, Mollusticide",
    "IAN"  : "Insecticide, Acaricide, Nématicide",
    "IM"   : "Insecticide, Mollusticide",
    "IN"   : "Insecticide, Nématicide",
    "Ireg" : "Insecticide, Régulateur de croissance",
    "N"    : "Nématicide",
    "R"    : "Rodenticide",
    "Reg"  : "Régulateur de croissance",
    "RepO" : "Répulsif",
    "Ro"   : "Rodenticide",
    "HFNI" : "Herbicide, Fongicide, Nématicide, Insecticide",
    "HG"   : "Herbicide, Graminicide"
}


functions_light = {
    "A"   : "Acaricide",
    "B"   : "Biocide",
    "F"   : "Fongicide",
    "H"   : "Herbicide",
    "I"   : "Insecticide",
    "M"   : "Mollusticide",
    "N"   : "Nématicide",
    "R"   : "Rodenticide",
    "Reg" : "Régulateur de croissance",
    #"reg" : "Régulateur de croissance",
    "RepO": "Répulsif",
    "Ro"  : "Rodenticide",
    "G"   : "Graminicide"
}

### optional
df_functions = pd.Series(functions_light, name="LIBELLE_CODE_FONCTION")
df_functions.index.name = 'CODE_FONCTION'
df_functions.reset_index()
df_functions = df_functions.to_frame()

df_functions #["A"]


Unnamed: 0_level_0,LIBELLE_CODE_FONCTION
CODE_FONCTION,Unnamed: 1_level_1
A,Acaricide
B,Biocide
F,Fongicide
G,Graminicide
H,Herbicide
I,Insecticide
M,Mollusticide
N,Nématicide
R,Rodenticide
Reg,Régulateur de croissance


In [30]:
# read pesticides list

df_pesticides = pd.read_csv( stat_file_path("pesticides.csv"), sep=";", encoding=csv_encoding )
df_pesticides = comas2points(df_pesticides, ["NORME_DCE"])
df_pesticides = ints2floats (df_pesticides, ["NORME_DCE"])

##### dates  : col "DATE_NA_USAGE"
df_pesticides["DATE_NA_USAGE"] = pd.to_datetime(df_pesticides["DATE_NA_USAGE"], infer_datetime_format=True, errors='coerce')

##### index 2 by function : col "CODE_FONCTION"
df_pesticides["CODE_FONCTION"].replace(functions_split, inplace=True)

# set index : CODE_PESTICIDE 
df_pesticides.set_index(["CODE_FAMILLE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
#df_pesticides.set_index(["CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
df_pesticides.sort_index(inplace=True) 

#df_pesticides["CODE_FONCTION"].head(10)


In [31]:
pest_famille_list = list(df_pesticides.index.levels[0])
print "-- len pest_famille_list", len(pest_famille_list)
print pest_famille_list
print

print " -- df_pesticides.index.names    : ", df_pesticides.index.names
print " -- df_pesticides.index.values   : ", df_pesticides.index.values
print " -- df_pesticides.columns.values : ", df_pesticides.columns.values
print " -- df_pesticides.columns        : ", df_pesticides.columns
print 

checkDTypes(df_pesticides)



-- len pest_famille_list 31
[u'Ald\xe9hydes et c\xe9tones', u'Amides', u'Amines', u'Autres \xe9l\xe9ments min\xe9raux', u'Azoles', u'Benz\xe8ne et d\xe9riv\xe9s', u'COHV, solvants chlor\xe9s, fr\xe9ons', u'Carbamate', u'Carbamates', u'Carbamates et thiocarbamates', u'Chloroacetamide ', u'Chloroac\xe9tamide', u'Chloroalcanes', u'Compos\xe9s ph\xe9noliques', u'Diazines', u'Divers (organiques)', u'Fongicides', u'Hydrocarbures et indices li\xe9s', u'Inconnu', u'Indices', u'Metaux et m\xe9tallo\xefdes', u'Organochlor\xe9s', u'Organom\xe9talliques', u'Organophosphor\xe9s', u'Pyridines', u'Pyr\xe9thrino\xefdes', u'Quinazolinones', u'Triazines et m\xe9tabolites', u'Triazoles', u'Triazolopyrimidines sulfonamides', u'Ur\xe9es']

 -- df_pesticides.index.names    :  [u'CODE_FAMILLE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_pesticides.index.values   :  [(nan, 6276, u'Somme pesticides analyses')
 (nan, 6824, u'N,N-Dimet-tolylsulphamid') (nan, 6856, u'Acetochlor ESA')
 ..., (u'Ur\xe9es', 9055, u'1-(

In [32]:
### test slicing
df_pesticides.loc[ idx[:,1130] , : ]["CODE_FONCTION"] #.head(3)


CODE_FAMILLE  CD_PARAMETRE  LB_PARAMETRE
Carbamates    1130          Carbofuran      I,N
Name: CODE_FONCTION, dtype: object

In [33]:
df_pesticides.loc[ idx[:,1432:1474], :] #.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NOM_PARAM2,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE
CODE_FAMILLE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Carbamates,1463,Carbaryl,Carbaryl,"I,Reg",PNA,,,,63-25-2,2008-11-20,C12H11NO2,0.1
Carbamates,1474,Chlorprophame,Chlorprophame,Reg,PA,,,,101-21-3,NaT,C10H12ClNO2,0.1
Divers (organiques),1432,Pyriméthanil,Pyriméthanil,F,PA,,,,53112-28-0,NaT,C12H13N3,0.1
Divers (organiques),1473,Chlorothalonil,Chlorothalonil,F,PA,,,,1897-45-6,NaT,C8Cl4N2,0.1
Organochlorés,1472,Chloropicrine,Chloropicrine,"F,N",PNA,,,,76-06-2,NaT,CCl3NO2,0.1
Organophosphorés,1464,Chlorfenvinphos,Chlorfenvinphos,I,PNA,,,,470-90-6,2007-12-31,C12H14Cl3O4P,0.1


In [34]:
df_pesticides.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1046 entries, (nan, 6276, Somme pesticides analyses) to (Urées, 99012, Isoproturon-2CH3)
Data columns (total 10 columns):
NOM_PARAM2       575 non-null object
CODE_FONCTION    940 non-null object
STATUT           725 non-null object
METABOLITE       104 non-null object
PARENT           101 non-null object
NOM_PARENT       99 non-null object
CODE_CAS         968 non-null object
DATE_NA_USAGE    201 non-null datetime64[ns]
FORMULEB         540 non-null object
NORME_DCE        1028 non-null float64
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 103.5+ KB


In [35]:
#df_pesticides.memory_usage()

In [36]:

########################################################
########################################################
########################################################
### -- DF_STATIONS -- 
########################################################
########################################################
########################################################


In [43]:

lab_stations = "INFOS"
df_stations  = pd.read_csv( stat_file_path(datas_stations), sep=";", encoding=csv_encoding , na_values=[""] )

'''
IMPORTANT : 
name column to link to carto (.shp file) : 
"CD_ME_v2" | "CD_ME_niv1_surf"

for instance : 
"DG330" in column "CD_ME_v2" | "CD_ME_niv1_surf" in df_stations
... corresponds to :
"DG330" in column "CdMasseDEa" in gdf object (geopandas from .shp file)

''' 

# add columns CD_PARAMETRE, LB_PARAMETRE
#df_stations["CD_PARAMETRE"] = 99999
#df_stations["LB_PARAMETRE"] = "all pesticides"

# get columns labels
#col_labels_stations = list(df_stations.columns.values)
#print " -- col_labels :", df_stations[0:5]

# add multilevel hierarchy on columns
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations, "NO_DATE"])
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations])

to_float = ["ALTITUDE", "PROFONDEUR_MAXI_POINT", "X_FICT_L93", "Y_FICT_L93"]

df_stations = comas2points(df_stations, to_float)
df_stations = ints2floats (df_stations, to_float)

#print "-- indices names :", df_stations.index.name

#print df_stations["Unnamed: 26"].unique()
df_stations.drop('Unnamed: 26', axis=1, inplace=True)


In [44]:
### FOR CARTO : add column for long lat in WSG84

def convertCoordinates(row):
    x1,y1 = row["X_FICT_L93"], row["Y_FICT_L93"]
    convertedCoord = transform(inProj,outProj, x1, y1)
    return list(convertedCoord)

def extractFromList(index):
    value = row[colName][index]
    print value
    return value

df_stations["COORD_WSG84"] = df_stations.apply(convertCoordinates,axis=1)
#df_stations["LAT_WSG84"]   = df_stations.apply(lambda row: extractFromList(row['COORD_WSG84'], 0), axis=1)
#df_stations["LONG_WSG84"]  = df_stations.apply(extractFromList(index=1),axis=1)

## cf : http://chrisalbon.com/python/pandas_expand_cells_containing_lists.html
# expand df.tags into its own dataframe
coord = df_stations['COORD_WSG84'].apply(pd.Series)
# rename each variable is tags
#coord = coord.rename(columns = lambda x : 'COORD_' + str(x))
coord.columns = ["LAT_WSG84","LONG_WSG84"] 
#print coord.head()

print coord.head()
print 

# copy CD_STATION column for further uses
df_stations["CD_STATION_"] = df_stations["CD_STATION"]

# join the tags dataframe back to the original dataframe
df_stations = pd.concat( [df_stations, coord], axis=1, join="outer" )
#df_stations.head(3)


   LAT_WSG84  LONG_WSG84
0   5.452862   46.270740
1   5.781881   45.793046
2   5.772809   45.785001
3   5.788505   45.844201
4   5.074473   45.836095



In [45]:
# set indexes for stations
#df_stations.set_index( ["CD_STATION"], inplace=True) 
df_stations.set_index(["NUM_DEP", "NOM_COM",  "CD_ME_niv1_surf", "CD_ME_v2", "CD_STATION"], inplace=True) 
df_stations.sort_index(inplace=True) 

print "-- df_stations.shape : ", df_stations.shape
checkDTypes(df_stations)


-- df_stations.shape :  (13039, 25)
---- index :  NUM_DEP
---- index :  NOM_COM
---- index :  CD_ME_niv1_surf
---- index :  CD_ME_v2
---- index :  CD_STATION
---- dtypes col :  NUM_COM / object
---- dtypes col :  codagence / object
---- dtypes col :  ALTITUDE / float64
---- dtypes col :  PROFONDEUR_MAXI_POINT / float64
---- dtypes col :  Unité_coord_fictifs / object
---- dtypes col :  X_FICT_L93 / float64
---- dtypes col :  Y_FICT_L93 / float64
---- dtypes col :  reseau2009 / object
---- dtypes col :  reseau2010 / object
---- dtypes col :  reseau2011 / object
---- dtypes col :  reseau2012 / object
---- dtypes col :  reseau2013 / object
---- dtypes col :  reseau2014 / object
---- dtypes col :  fi_ma_2007 / object
---- dtypes col :  fi_ma_2008 / object
---- dtypes col :  fi_ma_2009 / object
---- dtypes col :  fi_ma_2010 / object
---- dtypes col :  fi_ma_2011 / object
---- dtypes col :  fi_ma_2012 / object
---- dtypes col :  fi_ma_2013 / object
---- dtypes col :  fi_ma_2014 / object
---- 

In [46]:
df_stations.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13039 entries, (nan, nan, nan, nan, 10261X0039/F3) to (95, WY-DIT-JOLI-VILLAGE, HG107, nan, 01521X0004/HY)
Data columns (total 25 columns):
NUM_COM                  13036 non-null object
codagence                13039 non-null object
ALTITUDE                 13029 non-null float64
PROFONDEUR_MAXI_POINT    7925 non-null float64
Unité_coord_fictifs      13036 non-null object
X_FICT_L93               13039 non-null float64
Y_FICT_L93               13039 non-null float64
reseau2009               13001 non-null object
reseau2010               5741 non-null object
reseau2011               2263 non-null object
reseau2012               2292 non-null object
reseau2013               2300 non-null object
reseau2014               2399 non-null object
fi_ma_2007               7989 non-null object
fi_ma_2008               4977 non-null object
fi_ma_2009               7766 non-null object
fi_ma_2010               1876 non-null object
fi_ma_2011       

In [48]:
df_stations.head(8)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
,,,CG004,01688X0034/AVAL,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
,,,CG004,01688X0039/F1,,AERM,170.0,80.0,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
1.0,AMBERIEU-EN-BUGEY,DG149,DG149,06758X0052/HY,1004.0,AERM&C,310.0,,01004_ _FRDG149,883079.0129,6544021.331,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[5.3646951139, 45.97123764]",06758X0052/HY,5.364695,45.971238
1.0,AMBLEON,DG149,DG149,07007X0001/006A,1006.0,AERM&C,420.0,,01006_ _FRDG149,900470.4782,6520388.322,Hors RCS et RCO,,,...,,,,,,,"[5.5792072567, 45.7536265846]",07007X0001/006A,5.579207,45.753627
1.0,AMBRONAY,DG389,,06754X0040/007A,1007.0,AERM&C,243.0,12.5,01007_FRDG240_FRDG389,880412.7779,6547074.383,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[5.3314414811, 45.9994318351]",06754X0040/007A,5.331441,45.999432
1.0,AMBRONAY,DG389,DG389,06754X0065/P2,1007.0,AERM&C,243.0,21.0,01007_FRDG240_FRDG389,879122.8073,6546538.623,RCS,RCSseul,RCSseul,...,,,,,oui,,"[5.31457812121, 45.9949511315]",06754X0065/P2,5.314578,45.994951
1.0,AMBRONAY,DG389,DG389,06754X0071/P00060,1007.0,AERM&C,243.0,,01007_FRDG240_FRDG389,881453.6578,6545484.554,Hors RCS et RCO,,,...,oui,,,,,,"[5.3442749281, 45.9848443515]",06754X0071/P00060,5.344275,45.984844


In [49]:

########################################################
########################################################
########################################################
### -- DF_MCT (moy concentrations totales)  --
########################################################
########################################################
########################################################


In [167]:
lab_MCT = "MCT"

## read datas MCT

### WARNING : READ .XLSX INSTEAD OF .CSV (BAD DATAS IN .CSV)

df_mct_2007 = pd.read_csv(  stat_file_path(datas_MCT[0]), sep=";") #,  index_col=[1,0])
df_mct_2008 = pd.read_csv(  stat_file_path(datas_MCT[1]), sep=";") #,  index_col=[1,0])
df_mct_2009 = pd.read_excel(stat_file_path(datas_MCT[2]), sep=";") #,  index_col=[1,0])
df_mct_2010 = pd.read_csv(  stat_file_path(datas_MCT[3]), sep=";") #,  index_col=[1,0])
df_mct_2011 = pd.read_csv(  stat_file_path(datas_MCT[4]), sep=";") #,  index_col=[1,0])
df_mct_2012 = pd.read_csv(  stat_file_path(datas_MCT[5]), sep=";") #,  index_col=[1,0])


In [168]:
#df_mct_2007.shape
print " -- df_mct_2007.index.names : ", df_mct_2007.index.names
print " -- df_mct_2007.columns     : ", df_mct_2007.columns

#df_mct_2007.head()

 -- df_mct_2007.index.names :  [None]
 -- df_mct_2007.columns     :  Index([u'ANNEE', u'CD_STATION', u'NBPREL', u'MOYPTOT', u'MAXPTOT',
       u'MINMOLRECH', u'MAXMOLRECH', u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')


In [169]:
#df_mct_2008.head() 

#df_ = df_mct_2010.dropna(how="all")
#df_.loc[:, ("ANNEE")] = df_.loc[:, ("ANNEE")].astype(int)
#df_.head() 

In [170]:
### merge all MCT datas with multiIndex
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-multiple-dataframe-or-panel-objects
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-with-two-multi-indexes
# options/alternatives : .merge .join .concat .append

frames_mct = [df_mct_2007,df_mct_2008, df_mct_2009, df_mct_2010, df_mct_2011, df_mct_2012]

# clean from NaN values if entire row is NaN
frames_mct_cleaned = dfCleanNa(frames_mct)
    
df_MCT = pd.concat(frames_mct_cleaned)

# convert all year column data to integers
df_MCT = ints2floats(df_MCT, ["ANNEE"], to="int")

# convert all weird "," to "." and then to float values
df_MCT   = comas2points(df_MCT)
to_float = ['NBPREL', 'MOYPTOT', 'MAXPTOT', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ']
df_MCT   = ints2floats(df_MCT, to_float)

# add column CD_PARAMETRE, LB_PARAMETRE
df_MCT["CD_PARAMETRE"] = all_pesticides_code
df_MCT["LB_PARAMETRE"] = "all_pesticides"


# set index hierarchy
#df_MCT.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MCT.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

print " -- df_MCT.index.names    : ", df_MCT.index.names
print " -- df_MCT.index.values   : ", df_MCT.index.values
print " -- df_MCT.columns.values : ", df_MCT.columns.values
print " -- df_MCT.columns        : ", df_MCT.columns

df_MCT.sort_index(inplace=True) 


 -- df_MCT.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MCT.index.values   :  [('00054X0169/F1', 2007, 'XXXXXX', 'all_pesticides')
 ('00057X0245/F1', 2007, 'XXXXXX', 'all_pesticides')
 ('00057X0248/F4', 2007, 'XXXXXX', 'all_pesticides') ...,
 ('11056X0123/FIGA', 2012, 'XXXXXX', 'all_pesticides')
 ('11195X0147/FITTEL', 2012, 'XXXXXX', 'all_pesticides')
 ('11221X0134/TRAVO', 2012, 'XXXXXX', 'all_pesticides')]
 -- df_MCT.columns.values :  ['NBPREL' 'MOYPTOT' 'MAXPTOT' 'MINMOLRECH' 'MAXMOLRECH' 'MINMOLQ' 'MAQMOLQ']
 -- df_MCT.columns        :  Index([u'NBPREL', u'MOYPTOT', u'MAXPTOT', u'MINMOLRECH', u'MAXMOLRECH',
       u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')


In [171]:
df_MCT["MOYPTOT_YEAR"] = np.NaN

print df_MCT.shape 
checkDTypes(df_MCT)

df_MCT.head(20)

(11144, 8)
---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBPREL / float64
---- dtypes col :  MOYPTOT / float64
---- dtypes col :  MAXPTOT / float64
---- dtypes col :  MINMOLRECH / float64
---- dtypes col :  MAXMOLRECH / float64
---- dtypes col :  MINMOLQ / float64
---- dtypes col :  MAQMOLQ / float64
---- dtypes col :  MOYPTOT_YEAR / float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00054X0169/F1,2007,XXXXXX,all_pesticides,4.0,0.0,0.0,18.0,96.0,0.0,0.0,
00054X0169/F1,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,64.0,64.0,0.0,0.0,
00057X0245/F1,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,18.0,96.0,0.0,0.0,
00057X0245/F1,2008,XXXXXX,all_pesticides,1.0,0.02,0.02,60.0,60.0,1.0,1.0,
00057X0245/F1,2010,XXXXXX,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0,
00057X0245/F1,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,64.0,64.0,0.0,0.0,
00057X0248/F4,2007,XXXXXX,all_pesticides,2.0,0.02,0.04,61.0,96.0,0.0,1.0,
00057X0248/F4,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,60.0,60.0,0.0,0.0,
00057X0248/F4,2009,XXXXXX,all_pesticides,1.0,0.0,0.0,62.0,62.0,0.0,0.0,
00057X0248/F4,2010,XXXXXX,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0,


In [27]:

########################################################
########################################################
########################################################
### -- DF_MA (moy analyses)  --
########################################################
########################################################
########################################################


In [28]:
lab_MA = "MA"

def multilevel_MA (df, year):

    # set indexes : STATION and CODE_PESTICIDE
    #df.set_index(["CD_STATION", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
    #df.set_index(["CD_STATION"], inplace=True)

    # get columns labels for df_ma
    #col_labels_df_ma = list(df.columns.values)
    #print " -- col_labels :", col_labels_df_ma

    # add multilevel hierarchy on columns
    #df.columns = pd.MultiIndex.from_product([lab_MA, col_labels_df_ma, year])
    
    # convert all weird "," to "." and then to float values
    to_float = ["MA_MOY", "NORME_DCE"]
    df       = comas2points(df, to_float)
    df       = ints2floats (df, to_float)

    df["ANNEE"] = year
    
    return df


In [29]:
### WARNING : READ .XLSX INSTEAD OF .CSV (BAD DATAS IN .CSV)

df_ma_2007 = pd.read_csv( stat_file_path(datas_MA[0]), sep=";", encoding = csv_encoding )
df_ma_2007 = multilevel_MA(df_ma_2007, 2007)
    
df_ma_2008 = pd.read_csv( stat_file_path(datas_MA[1]), sep=";", encoding = csv_encoding)
df_ma_2008 = multilevel_MA(df_ma_2008, 2008)

df_ma_2009 = pd.read_csv( stat_file_path(datas_MA[2]), sep=";", encoding = csv_encoding)
df_ma_2009 = multilevel_MA(df_ma_2009, 2009)

df_ma_2010 = pd.read_csv( stat_file_path(datas_MA[3]), sep=";", encoding = csv_encoding)
df_ma_2010 = multilevel_MA(df_ma_2010, 2010)

df_ma_2011 = pd.read_csv( stat_file_path(datas_MA[4]), sep=";", encoding = csv_encoding)
df_ma_2011 = multilevel_MA(df_ma_2011, 2011)

df_ma_2012 = pd.read_csv( stat_file_path(datas_MA[5]), sep=";", encoding = csv_encoding)
df_ma_2012 = multilevel_MA(df_ma_2012, 2012)


In [30]:
#df_ma_2010.head() 

#df_ma_2011.head() 

#df_ma_2012.head() 

In [31]:
### merge all MA datas 

frames_MA = [df_ma_2007, df_ma_2008, df_ma_2009, df_ma_2010, df_ma_2011, df_ma_2012]

# clean from NaN values if entire row is NaN
frames_MA_cleaned = dfCleanNa(frames_MA)

# concatenate datas MA
df_MA = pd.concat(frames_MA_cleaned)

# set index hierarchy
#df_MA.set_index(["CD_STATION"], inplace=True)
#df_MA.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MA.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

#df_MA.sort_index(inplace=True) 
df_MA.sortlevel(inplace=True) 

print " -- df_MA.index.names    : ", df_MA.index.names
print " -- df_MA.index.values   : ", df_MA.index.values
print " -- df_MA.columns.values : ", df_MA.columns.values
print " -- df_MA.columns        : ", df_MA.columns


 -- df_MA.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MA.index.values   :  [(u'00053X0002/SO1', 2007, 1102, u'Aldicarbe')
 (u'00053X0002/SO1', 2007, 1107, u'Atrazine')
 (u'00053X0002/SO1', 2007, 1108, u'Atrazine d\xe9s\xe9thyl') ...,
 (u'11282X0005/ARAGUI', 2007, 2924, u'Benfuracarbe')
 (u'11282X0005/ARAGUI', 2007, 2951, u'Iprovalicarb')
 (u'11282X0005/ARAGUI', 2007, 5475, u'Thiofanox sulfoxyde')]
 -- df_MA.columns.values :  [u'NBANASPERTS1' u'MA_MOY' u'NBQUANTIF' u'NORME_DCE']
 -- df_MA.columns        :  Index([u'NBANASPERTS1', u'MA_MOY', u'NBQUANTIF', u'NORME_DCE'], dtype='object')


In [32]:
### MA : add columns for averages and custom indicators
df_MA["MA_MOY_YEAR"] = np.NaN


In [33]:
checkDTypes(df_MA)

df_MA.head(25)


---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBANASPERTS1 / int64
---- dtypes col :  MA_MOY / float64
---- dtypes col :  NBQUANTIF / int64
---- dtypes col :  NORME_DCE / float64
---- dtypes col :  MA_MOY_YEAR / float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1,
00053X0002/SO1,2007,1136,Chlortoluron,1,0.01,0,0.1,
00053X0002/SO1,2007,1137,Cyanazine,1,0.01,0,0.1,
00053X0002/SO1,2007,1177,Diuron,1,0.01,0,0.1,
00053X0002/SO1,2007,1205,Ioxynil,1,0.025,0,0.1,
00053X0002/SO1,2007,1208,Isoproturon,1,0.01,0,0.1,
00053X0002/SO1,2007,1209,Linuron,1,0.01,0,0.1,


In [34]:

########################################################
########################################################
########################################################
### --- QUERIES ON DFs
########################################################
########################################################
########################################################


In [35]:
## cf : http://pandas.pydata.org/pandas-docs/stable/indexing.html#the-query-method-experimental

def queryByIndexValue (df, indexName, indexLabelList):
    queryString = '%s in %s' %( indexLabelList, indexName)
    #print queryString
    result = df.query(queryString)
    return result
#df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

def queryByColValue (df, colName, comparator, colValue ):
    queryString = '(%s %s %s)' %( colName, comparator, colValue)
    #print queryString
    result = df.query(queryString)
    return result


def getIndexValuesList(df, indexName):
    result = df.index.get_level_values(indexName).unique()
    return list(result)

def getColValuesList(df, colName ) :
    result = df[colName].unique()
    return list(result)


def listIndexUniqueValues(df) :
    dictIndex = {}
    for indexName in df.index.names :
        listValues = getIndexValuesList(df, indexName)
        dictIndex[indexName] = listValues
    return dictIndex


In [36]:

########################################################
########################################################
########################################################
### --- DF_AV /// by : 
###           year - pesticides (levels rows)
###           departements (levels columns) 
########################################################
########################################################



In [37]:
years_list = [2007, 2008, 2009, 2010, 2011, 2012 ] 
print "-- len years_list", len(years_list)

departements_list = list(df_stations.index.levels[0])
print "-- len departements_list", len(departements_list)
#print departements_list

pesticides_list = list(df_pesticides.index.levels[1])
pesticides_list.append(all_pesticides_code)
print "-- len pesticides_list", len(pesticides_list)
print pesticides_list

-- len years_list 6
-- len departements_list 95
-- len pesticides_list 1044
[2, 1083, 1092, 1093, 1094, 1100, 1101, 1102, 1103, 1104, 1105, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1119, 1120, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1159, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1192, 1193, 1194, 1197, 1198, 1200, 1201, 1202, 1203, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1236, 1237, 1238, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1277, 1279, 1280, 1281, 1282, 1287, 1288, 1289, 1290, 1291, 1298, 1308, 1310, 1329, 1333, 1336, 1341, 1353, 13

In [38]:
### create df_AV dataframe dummy

tuples = list(itertools.product(years_list, pesticides_list))
len_rows = len(tuples)
list_ = [np.NaN]*len_rows
dict_ = {"test" : list_ }

index = pd.MultiIndex.from_tuples(tuples, names=['year', 'CD_PARAMETRE'])
df_AV = pd.DataFrame(np.asarray(list_), index=index)
#df_AV.head()

In [39]:
for dpt in departements_list :
    df_AV[str(dpt)] = np.NaN
df_AV["TOT_FRANCE"] = np.NaN

df_AV.drop(0, axis=1, inplace=True)


In [40]:
df_AV.tail()


Unnamed: 0_level_0,Unnamed: 1_level_0,01,02,03,04,05,06,07,08,09,10,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2012,99013,,,,,,,,,,,...,,,,,,,,,,
2012,99020,,,,,,,,,,,...,,,,,,,,,,
2012,99022,,,,,,,,,,,...,,,,,,,,,,
2012,99024,,,,,,,,,,,...,,,,,,,,,,
2012,XXXXXX,,,,,,,,,,,...,,,,,,,,,,


In [41]:
### fill df_AV & correspondinf MCT | MA


In [161]:
### ---> optimization A
### pre-store staions per dpt in dict
### iterate through departements

dic_dpt_stations = {}
dic_dpt_stations_count = {}

for dpt in departements_list : 

    # get list of CD_STATION within dpt 
    stations_list = df_stations.query("NUM_DEP == '%s' " %(dpt) )
    stations_list_ = list(stations_list["CD_STATION_"])
    
    dic_dpt_stations[dpt] = stations_list_
    dic_dpt_stations_count[dpt] = len(stations_list_)
    

#print dic_dpt_stations_count

test_1 = { k: dic_dpt_stations_count[k] for k in dic_dpt_stations_count.keys()[:1]}
print test_1

test_2 = { k: dic_dpt_stations[k] for k in dic_dpt_stations.keys()[:1]}
print test_2



{u'24': 128}
{u'24': [u'08085X0023/P', u'08076X0017/ERH', u'07596X0010/F', u'07842X0005/HY', u'07821X0001/SOURCE', u'08066X0047/F', u'08066X0019/F', u'08085X0040/HY', u'07595X0022/F', u'08326X0004/HY', u'08326X0006/F', u'07584X0007/F', u'07104X0501/HY', u'08088X0015/F', u'08087X0001/HY', u'08086X0031/S', u'08085X0032/HY', u'08322X0015/P', u'07346X0002/HY', u'07345X0018/F', u'07842X0007/F2', u'08311X0001/HY', u'07583X0003/HY', u'08067X0002/HY', u'07827X0007/SOURCE', u'08305X0002/F', u'08301X0002/F', u'08305X0030/F', u'08316X0016/HY', u'07582X0005/HY', u'07827X0017/HY', u'08087X0021/F', u'07826X0010/HY', u'07107X0031/F', u'08072X0010/HY', u'07361X0014/HY', u'07361X0002/HY', u'07361X0004/S', u'07841X0019/F', u'07847X0001/HY', u'07846X0012/HY', u'07846X0013/HY', u'07348X0010/HY', u'07811X0011/F', u'08075X0014/F', u'08075X0012/HY', u'08301X0015/P', u'08065X0025/F', u'08073X0017/HY', u'08077X0030/ERH', u'08077X0005/F', u'08077X0026/S1', u'07597X0007/A25', u'07346X0013/HY', u'08066X0005/F', u

In [219]:
############ TEST ####################
### test 1 on df_MCT

print "df_MCT.shape", df_MCT.shape
print 

_years_list   = [2007, 2008]
_cd_parametre = 1177 #all_pesticides_code
_column_name  = "MA_MOY" #"MOYPTOT"
_dpt_list     = ['24', '44']

for _year in _years_list :
               
    for _dpt in _dpt_list :

        print "_year %s, _dpt %s" %(_year, _dpt)
        print "++ df_MCT.shape               ", df_MA.shape

        _df_moy_tot_year = df_MA.loc[ idx[:, _year, _cd_parametre ] , [_column_name] ]
        print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape

        _stations_list_temp = dic_dpt_stations[_dpt]
        #print T_stations_list_temp
        print "-- len(_stations_list_temp)  ", len(_stations_list_temp)

        _df_moy_tot_year_dpt = _df_moy_tot_year.loc[ idx[ _stations_list_temp ,:, :] , :]
        print "-- _df_moy_tot_year_dpt.shape", _df_moy_tot_year_dpt.shape

        #print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape
        print


df_MCT.shape (11144, 8)

_year 2007, _dpt 24
++ df_MCT.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (7633, 1)
-- len(_stations_list_temp)   128
-- _df_moy_tot_year_dpt.shape (91, 1)

_year 2007, _dpt 44
++ df_MCT.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (7633, 1)
-- len(_stations_list_temp)   48
-- _df_moy_tot_year_dpt.shape (45, 1)

_year 2008, _dpt 24
++ df_MCT.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (4682, 1)
-- len(_stations_list_temp)   128
-- _df_moy_tot_year_dpt.shape (60, 1)

_year 2008, _dpt 44
++ df_MCT.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (4682, 1)
-- len(_stations_list_temp)   48
-- _df_moy_tot_year_dpt.shape (43, 1)



In [190]:
############ TEST ####################
### test 1 on df_MCT

_station = "00057X0248/F4"
_dpt = '24'
_year = 2007
_cd_parametre = all_pesticides_code

print "station : %s / dpt : %s / year : %s" %(_station, _dpt, _year)

# get list of CD_STATION within dpt 
_stations_list = df_stations.query("NUM_DEP == '%s'" %(_dpt) )
_stations_list_ = list(_stations_list["CD_STATION_"])
print "len(_stations_list_)", len(_stations_list_)

#df_moy_dpt  = df_MCT.query('CD_STATION=="%s" and ANNEE==%s ' %(station, year) )
_df_moy_dpt_ = df_MCT.query('%s in CD_STATION and ANNEE==%s and CD_PARAMETRE == "%s" ' %( _stations_list_, _year, _cd_parametre ))

print "_df_moy_dpt_.shape", _df_moy_dpt_.shape


station : 00057X0248/F4 / dpt : 24 / year : 2007
len(_stations_list_) 128
_df_moy_dpt_.shape (36, 8)


In [187]:
############ TEST ####################
### test 2 on df_MA

_dpt = '24'
_year = 2007
_cd_parametre = str(1177)
_query_cd_parametre = "CD_PARAMETRE==%s" %(_cd_parametre)

_df_moy_ = df_MA.query("ANNEE == %s and %s" %(_year, _query_cd_parametre) )
_mean_year = _df_moy_["MA_MOY"].mean()
print pd.isnull(mean_year), ":", mean_year

#df_moy_

False : 0.0192648886816


In [233]:

###################################################################
### add columns for averages and custom indicators
def MOYPTOT_YEAR(df, year, cd_parametre, start_time, debug=True ):
    
    # create slicers
    idx = pd.IndexSlice

    # variables : "MOYPTOT" on df_MCT / "MA_MOY" on df_MA
    
    if cd_parametre == all_pesticides_code :
        column_name = "MOYPTOT"
        column_mean = "MOYPTOT_YEAR"
        #query_cd_parametre = 'CD_PARAMETRE == "%s"' %(cd_parametre)
        #df_moy_dpt_ = df_MCT.query('%s in CD_STATION and ANNEE==%s and CD_PARAMETRE == "%s" ' %( stations_list_, year, cd_parametre ))

    else :
        column_name = "MA_MOY"
        column_mean = "MA_MOY_YEAR"
        #query_cd_parametre = 'CD_PARAMETRE == %s' %(cd_parametre)
        #df_moy_dpt_ = df_MA.query('%s in CD_STATION and ANNEE==%s and CD_PARAMETRE == %s ' %( stations_list_, year, cd_parametre ))
    
    try : 
        #df_moy_tot_year = df.query("ANNEE == %s and %s" %(year, query_cd_parametre) )
        df_moy_tot_year = df.loc[ idx[:,year, cd_parametre] , [column_name] ]

        #print "-- %s GLOBAL - debug / mean_year %s for %s / shape df_moy_tot_year = %s" %(column_mean, year , cd_parametre, df_moy_tot_year.shape ) 

        mean_year = df_moy_tot_year[column_name].mean()
    
    except : 
        # if no cd_parametre key for this year 
        mean_year = np.NaN
    
    if debug == True :
        delta_time = datetime.now() - start_time
        print "-- %s - mean_year %s for %s : %s (delta time : %s)" %(column_mean, year , cd_parametre, mean_year, delta_time) 
    
    
    ### escapes if mean_year == nan (leave df_Av NaN value)
    if pd.isnull(mean_year) == True :
        pass
    
    else :
        # cf : http://stackoverflow.com/questions/28002197/pandas-proper-way-to-set-values-based-on-condition-for-subset-of-multiindex-da
        # cf : http://pandas-docs.github.io/pandas-docs-travis/advanced.html#advanced-indexing-with-hierarchical-index
                
        # copy mean_year in corresponding dataframe (df)
        df.loc[ idx[:,year, cd_parametre] , [column_mean] ] = mean_year
        
        # copy mean_year in df_AV
        df_AV.loc[ idx[year, cd_parametre] , ['TOT_FRANCE'] ] = mean_year
        
        
        ### iterate through departements
        for dpt, stations_list in dic_dpt_stations.iteritems() : 
                        
            # compute mean for dpt
            #df_moy_dpt_year = df.query('%s in CD_STATION and ANNEE==%s and %s' %(stations_list_,year, query_cd_parametre ))
            # optimization B -->
            
            #df_moy_dpt_year = df_moy_tot_year.query('%s in CD_STATION' %(stations_list_ ))
            df_moy_tot_dpt_year = df_moy_tot_year.loc[ idx[stations_list,:, :], : ]
            
            #print "-- %s DPT - mean_year_dpt %s for %s - dpt %s (%s stations) / shape df_moy_dpt_year = %s" %(column_mean, year, cd_parametre, dpt, len(stations_list), df_moy_tot_dpt_year.shape ) 
            
            mean_year_dpt = df_moy_tot_dpt_year[column_name].mean()
            
            if debug == True :
                print "-- %s ----- mean_year_dpt %s for %s - dpt %s (%s stations) : %s" %(column_mean, year, cd_parametre, dpt, len(stations_list), mean_year_dpt) 
                
            # copy mean_year_dpt in df_AV
            df_AV.loc[ idx[year, cd_parametre] , [dpt] ] = mean_year_dpt
            
            

In [239]:
### compute for df_MCT - iterate through years and dpt 

### check time deltas for eficiency 
start_time = datetime.now()
print str(start_time)

debug_MOYPTOT_YEAR_MCT = False

for year in years_list :

    start_lap = datetime.now()
    
    print ">>>>>>>> MOYPTOT_YEAR for %s >>>>>>>>" %(year)
    MOYPTOT_YEAR(df_MCT, year, all_pesticides_code, start_time, debug=debug_MOYPTOT_YEAR_MCT)
    
    if debug_MOYPTOT_YEAR_MCT == True :
        delta_lap = datetime.now() - start_lap
        print ">>>>>>>> finished MOYPTOT_YEAR for %s --- delta_lap : %s >>>>>>>>" %(year, delta_lap)
        print

print 
delta_time = datetime.now() - start_time
print ">>>>>>>> AVERAGE MCT --- FINISHED --- delta_time : %s" %(delta_time) 
   

>>>>>>>> MOYPTOT_YEAR for 2007 >>>>>>>>
>>>>>>>> MOYPTOT_YEAR for 2008 >>>>>>>>
>>>>>>>> MOYPTOT_YEAR for 2009 >>>>>>>>
>>>>>>>> MOYPTOT_YEAR for 2010 >>>>>>>>
>>>>>>>> MOYPTOT_YEAR for 2011 >>>>>>>>
>>>>>>>> MOYPTOT_YEAR for 2012 >>>>>>>>

>>>>>>>> AVERAGE MCT --- FINISHED --- delta_time : 0:00:04.907407


In [240]:
### WARNING : TAKES ++ TIME TO PROCESS !!! aprox 40 min
### compute for df_MA - - iterate through years and dpt

### check time deltas for eficiency 
start_time = datetime.now()
print str(start_time)

debug_MOYPTOT_YEAR_MA = False

for year in years_list :

    print ">>>>>>>> MA_MOY_YEAR for year %s " %( year)

    for pesticide in pesticides_list[:-1] :
        
        if debug_MOYPTOT_YEAR_MA == True : 
            start_lap = datetime.now()
            print ">>>>>>>> MA_MOY_YEAR for year %s / pesticide %s " %( year, pesticide)
            
        MOYPTOT_YEAR(df_MA, year, pesticide, start_time, debug=debug_MOYPTOT_YEAR_MA )    
        
        if debug_MOYPTOT_YEAR_MA == True : 
            delta_lap = datetime.now() - start_lap
            print ">>>>>>>> finished MA_MOY_YEAR for %s --- delta_lap : %s >>>>>>>>" %(year, delta_lap)
            print
        
        if debug_MOYPTOT_YEAR_MA and year == 2007 :
            break

print
delta_time = datetime.now() - start_time
print ">>>>>>>> AVERAGE MA --- FINISHED --- delta_time : %s" %(delta_time) 


>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 2 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1083 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1092 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1093 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1094 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1100 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1101 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1102 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1103 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1104 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1105 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1107 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1108 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1109 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1110 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1111 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1112 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 1113 
>>>>>>>> MA_MOY_YEAR for year 2007 / pesticide 11

In [241]:
df_AV.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,01,02,03,04,05,06,07,08,09,10,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,2,,,,,,,,,,,...,,,,,,,,,,
2007,1083,0.023464,0.010379,0.019545,0.024118,0.024571,0.01,0.01,0.003418,,0.005305,...,0.009063,0.005,0.01,0.01,0.019323,0.01,0.01,0.01,0.022549,0.015277
2007,1092,0.024488,0.045,0.040705,0.024677,0.024857,0.02,0.02,0.04,,0.05,...,0.025857,0.01375,0.05,0.02,0.03023,0.0425,0.025,0.04375,0.0375,0.030659
2007,1093,,,,,,,,,,,...,,,,,,,,,,
2007,1094,0.023464,0.01,0.013269,0.024118,0.024571,0.01,0.01,0.008,,0.01,...,0.01,0.005,0.01,0.022273,0.005526,0.005,0.005,0.005,0.021732,0.013958


In [242]:
df_MCT.loc[ idx[:,2012,:,:], : ].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10982X0003/SEGRE,2012,XXXXXX,all_pesticides,4.0,0.0,0.0,409.0,409.0,0.0,0.0,0.189392
11013X0002/F,2012,XXXXXX,all_pesticides,5.0,0.0378,0.059,105.0,409.0,0.0,2.0,0.189392
11056X0123/FIGA,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11195X0147/FITTEL,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11221X0134/TRAVO,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392


In [243]:
df_MA.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,0.026375
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,0.021479
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,0.038045
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1,0.020551
00053X0002/SO1,2007,1136,Chlortoluron,1,0.01,0,0.1,0.018465


In [244]:

########################################################
########################################################
########################################################
### tests queries 
########################################################
########################################################
########################################################


In [245]:
'''main complete and clean DF :
    - df_pesticides
    - df_stations
    - df_MCT
    - df_MA
'''

df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )
df_sliced_02 = queryByIndexValue(df_stations, "NUM_DEP", ["44"] )
df_sliced_03 = queryByIndexValue(df_MCT, "ANNEE", [2009,2010] )
df_sliced_04 = queryByIndexValue(df_MA, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

df_sliced_05 = queryByColValue(df_MA, "MA_MOY", ">", "NORME_DCE")

print "-- listIndexUniqueValues : ", listIndexUniqueValues(df_sliced_02)
print
print "-- getIndexValuesList : ", getIndexValuesList(df_sliced_04, "CD_PARAMETRE") 
print
print "-- getColValuesList : ", getColValuesList(df_MA, "NORME_DCE") 
print 
#print "-- getColValuesList : ", getColValuesList(df_sliced_02, "NOM_COM")

-- listIndexUniqueValues :  {'CD_ME_niv1_surf': [u'GG117', u'GG114', u'GG038', u'GG118', u'GG026', u'GG115', u'GG022', u'GG027', u'GG139', u'GG119', u'GG140', u'GG015'], 'CD_STATION': [u'05073X0019/S9', u'04818X0181/F2', u'04818X0547/P44', u'04818X0574/F47', u'04503X0067/PZ1', u'04503X0068/PZ2', u'04503X0009/FS9', u'04503X0013/FS14', u'04503X0014/F', u'04503X0079/F9BIS', u'04507X0006/FS15', u'04507X0051/F15B', u'04502X0045/SGB2', u'04807X0018/P', u'04807X0020/F', u'04807X0048/N17', u'04503X0048/FS6', u'04502X0032/F1', u'04502X0033/F2', u'04816X0400/F', u'05353X0015/F', u'05078X0003/P3', u'05078X0033/P7', u'04193X0020/P1-1', u'04193X0025/PD2', u'04494X0014/P', u'05092X0025/PS16', u'04518X0037/NOR26', u'04518X0066/P', u'04518X0072/PB10BI', u'04514X0006/F1', u'04514X0007/F2', u'04514X0013/PB8', u'04514X0016/PB11', u'04503X0047/FS5', u'04522X0014/S', u'04513X0007/F1', u'04513X0029/F2', u'04502X0039/F', u'04498X0018/F', u'04518X0045/MSM1', u'04191X0010/P', u'05086X0028/SEL3', u'04217X0003/F

In [246]:
df_sliced_01 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
62,CLERQUES,AG001,AG001,00066X0042/SO,62228,AEAP,55.0,,62228_ _FRAG001,629257.6535,7077908.332,RCS,RCSseul,RCSseul,...,oui,oui,oui,,oui,oui,"[1.99856804486, 50.7936309591]",00066X0042/SO,1.998568,50.793631
62,WISSANT,AG001,,00053X0002/SO1,62899,AEAP,20.0,,62899_ _FRAG001,607696.7319,7090397.222,Hors RCS et RCO,,,...,,,,,,,"[1.69046823657, 50.9028310319]",00053X0002/SO1,1.690468,50.902831


In [247]:
df_sliced_02.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
44,ARTHON-EN-RETZ,GG117,GG117,05073X0019/S9,44005,AELB,9.0,13.4,44005_FRGG022_FRGG117,326378.1608,6680644.213,Hors RCS et RCO,,,...,,,,,,,"[-1.93119637304, 47.1210323175]",05073X0019/S9,-1.931196,47.121032
44,BASSE-GOULAINE,GG114,GG114,04818X0181/F2,44009,AELB,5.0,27.6,44009_FRGG022_FRGG114,361884.5177,6690106.297,RCS,RCSseul,RCSseul,...,oui,oui,oui,oui,oui,oui,"[-1.47068526406, 47.2250868006]",04818X0181/F2,-1.470685,47.225087
44,BASSE-GOULAINE,GG114,GG114,04818X0547/P44,44009,AELB,3.34,23.6,44009_FRGG022_FRGG114,362546.7259,6690852.717,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[-1.46250512544, 47.2321328624]",04818X0547/P44,-1.462505,47.232133
44,BASSE-GOULAINE,GG114,GG114,04818X0574/F47,44009,AELB,4.0,,44009_FRGG022_FRGG114,363236.6444,6690653.62,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[-1.45325262097, 47.2306937831]",04818X0574/F47,-1.453253,47.230694
44,CAMPBON,GG038,,04503X0067/PZ1,44025,AELB,5.0,38.0,44025_FRGG022_FRGG038,327462.0189,6715030.574,Hors RCS et RCO,,,...,,,,,,,"[-1.94531513692, 47.4305993659]",04503X0067/PZ1,-1.945315,47.430599
44,CAMPBON,GG038,,04503X0068/PZ2,44025,AELB,5.0,8.0,44025_FRGG022_FRGG038,324828.8841,6715532.797,Hors RCS et RCO,,,...,,,,,,,"[-1.98059675895, 47.4336225782]",04503X0068/PZ2,-1.980597,47.433623
44,CAMPBON,GG038,GG038,04503X0009/FS9,44025,AELB,5.31,65.7,44025_FRGG022_FRGG038,325500.243,6714776.129,RCS,RCSseul,RCSseul,...,oui,oui,oui,,,,"[-1.97107610789, 47.427205237]",04503X0009/FS9,-1.971076,47.427205


In [248]:
df_sliced_03.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00057X0245/F1,2010,XXXXXX,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0,0.185894
00057X0248/F4,2009,XXXXXX,all_pesticides,1.0,0.0,0.0,62.0,62.0,0.0,0.0,0.272842
00057X0248/F4,2010,XXXXXX,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0,0.185894
00061X0118/F8,2009,XXXXXX,all_pesticides,2.0,0.025,0.04,62.0,63.0,1.0,2.0,0.272842
00061X0118/F8,2010,XXXXXX,all_pesticides,2.0,0.045,0.06,63.0,63.0,2.0,2.0,0.185894


In [249]:
df_sliced_04

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.0500,0,0.1,0.026375
00053X0002/SO1,2007,1107,Atrazine,1,0.0100,0,0.1,0.021479
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.0050,0,0.1,0.038045
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.0050,0,0.1,0.020551
00053X0002/SO1,2007,1136,Chlortoluron,1,0.0100,0,0.1,0.018465
00053X0002/SO1,2007,1137,Cyanazine,1,0.0100,0,0.1,0.016201
00053X0002/SO1,2007,1177,Diuron,1,0.0100,0,0.1,0.019265
00053X0002/SO1,2007,1205,Ioxynil,1,0.0250,0,0.1,0.019398
00053X0002/SO1,2007,1208,Isoproturon,1,0.0100,0,0.1,0.018009
00053X0002/SO1,2007,1209,Linuron,1,0.0100,0,0.1,0.019124


In [250]:
df_sliced_05.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00061X0120/F10,2009,1113,Bentazone,1,0.22,1,0.1,0.015073
00066X0042/SO,2007,1108,Atrazine déséthyl,2,0.19,2,0.1,0.038045
00066X0042/SO,2008,1108,Atrazine déséthyl,2,0.2,2,0.1,
00066X0042/SO,2009,1108,Atrazine déséthyl,2,0.215,2,0.1,0.030689
00066X0042/SO,2010,1108,Atrazine déséthyl,2,0.22,2,0.1,0.034148


In [251]:

########################################################
########################################################
########################################################
### -- MERGE DATAS ??? -- 
########################################################
########################################################


In [252]:
#df_stations_MCT_MA = pd.concat( [df_stations_MCT, df_MA] )
#df_stations_MCT_MA.head()


#print df_stations_MA_MCT.columns


# pivot tables
#df_mct_2008.T

In [253]:

########################################################
########################################################
########################################################
### -- analysis --
########################################################
########################################################

## selections : http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label
### TO DO 




In [270]:

########################################################
########################################################
########################################################
### -- exports --
########################################################
########################################################


### export functions

test_df = df_stations.head()


In [272]:
test_record = df_stations.loc[["44"], : ]
test_record.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
44,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
44,,,CG004,01688X0034/AVAL,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
44,,,CG004,01688X0039/F1,,AERM,170.0,80.0,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
44,ARTHON-EN-RETZ,GG117,GG117,05073X0019/S9,44005.0,AELB,9.0,13.4,44005_FRGG022_FRGG117,326378.1608,6680644.213,Hors RCS et RCO,,,...,,,,,,,"[-1.93119637304, 47.1210323175]",05073X0019/S9,-1.931196,47.121032
44,BASSE-GOULAINE,GG114,GG114,04818X0181/F2,44009.0,AELB,5.0,27.6,44009_FRGG022_FRGG114,361884.5177,6690106.297,RCS,RCSseul,RCSseul,...,oui,oui,oui,oui,oui,oui,"[-1.47068526406, 47.2250868006]",04818X0181/F2,-1.470685,47.225087


In [258]:
df_stations.head(1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856


In [None]:
### return json 


In [273]:
test_record_reset = test_record.reset_index()
test_record_reset.set_index("CD_STATION", inplace=True)
test_record_reset.head()

Unnamed: 0_level_0,NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
CD_STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10261X0039/F3,44,,,,,AEAG,10.0,33.0,,0.0,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
01688X0034/AVAL,44,,,CG004,,AERM,235.0,,,0.0,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
01688X0039/F1,44,,,CG004,,AERM,170.0,80.0,,0.0,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
05073X0019/S9,44,ARTHON-EN-RETZ,GG117,GG117,44005.0,AELB,9.0,13.4,44005_FRGG022_FRGG117,326378.1608,...,,,,,,,"[-1.93119637304, 47.1210323175]",05073X0019/S9,-1.931196,47.121032
04818X0181/F2,44,BASSE-GOULAINE,GG114,GG114,44009.0,AELB,5.0,27.6,44009_FRGG022_FRGG114,361884.5177,...,oui,oui,oui,oui,oui,oui,"[-1.47068526406, 47.2250868006]",04818X0181/F2,-1.470685,47.225087


In [274]:
#json_stations = df_stations.head(2).to_json(orient="split")
json_stations = test_record_reset.to_json(orient="index") ### set unique index as first json key
#print json_stations

### pretty prints
parsed = json.loads(json_stations)
print json.dumps(parsed, indent=2, sort_keys=True)

{
  "01688X0034/AVAL": {
    "ALTITUDE": 235.0, 
    "CD_ME_niv1_surf": null, 
    "CD_ME_v2": "CG004", 
    "CD_STATION_": "01688X0034/AVAL", 
    "COORD_WSG84": [
      -1.3630812101, 
      -5.9838563092
    ], 
    "LAT_WSG84": -1.3630812101, 
    "LONG_WSG84": -5.9838563092, 
    "NOM_COM": null, 
    "NUM_COM": null, 
    "NUM_DEP": "44", 
    "PROFONDEUR_MAXI_POINT": null, 
    "Unit\u00e9_coord_fictifs": null, 
    "X_FICT_L93": 0.0, 
    "Y_FICT_L93": 0.0, 
    "codagence": "AERM", 
    "fi_ma_2007": null, 
    "fi_ma_2008": null, 
    "fi_ma_2009": "oui", 
    "fi_ma_2010": null, 
    "fi_ma_2011": null, 
    "fi_ma_2012": null, 
    "fi_ma_2013": null, 
    "fi_ma_2014": null, 
    "reseau2009": "Hors RCS et RCO", 
    "reseau2010": null, 
    "reseau2011": null, 
    "reseau2012": null, 
    "reseau2013": null, 
    "reseau2014": null
  }, 
  "01688X0039/F1": {
    "ALTITUDE": 170.0, 
    "CD_ME_niv1_surf": null, 
    "CD_ME_v2": "CG004", 
    "CD_STATION_": "01688X0039/F1"