In [34]:
''' 
STATIONS - PESTICIDES - STATS
-----------------------------

GOAL : notebook python functions to add at root (app initialization on run.py) 
create panda objects / implement query functions / export to JSON 
for data analysis and visualization

- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES
- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)
- CLEAN AND MERGE DATA
- QUERY FUNCTIONS
- EXPORT FUNCTIONS (JSON)

AUTHOR : Julien Paris
DATE   : 24/12/2016

TO DO : 
- 
'''

' \nSTATIONS - PESTICIDES - STATS\n-----------------------------\n\nGOAL : notebook python functions to add at root (app initialization on run.py) \ncreate panda objects / implement query functions / export to JSON \nfor data analysis and visualization\n\n- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES\n- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)\n- CLEAN AND MERGE DATA\n- QUERY FUNCTIONS\n- EXPORT FUNCTIONS (JSON)\n\nAUTHOR : Julien Paris\nDATE   : 24/12/2016\n\nTO DO : \n- \n'

In [35]:
### import standard libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# pyproj settings to convert coordinates
from pyproj import Proj, transform
inProj  = Proj(init='epsg:2154') # proj in  : Lambert 93
outProj = Proj(init='epsg:4326') # proj out : WSG 84

In [3]:
### basic folders addresses and names
cwd = os.getcwd()
data_folder = "/data"
data_folder_ = "data/"


print "-- cwd :", cwd
for file in os.listdir(cwd + data_folder):
    if file.endswith(".csv") or file.endswith(".xlsx"):
        print "--- dataset in '/data' : ", file
        #print cwd+datas_folder+"/"+file

-- cwd : /Users/jpy/Dropbox/_FLASK/concours_pesticides/_jupyter_notebook
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2007.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2008.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2009.csv
--- dataset in '/data' :  moy_tot_quantif_2007.csv
--- dataset in '/data' :  moy_tot_quantif_2008.csv
--- dataset in '/data' :  moy_tot_quantif_2009.xlsx
--- dataset in '/data' :  moy_tot_quantif_2010.csv
--- dataset in '/data' :  moy_tot_quantif_2011.csv
--- dataset in '/data' :  moy_tot_quantif_2012.csv
--- dataset in '/data' :  pesticides.csv
--- dataset in '/data' :  stations.csv


In [4]:
### panda dataframes for every db + settings

# set encoding of .csv (keep accents)
csv_encoding = "latin-1"

# set time frame
time_frame   = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 ]
#np.array = time_frame

# root strings for datas names
root_mct = "df_mct_"
root_ma  = "df_ma_"

# list of datas filenames
datas_stations   = "stations.csv"
datas_pesticides = "pesticides.csv"

datas_MCT = [
    "moy_tot_quantif_2007.csv",
    "moy_tot_quantif_2008.csv",
    "moy_tot_quantif_2009.xlsx", #### 
    "moy_tot_quantif_2010.csv",
    "moy_tot_quantif_2011.csv",
    "moy_tot_quantif_2012.csv",
]

datas_MA = [
    "ma_qp_fm_ttres_pesteso_2007.csv",
    "ma_qp_fm_ttres_pesteso_2008.csv",
    "ma_qp_fm_ttres_pesteso_2009.csv",
    "ma_qp_fm_rcsrco_pesteso_2010.csv",
    "ma_qp_fm_rcsrco_pesteso_2011.csv",
    "ma_qp_fm_rcsrco_pesteso_2012.csv",
]

### tests on few sets ...

In [5]:
### functions : cleaning operations on dataframes

def checkDTypes (df) :
    # check data type
    
    #for index in df.indices :
    #    print index

    for col in df.columns :
        #label = col.values
        dtype = df[col].dtype
        
        print "---- dtypes : ", col, "/", dtype
        

In [6]:
def comas2points(df, list_col_names="all_col"): 
    # convert all weird "," to "." and then to float values
    
    if list_col_names == "all_col" : 
        df.loc[:, :] = df.replace(to_replace=',', value='.', regex=True)
    else : 
        df.loc[:, list_col_names ] = df.loc[:,list_col_names].replace(to_replace=',', value='.', regex=True)
    return df


def ints2floats(df, list_col_names, to="float") :
    
    if to == "float":
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(float)
    elif to == "int" :
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(int)        
    return df


In [7]:
def dfCleanNa(df_list): 
    # clean from NaN values if entire row is NaN
    
    df_list_clean = []
    for df in df_list :
        df_cleaned_01 = df.dropna(how="all") # on empty rows
        df_cleaned_02 = df_cleaned_01.dropna( axis=1, how="all") # on empty columns
        df_list_clean.append(df_cleaned_02)
    
    return df_list_clean


In [8]:
### -- DATAS TO DATA FRAMES -- ####################################

In [9]:
### -- pesticides --

df_pesticides = pd.read_csv("data/pesticides.csv", sep=";", encoding=csv_encoding)
df_pesticides = comas2points(df_pesticides, ["NORME_DCE"])
df_pesticides = ints2floats (df_pesticides, ["NORME_DCE"])

# set index : CODE_PESTICIDE 
df_pesticides.set_index(["CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

checkDTypes(df_pesticides)

#print df_pesticides.loc[:,["CODE_CAS"]]

df_pesticides.head()


---- dtypes :  NOM_PARAM2 / object
---- dtypes :  CODE_FAMILLE / object
---- dtypes :  CODE_FONCTION / object
---- dtypes :  STATUT / object
---- dtypes :  METABOLITE / object
---- dtypes :  PARENT / object
---- dtypes :  NOM_PARENT / object
---- dtypes :  CODE_CAS / object
---- dtypes :  DATE_NA_USAGE / object
---- dtypes :  FORMULEB / object
---- dtypes :  NORME_DCE / float64


Unnamed: 0_level_0,Unnamed: 1_level_0,NOM_PARAM2,CODE_FAMILLE,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE
CD_PARAMETRE,LB_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1197,Heptachlore,Heptachlore,Organochlorés,I,PNA,,,,76-44-8,,C10H5Cl7,0.03
1748,Heptachlore époxyde cis,Heptachlore époxyde cis,Organochlorés,I,PNA,,,,1024-57-3,,,0.03
1749,Heptachlore époxyde trans,Heptachlore époxyde trans,Organochlorés,I,PNA,,,,28044-83-9,,,0.03
1103,Aldrine,Aldrine,Organochlorés,I,PNA,,,,309-00-2,10/04/1994,C12H8Cl6,0.03
1173,Dieldrine,Dieldrine,Organochlorés,I,PNA,,,,60-57-1,10/04/1994,C12H8Cl6O,0.03


In [10]:
### -- stations --
lab_stations = "INFOS"
df_stations  = pd.read_csv( "data/"+datas_stations, sep=";", encoding=csv_encoding , na_values=[""] )

# add columns CD_PARAMETRE, LB_PARAMETRE
#df_stations["CD_PARAMETRE"] = 99999
#df_stations["LB_PARAMETRE"] = "all pesticides"

# set indexes
df_stations.set_index( ["CD_STATION"], inplace=True) 
#df_stations.set_index(["CD_STATION", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True) 


# get columns labels
#col_labels_stations = list(df_stations.columns.values)
#print " -- col_labels :", df_stations[0:5]

# add multilevel hierarchy on columns
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations, "NO_DATE"])
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations])

to_float = ["ALTITUDE", "PROFONDEUR_MAXI_POINT", "X_FICT_L93", "Y_FICT_L93"]

df_stations = comas2points(df_stations, to_float)
df_stations = ints2floats (df_stations, to_float)

print "-- indices names :", df_stations.index.name


### add column for long lat in WSG84

def convertCoordinates(row):
    x1,y1 = row["X_FICT_L93"], row["Y_FICT_L93"]
    convertedCoord = transform(inProj,outProj, x1, y1)
    return list(convertedCoord)

def extractFromList(index):
    value = row[colName][index]
    print value
    return value

df_stations["COORD_WSG84"] = df_stations.apply(convertCoordinates,axis=1)
#df_stations["LAT_WSG84"]   = df_stations.apply(lambda row: extractFromList(row['COORD_WSG84'], 0), axis=1)
#df_stations["LONG_WSG84"]  = df_stations.apply(extractFromList(index=1),axis=1)

## cf : http://chrisalbon.com/python/pandas_expand_cells_containing_lists.html
# expand df.tags into its own dataframe
coord = df_stations['COORD_WSG84'].apply(pd.Series)
# rename each variable is tags
#coord = coord.rename(columns = lambda x : 'COORD_' + str(x))
coord.columns = ["LAT_WSG84","LONG_WSG84"] 
#print coord.head()
# join the tags dataframe back to the original dataframe
df_stations = pd.concat( [df_stations, coord], axis=1, join="outer" )

print coord.head()
print 

print df_stations["Unnamed: 26"].unique()
df_stations.drop('Unnamed: 26', axis=1, inplace=True)



checkDTypes(df_stations)
print "-- df_stations.shape : ", df_stations.shape



df_stations.head()


-- indices names : CD_STATION
                LAT_WSG84  LONG_WSG84
CD_STATION                           
06521X0019/SCE   5.452862   46.270740
07015X0009/F     5.781881   45.793046
07015X0010/P     5.772809   45.785001
07011X0009/F     5.788505   45.844201
06991X0001/S     5.074473   45.836095

[ nan]
---- dtypes :  NUM_COM / object
---- dtypes :  NOM_COM / object
---- dtypes :  NUM_DEP / object
---- dtypes :  codagence / object
---- dtypes :  ALTITUDE / float64
---- dtypes :  PROFONDEUR_MAXI_POINT / float64
---- dtypes :  Unité_coord_fictifs / object
---- dtypes :  X_FICT_L93 / float64
---- dtypes :  Y_FICT_L93 / float64
---- dtypes :  CD_ME_v2 / object
---- dtypes :  CD_ME_niv1_surf / object
---- dtypes :  reseau2009 / object
---- dtypes :  reseau2010 / object
---- dtypes :  reseau2011 / object
---- dtypes :  reseau2012 / object
---- dtypes :  reseau2013 / object
---- dtypes :  reseau2014 / object
---- dtypes :  fi_ma_2007 / object
---- dtypes :  fi_ma_2008 / object
---- dtypes :  f

Unnamed: 0_level_0,NUM_COM,NOM_COM,NUM_DEP,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,CD_ME_v2,...,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,LAT_WSG84,LONG_WSG84
CD_STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06521X0019/SCE,1125,CORVEISSIAT,1,AERM&C,459.0,,01125_ _FRDG140,888869.8607,6577473.549,,...,oui,oui,,,,,,"[5.4528616689, 46.2707399979]",5.452862,46.27074
07015X0009/F,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.0,15.2,01133_FRDG511_FRDG330,916062.9395,6525297.883,DG330,...,,,,,,,,"[5.78188086715, 45.7930456698]",5.781881,45.793046
07015X0010/P,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.8,16.0,01133_FRDG511_FRDG330,915390.0333,6524380.237,DG330,...,oui,,,,,,,"[5.77280939367, 45.7850005571]",5.772809,45.785001
07011X0009/F,1138,CULOZ,1,AERM&C,236.0,17.0,01138_FRDG511_FRDG330,916376.604,6530993.354,DG330,...,oui,oui,,,,,,"[5.78850460832, 45.8442013565]",5.788505,45.844201
06991X0001/S,1142,DAGNEUX,1,AERM&C,196.0,22.0,01142_FRDG240_FRDG390,861009.2195,6528387.253,DG390,...,,,oui,oui,oui,oui,oui,"[5.07447262804, 45.8360949687]",5.074473,45.836095


In [11]:
### -- pesticides / MCT (moy concentrations totales)  --
lab_MCT = "MCT"

## read datas MCT

df_mct_2007 = pd.read_csv("data/"+datas_MCT[0], sep=";") #,  index_col=[1,0])
df_mct_2008 = pd.read_csv("data/"+datas_MCT[1], sep=";") #,  index_col=[1,0])
df_mct_2009 = pd.read_excel("data/"+datas_MCT[2], sep=";") #,index_col=[1,0])
df_mct_2010 = pd.read_csv("data/"+datas_MCT[3], sep=";") #,  index_col=[1,0])
df_mct_2011 = pd.read_csv("data/"+datas_MCT[4], sep=";") #,  index_col=[1,0])
df_mct_2012 = pd.read_csv("data/"+datas_MCT[5], sep=";") #,  index_col=[1,0])


In [12]:
#df_mct_2007.shape
print " -- df_mct_2007.index.names : ", df_mct_2007.index.names
print " -- df_mct_2007.columns     : ", df_mct_2007.columns

df_mct_2007.head()

 -- df_mct_2007.index.names :  [None]
 -- df_mct_2007.columns     :  Index([u'ANNEE', u'CD_STATION', u'NBPREL', u'MOYPTOT', u'MAXPTOT',
       u'MINMOLRECH', u'MAXMOLRECH', u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')


Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ
0,2007,00054X0169/F1,4,0,0,18,96,0,0
1,2007,00057X0245/F1,2,0,0,18,96,0,0
2,2007,00057X0248/F4,2,2,4,61,96,0,1
3,2007,00061X0118/F8,4,125,2,18,96,0,1
4,2007,00066X0042/SO,2,28,35,19,19,2,2


In [13]:
#df_mct_2008.head() 

In [14]:
df_ = df_mct_2010.dropna(how="all")
df_.loc[:, ("ANNEE")] = df_.loc[:, ("ANNEE")].astype(int)
#df_.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
### merge all MCT datas with multiIndex
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-multiple-dataframe-or-panel-objects
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-with-two-multi-indexes
# options/alternatives : .merge .join .concat .append

frames_mct = [df_mct_2007,df_mct_2008, df_mct_2009, df_mct_2010, df_mct_2011, df_mct_2012]

# clean from NaN values if entire row is NaN
frames_mct_cleaned = dfCleanNa(frames_mct)
    
df_MCT = pd.concat(frames_mct_cleaned)

# convert all year column data to integers
df_MCT = ints2floats(df_MCT, ["ANNEE"], to="int")

# convert all weird "," to "." and then to float values
df_MCT   = comas2points(df_MCT)
to_float = ['NBPREL', 'MOYPTOT', 'MAXPTOT', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ']
df_MCT   = ints2floats(df_MCT, to_float)
#df_MCT.loc[:, ("NBPREL"):("MAQMOLQ")] = df_MCT.loc[:, ("NBPREL"):("MAQMOLQ")].astype(float)

# add column CD_PARAMETRE, LB_PARAMETRE
df_MCT["CD_PARAMETRE"] = 9999
df_MCT["LB_PARAMETRE"] = "all_pesticides"

# set index hierarchy
#df_MCT.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MCT.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

print " -- df_MCT.index.names    : ", df_MCT.index.names
print " -- df_MCT.index.values   : ", df_MCT.index.values
print " -- df_MCT.columns.values : ", df_MCT.columns.values
print " -- df_MCT.columns        : ", df_MCT.columns

df_MCT.sort_index(inplace=True) 

checkDTypes(df_MCT)

df_MCT.head(20)

 -- df_MCT.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MCT.index.values   :  [('00054X0169/F1', 2007, 9999, 'all_pesticides')
 ('00057X0245/F1', 2007, 9999, 'all_pesticides')
 ('00057X0248/F4', 2007, 9999, 'all_pesticides') ...,
 ('11056X0123/FIGA', 2012, 9999, 'all_pesticides')
 ('11195X0147/FITTEL', 2012, 9999, 'all_pesticides')
 ('11221X0134/TRAVO', 2012, 9999, 'all_pesticides')]
 -- df_MCT.columns.values :  ['NBPREL' 'MOYPTOT' 'MAXPTOT' 'MINMOLRECH' 'MAXMOLRECH' 'MINMOLQ' 'MAQMOLQ']
 -- df_MCT.columns        :  Index([u'NBPREL', u'MOYPTOT', u'MAXPTOT', u'MINMOLRECH', u'MAXMOLRECH',
       u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')
---- dtypes :  NBPREL / float64
---- dtypes :  MOYPTOT / float64
---- dtypes :  MAXPTOT / float64
---- dtypes :  MINMOLRECH / float64
---- dtypes :  MAXMOLRECH / float64
---- dtypes :  MINMOLQ / float64
---- dtypes :  MAQMOLQ / float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00054X0169/F1,2007,9999,all_pesticides,4.0,0.0,0.0,18.0,96.0,0.0,0.0
00054X0169/F1,2012,9999,all_pesticides,1.0,0.0,0.0,64.0,64.0,0.0,0.0
00057X0245/F1,2007,9999,all_pesticides,2.0,0.0,0.0,18.0,96.0,0.0,0.0
00057X0245/F1,2008,9999,all_pesticides,1.0,0.02,0.02,60.0,60.0,1.0,1.0
00057X0245/F1,2010,9999,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0
00057X0245/F1,2012,9999,all_pesticides,1.0,0.0,0.0,64.0,64.0,0.0,0.0
00057X0248/F4,2007,9999,all_pesticides,2.0,0.02,0.04,61.0,96.0,0.0,1.0
00057X0248/F4,2008,9999,all_pesticides,1.0,0.0,0.0,60.0,60.0,0.0,0.0
00057X0248/F4,2009,9999,all_pesticides,1.0,0.0,0.0,62.0,62.0,0.0,0.0
00057X0248/F4,2010,9999,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0


In [16]:
### -- pesticides / MA (moy analyses)  --
lab_MA = "MA"

def multilevel_MA (df, year):

    # set indexes : STATION and CODE_PESTICIDE
    #df.set_index(["CD_STATION", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
    #df.set_index(["CD_STATION"], inplace=True)

    # get columns labels for df_ma
    #col_labels_df_ma = list(df.columns.values)
    #print " -- col_labels :", col_labels_df_ma

    # add multilevel hierarchy on columns
    #df.columns = pd.MultiIndex.from_product([lab_MA, col_labels_df_ma, year])
    
    # convert all weird "," to "." and then to float values
    to_float = ["MA_MOY", "NORME_DCE"]
    df       = comas2points(df, to_float)
    df       = ints2floats (df, to_float)

    df["ANNEE"] = year
    
    return df


In [17]:
df_ma_2007 = pd.read_csv("data/"+datas_MA[0], sep=";", encoding = csv_encoding)
df_ma_2007 = multilevel_MA(df_ma_2007, 2007)
    
df_ma_2008 = pd.read_csv("data/"+datas_MA[1], sep=";", encoding = csv_encoding)
df_ma_2008 = multilevel_MA(df_ma_2008, 2008)

df_ma_2009 = pd.read_csv("data/"+datas_MA[2], sep=";", encoding = csv_encoding)
df_ma_2009 = multilevel_MA(df_ma_2009, 2009)

df_ma_2010 = pd.read_csv("data/"+datas_MA[3], sep=";", encoding = csv_encoding)
df_ma_2010 = multilevel_MA(df_ma_2010, 2010)

df_ma_2011 = pd.read_csv("data/"+datas_MA[4], sep=";", encoding = csv_encoding)
df_ma_2011 = multilevel_MA(df_ma_2011, 2011)

df_ma_2012 = pd.read_csv("data/"+datas_MA[5], sep=";", encoding = csv_encoding)
df_ma_2012 = multilevel_MA(df_ma_2012, 2012)


In [18]:
df_ma_2010.head() 

Unnamed: 0,CD_STATION,CD_PARAMETRE,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,LB_PARAMETRE,ANNEE
0,00057X0245/F1,1101,1,0.01,0,0.1,Alachlore,2010
1,00057X0245/F1,1103,1,0.0025,0,0.03,Aldrine,2010
2,00057X0245/F1,1104,1,0.01,0,0.1,Amétryne,2010
3,00057X0245/F1,1105,1,0.05,0,0.1,Aminotriazole,2010
4,00057X0245/F1,1107,1,0.01,0,0.1,Atrazine,2010


In [19]:
#df_ma_2011.head() 

In [20]:
#df_ma_2012.head() 

In [21]:
### merge all MA datas 

frames_MA = [df_ma_2007, df_ma_2008, df_ma_2009, df_ma_2010, df_ma_2011, df_ma_2012]

# clean from NaN values if entire row is NaN
frames_MA_cleaned = dfCleanNa(frames_MA)

# concatenate datas MA
df_MA = pd.concat(frames_MA_cleaned)

# set index hierarchy
#df_MA.set_index(["CD_STATION"], inplace=True)
#df_MA.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MA.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

#df_MA.sort_index(inplace=True) 
df_MA.sortlevel(inplace=True) 

print " -- df_MA.index.names    : ", df_MA.index.names
print " -- df_MA.index.values   : ", df_MA.index.values
print " -- df_MA.columns.values : ", df_MA.columns.values
print " -- df_MA.columns        : ", df_MA.columns

checkDTypes(df_MA)

df_MA #.head()


 -- df_MA.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MA.index.values   :  [(u'00053X0002/SO1', 2007, 1102, u'Aldicarbe')
 (u'00053X0002/SO1', 2007, 1107, u'Atrazine')
 (u'00053X0002/SO1', 2007, 1108, u'Atrazine d\xe9s\xe9thyl') ...,
 (u'11282X0005/ARAGUI', 2007, 2924, u'Benfuracarbe')
 (u'11282X0005/ARAGUI', 2007, 2951, u'Iprovalicarb')
 (u'11282X0005/ARAGUI', 2007, 5475, u'Thiofanox sulfoxyde')]
 -- df_MA.columns.values :  [u'NBANASPERTS1' u'MA_MOY' u'NBQUANTIF' u'NORME_DCE']
 -- df_MA.columns        :  Index([u'NBANASPERTS1', u'MA_MOY', u'NBQUANTIF', u'NORME_DCE'], dtype='object')
---- dtypes :  NBANASPERTS1 / int64
---- dtypes :  MA_MOY / float64
---- dtypes :  NBQUANTIF / int64
---- dtypes :  NORME_DCE / float64


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.050,0,0.1
00053X0002/SO1,2007,1107,Atrazine,1,0.010,0,0.1
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1
00053X0002/SO1,2007,1136,Chlortoluron,1,0.010,0,0.1
00053X0002/SO1,2007,1137,Cyanazine,1,0.010,0,0.1
00053X0002/SO1,2007,1177,Diuron,1,0.010,0,0.1
00053X0002/SO1,2007,1205,Ioxynil,1,0.025,0,0.1
00053X0002/SO1,2007,1208,Isoproturon,1,0.010,0,0.1
00053X0002/SO1,2007,1209,Linuron,1,0.010,0,0.1


In [22]:
### --- tests .loc / queries

## cf : http://pandas.pydata.org/pandas-docs/stable/indexing.html#the-query-method-experimental

def queryByIndexValue (df, indexName, indexLabelList):
    queryString = '%s in %s' %( indexLabelList, indexName)
    #print queryString
    result = df.query(queryString)
    return result

def queryByColValue (df, colName, comparator, colValue ):
    queryString = '(%s %s %s)' %( colName, comparator, colValue)
    #print queryString
    result = df.query(queryString)
    return result


def getIndexValuesList(df, indexName):
    result = df.index.get_level_values(indexName).unique()
    return list(result)

def getColValuesList(df, colName ) :
    result = df[colName].unique()
    return list(result)


def listIndexUniqueValues(df) :
    dictIndex = {}
    for indexName in df.index.names :
        listValues = getIndexValuesList(df, indexName)
        dictIndex[indexName] = listValues
    return dictIndex


In [23]:
### tests queries 

df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )
df_sliced_02 = queryByIndexValue(df_stations, "NUM_DEP", ["44"] )
df_sliced_03 = queryByIndexValue(df_MCT, "ANNEE", [2009,2010] )
df_sliced_04 = queryByIndexValue(df_MA, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

df_sliced_05 = queryByColValue(df_MA, "MA_MOY", ">", "NORME_DCE")

print "-- listIndexUniqueValues : ", listIndexUniqueValues(df_sliced_02)
print
print "-- getIndexValuesList : ", getIndexValuesList(df_sliced_04, "CD_PARAMETRE") 
print
print "-- getColValuesList : ", getColValuesList(df_MA, "NORME_DCE") 
print 
print "-- getColValuesList : ", getColValuesList(df_sliced_02, "NOM_COM")

-- listIndexUniqueValues :  {'CD_STATION': [u'05073X0019/S9', u'04818X0547/P44', u'04818X0574/F47', u'04818X0181/F2', u'04503X0014/F', u'04503X0009/FS9', u'04503X0013/FS14', u'04507X0006/FS15', u'04507X0051/F15B', u'04503X0079/F9BIS', u'04503X0067/PZ1', u'04503X0068/PZ2', u'04502X0045/SGB2', u'04807X0020/F', u'04807X0048/N17', u'04807X0018/P', u'04503X0048/FS6', u'04502X0032/F1', u'04502X0033/F2', u'04816X0400/F', u'05353X0015/F', u'05078X0003/P3', u'05078X0033/P7', u'04193X0025/PD2', u'04193X0020/P1-1', u'04494X0014/P', u'05092X0025/PS16', u'04514X0016/PB11', u'04514X0013/PB8', u'04514X0007/F2', u'04518X0066/P', u'04518X0037/NOR26', u'04518X0072/PB10BI', u'04514X0006/F1', u'04503X0047/FS5', u'04522X0014/S', u'04513X0007/F1', u'04513X0029/F2', u'04502X0039/F', u'04498X0018/F', u'04518X0045/MSM1', u'04191X0010/P', u'05086X0028/SEL3', u'04217X0003/F', u'04507X0043/SOURCE', u'05095X0042/P', u'04225X0014/F', u'04225X0050/P2']}

-- getIndexValuesList :  [1102, 1107, 1108, 1109, 1136, 1137, 

In [24]:
df_sliced_01 

Unnamed: 0_level_0,NUM_COM,NOM_COM,NUM_DEP,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,CD_ME_v2,...,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,LAT_WSG84,LONG_WSG84
CD_STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00066X0042/SO,62228,CLERQUES,62,AEAP,55.0,,62228_ _FRAG001,629257.6535,7077908.332,AG001,...,oui,oui,oui,oui,,oui,oui,"[1.99856804486, 50.7936309591]",1.998568,50.793631
00053X0002/SO1,62899,WISSANT,62,AEAP,20.0,,62899_ _FRAG001,607696.7319,7090397.222,,...,,,,,,,,"[1.69046823657, 50.9028310319]",1.690468,50.902831


In [25]:
df_sliced_02.head()

Unnamed: 0_level_0,NUM_COM,NOM_COM,NUM_DEP,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,CD_ME_v2,...,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,LAT_WSG84,LONG_WSG84
CD_STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
05073X0019/S9,44005,ARTHON-EN-RETZ,44,AELB,9.0,13.4,44005_FRGG022_FRGG117,326378.1608,6680644.213,GG117,...,oui,,,,,,,"[-1.93119637304, 47.1210323175]",-1.931196,47.121032
04818X0547/P44,44009,BASSE-GOULAINE,44,AELB,3.34,23.6,44009_FRGG022_FRGG114,362546.7259,6690852.717,GG114,...,oui,,,,,,,"[-1.46250512544, 47.2321328624]",-1.462505,47.232133
04818X0574/F47,44009,BASSE-GOULAINE,44,AELB,4.0,,44009_FRGG022_FRGG114,363236.6444,6690653.62,GG114,...,oui,,,,,,,"[-1.45325262097, 47.2306937831]",-1.453253,47.230694
04818X0181/F2,44009,BASSE-GOULAINE,44,AELB,5.0,27.6,44009_FRGG022_FRGG114,361884.5177,6690106.297,GG114,...,oui,oui,oui,oui,oui,oui,oui,"[-1.47068526406, 47.2250868006]",-1.470685,47.225087
04503X0014/F,44025,CAMPBON,44,AELB,6.0,43.5,44025_FRGG022_FRGG038,325786.0052,6716500.751,GG038,...,oui,oui,,,,,,"[-1.96873154678, 47.4428611184]",-1.968732,47.442861


In [26]:
df_sliced_03.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00057X0245/F1,2010,9999,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0
00057X0248/F4,2009,9999,all_pesticides,1.0,0.0,0.0,62.0,62.0,0.0,0.0
00057X0248/F4,2010,9999,all_pesticides,1.0,0.0,0.0,63.0,63.0,0.0,0.0
00061X0118/F8,2009,9999,all_pesticides,2.0,0.025,0.04,62.0,63.0,1.0,2.0
00061X0118/F8,2010,9999,all_pesticides,2.0,0.045,0.06,63.0,63.0,2.0,2.0


In [27]:
df_sliced_04

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.0500,0,0.1
00053X0002/SO1,2007,1107,Atrazine,1,0.0100,0,0.1
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.0050,0,0.1
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.0050,0,0.1
00053X0002/SO1,2007,1136,Chlortoluron,1,0.0100,0,0.1
00053X0002/SO1,2007,1137,Cyanazine,1,0.0100,0,0.1
00053X0002/SO1,2007,1177,Diuron,1,0.0100,0,0.1
00053X0002/SO1,2007,1205,Ioxynil,1,0.0250,0,0.1
00053X0002/SO1,2007,1208,Isoproturon,1,0.0100,0,0.1
00053X0002/SO1,2007,1209,Linuron,1,0.0100,0,0.1


In [28]:
df_sliced_05

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00061X0120/F10,2009,1113,Bentazone,1,0.220000,1,0.1
00066X0042/SO,2007,1108,Atrazine déséthyl,2,0.190000,2,0.1
00066X0042/SO,2008,1108,Atrazine déséthyl,2,0.200000,2,0.1
00066X0042/SO,2009,1108,Atrazine déséthyl,2,0.215000,2,0.1
00066X0042/SO,2010,1108,Atrazine déséthyl,2,0.220000,2,0.1
00066X0042/SO,2011,1108,Atrazine déséthyl,2,0.200000,2,0.1
00075X0158/F15,2008,1506,Glyphosate,2,0.175000,1,0.1
00075X0158/F15,2009,1506,Glyphosate,2,0.125000,1,0.1
00108X0004/GC1,2007,1108,Atrazine déséthyl,1,0.140000,1,0.1
00108X0004/GC1,2008,1108,Atrazine déséthyl,1,0.180000,1,0.1


In [29]:
### -- MERGE DATAS -- ####################################

In [30]:
#df_stations_MCT_MA = pd.concat( [df_stations_MCT, df_MA] )
#df_stations_MCT_MA.head()


#print df_stations_MA_MCT.columns


# pivot tables
#df_mct_2008.T

In [31]:
### -- analysis --
## selections : http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label
### TO DO 




In [32]:
### -- exports --

### export functions



In [33]:

### return json
json_stations = df_stations.head(2).to_json(orient="index")
### pretty prints
parsed = json.loads(json_stations)
print json.dumps(parsed, indent=2, sort_keys=True)

{
  "06521X0019/SCE": {
    "ALTITUDE": 459.0, 
    "CD_ME_niv1_surf": "DG140", 
    "CD_ME_v2": null, 
    "COORD_WSG84": [
      5.4528616689, 
      46.2707399979
    ], 
    "LAT_WSG84": 5.4528616689, 
    "LONG_WSG84": 46.2707399979, 
    "NOM_COM": "CORVEISSIAT", 
    "NUM_COM": "01125", 
    "NUM_DEP": "01", 
    "PROFONDEUR_MAXI_POINT": null, 
    "Unit\u00e9_coord_fictifs": "01125_ _FRDG140", 
    "X_FICT_L93": 888869.8607, 
    "Y_FICT_L93": 6577473.549, 
    "codagence": "AERM&C", 
    "fi_ma_2007": "oui", 
    "fi_ma_2008": "oui", 
    "fi_ma_2009": "oui", 
    "fi_ma_2010": null, 
    "fi_ma_2011": null, 
    "fi_ma_2012": null, 
    "fi_ma_2013": null, 
    "fi_ma_2014": null, 
    "reseau2009": "Hors RCS et RCO", 
    "reseau2010": "horsRCSRCODRIRE", 
    "reseau2011": null, 
    "reseau2012": null, 
    "reseau2013": null, 
    "reseau2014": null
  }, 
  "07015X0009/F": {
    "ALTITUDE": 229.0, 
    "CD_ME_niv1_surf": "DG330", 
    "CD_ME_v2": "DG330", 
    "COORD_WSG84