In [1]:
'''
-----------------------------
STATIONS - PESTICIDES - STATS
-----------------------------

GOAL : notebook python functions to add at root (app initialization on run.py) 
create panda objects / implement query functions / export to JSON 
for data analysis and visualization

- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES
- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)
- CLEAN AND MERGE DATA
- QUERY FUNCTIONS
- EXPORT FUNCTIONS (JSON)

AUTHOR : Julien Paris
DATE   : 01/01/2017

TO DO : 
- 
'''

'\n-----------------------------\nSTATIONS - PESTICIDES - STATS\n-----------------------------\n\nGOAL : notebook python functions to add at root (app initialization on run.py) \ncreate panda objects / implement query functions / export to JSON \nfor data analysis and visualization\n\n- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES\n- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)\n- CLEAN AND MERGE DATA\n- QUERY FUNCTIONS\n- EXPORT FUNCTIONS (JSON)\n\nAUTHOR : Julien Paris\nDATE   : 28/12/2016\n\nTO DO : \n- \n'

In [2]:
### import standard libraries
import os
import itertools
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

# pyproj settings to convert coordinates
from pyproj import Proj, transform
inProj  = Proj(init='epsg:2154') # proj in  : Lambert 93
outProj = Proj(init='epsg:4326') # proj out : WSG 84

In [3]:
### basic folders addresses and names
cwd = os.getcwd()

data_folder      = "app/static/data"
stats_folder     = "stats"
_web             = "_web" 
stats_web_folder = stats_folder + _web

stats_path     = os.path.join(cwd, data_folder, stats_folder)
stats_web_path = os.path.join(cwd, data_folder, stats_web_folder)

print "-- cwd :", cwd
print "-- stats path : "    , stats_path
print "-- stats web path : ", stats_web_path

for file in os.listdir(stats_path):
    if file.endswith(".csv") or file.endswith(".xlsx") or file.endswith(".xls"):
        print "--- dataset in '/data' : ", file
        #print cwd+datas_folder+"/"+file

-- cwd : /Users/jpy/Dropbox/_FLASK/concours_pesticides
-- stats path :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats
-- stats web path :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012_copy.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2007.xlsx
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2007_copy.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2008.xlsx
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2008_copy.csv
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2009.xlsx
--- dataset in '/data' :  ma_qp_fm_ttres_pesteso_2009_copy.csv
--- dataset in '/data' :  moy

In [4]:
# set encoding and variables for .csv (keep accents)

csv_encoding = "latin-1"

_csv     = ".csv"
_sep_csv = ";"
_xls     = ".xls"
_xlsx    = ".xlsx"
_copy    = "_copy"

### IF SET TO "False" REDO COPIES FROM ORGINAL XLSX
copies_done = True

def excel_to_csv_temp(df_from_excel, df_name):
    print "df_name : ", df_name
    outfilename = os.path.join( stats_path, df_name + _copy + _csv )
    print "outfilename : ", outfilename
    df_from_excel.to_csv(outfilename, sep=_sep_csv, encoding = csv_encoding )


In [5]:
### panda dataframes for every db + settings

# common code for all pesticcides
all_pesticides_code = "XXXXXX"


#np.array = time_frame

# root strings for datas names
root_mct = "df_mct_"
root_ma  = "df_ma_"

# list of datas filenames
data_stations   = {"files": "stations"  , "ext" : _xlsx, "path" : stats_path }
data_pesticides = {"files": "pesticides", "ext" : _xls , "path" : stats_path }


### good source in .xlsx
data_MCT = { 
    "ext" : _xlsx,
    "path": stats_path,
    "files": [
        "moy_tot_quantif_2007",
        "moy_tot_quantif_2008",
        "moy_tot_quantif_2009",  
        "moy_tot_quantif_2010",
        "moy_tot_quantif_2011",
        "moy_tot_quantif_2012",
    ]
}


### good source in .xlsx
data_MA = { 
    "ext" : _xlsx,
    "path": stats_path,
    "files": [
        "ma_qp_fm_ttres_pesteso_2007",
        "ma_qp_fm_ttres_pesteso_2008",
        "ma_qp_fm_ttres_pesteso_2009",
        "ma_qp_fm_rcsrco_pesteso_2010",
        "ma_qp_fm_rcsrco_pesteso_2011",
        "ma_qp_fm_rcsrco_pesteso_2012",
        ]
}



In [6]:
# set time frame
#years   = {"ANNEE" : [2007, 2008, 2009, 2010, 2011, 2012 ] }


In [7]:
### functions : cleaning operations on dataframes

idx = pd.IndexSlice

def stat_file_path(filename):
    path = os.path.join(stats_path, filename)
    return path 


def checkDTypes (df) :
    # check data type
    
    for index in df.index.names :
        print "---- index : ", index

    for col in df.columns :
        #label = col.values
        dtype = df[col].dtype
        
        print "---- dtypes col : ", col, "/", dtype
        

In [8]:
def comas2points(df, list_col_names="all_col"): 
    # convert all weird "," to "." and then to float values
    
    if list_col_names == "all_col" : 
        df.loc[:, :] = df.replace(to_replace=',', value='.', regex=True)
    else : 
        df.loc[:, list_col_names ] = df.loc[:,list_col_names].replace(to_replace=',', value='.', regex=True)
    return df


def ints2floats(df, list_col_names, to="float") :
    
    if to == "float":
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(float)
    elif to == "int" :
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(int)        
    return df


In [9]:
def dfCleanNa(df_list): 
    # clean from NaN values if entire row is NaN
    
    df_list_clean = []
    for df in df_list :
        df_cleaned_01 = df.dropna(how="all") # on empty rows
        df_cleaned_02 = df_cleaned_01.dropna( axis=1, how="all") # on empty columns
        df_list_clean.append(df_cleaned_02)
    
    return df_list_clean


In [10]:

#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#
#   -- DATAS TO DATA FRAMES --                         #
#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#


In [11]:

########################################################
########################################################
########################################################
### -- DF_PESTICIDES --
########################################################
########################################################
########################################################


In [144]:
### JUST DO IT ONCE
# copy original pesticides data

if copies_done == False :
    
    pesticides_original_data = os.path.join( data_pesticides["path"], data_pesticides["files"] + data_pesticides["ext"] )
    print pesticides_original_data

    df_pesticides_original_data = pd.read_excel( pesticides_original_data )

    excel_to_csv_temp( df_pesticides_original_data, data_pesticides["files"] )


In [207]:
functions_cols= ["CODE_FONCTION","LIBELLE CODE_FONCTION"]

functions_split = {
    "A"   : "A",
    "B"   : "B",
    "BF"  : "B,F",
    "F"   : "F",
    "FA"  : "F,A",
    "FHM" : "F,H,M",
    "FN"  : "F,N",
    "H"   : "H",
    "I"   : "I",
    "IA"  : "I,A",
    "IAFH": "I,A,F,H",
    "IAM" : "I,A,M",
    "IAN" : "I,A,N",
    "IM"  : "I,M",
    "IN"  : "I,N",
    "Ireg": "I,Reg",
    "N"   : "N",
    "R"   : "Ro", #### twin with Ro
    "Reg" : "Reg",
    "RepO": "RepO",
    "Ro"  : "Ro", ####
    "HFNI": "H,F,N,I",
    "HG"  : "H,G",
    
    "PP"  : "PP"
}

'''
functions_full = {
    "A"    : "Acaricide",
    "B"    : "Biocide",
    "BF"   : "Biocide, Fongicide",
    "F"    : "Fongicide",
    "FA"   : "Fongicide, Acaricide",
    "FHM"  : "Fongicide, Herbicide, Mollusticide",
    "FN"   : "Fongicide, Nématicide",
    "H"    : "Herbicide",
    "I"    : "Insecticide",
    "IA"   : "Insecticide, Acaricide",
    "IAFH" : "Insecticide, Acaricide, Fongicide, Herbicide",
    "IAM"  : "Insecticide, Acaricide, Mollusticide",
    "IAN"  : "Insecticide, Acaricide, Nématicide",
    "IM"   : "Insecticide, Mollusticide",
    "IN"   : "Insecticide, Nématicide",
    "Ireg" : "Insecticide, Régulateur de croissance",
    "N"    : "Nématicide",
    "R"    : "Rodenticide", ### twin with Ro
    "Reg"  : "Régulateur de croissance",
    "RepO" : "Répulsif",
    "Ro"   : "Rodenticide", ####
    "HFNI" : "Herbicide, Fongicide, Nématicide, Insecticide",
    "HG"   : "Herbicide, Graminicide"
}
'''

functions_light = {
    "A"   : "Acaricide",
    "B"   : "Biocide",
    "F"   : "Fongicide",
    "H"   : "Herbicide",
    "I"   : "Insecticide",
    "M"   : "Mollusticide",
    "N"   : "Nématicide",
    #"R"   : "Rodenticide", ### twin with Ro
    "Reg" : "Régulateur de croissance",
    #"reg" : "Régulateur de croissance",
    "RepO": "Répulsif",
    "Ro"  : "Rodenticide", ### twin with R
    "G"   : "Graminicide",
    "PP"  : "No ref on 'PP'" #### unknown
}

### 7441 / Furilazole  / PP
### 7513 / Fenchlorazole-ethyl / PP

### optional
df_functions = pd.Series(functions_light, name="LIBELLE_CODE_FONCTION")
df_functions.index.name = 'CODE_FONCTION'
df_functions.reset_index()
df_functions = df_functions.to_frame()

df_functions #["A"]


Unnamed: 0_level_0,LIBELLE_CODE_FONCTION
CODE_FONCTION,Unnamed: 1_level_1
A,Acaricide
B,Biocide
F,Fongicide
G,Graminicide
H,Herbicide
I,Insecticide
M,Mollusticide
N,Nématicide
PP,No ref on 'PP'
Reg,Régulateur de croissance


In [208]:
test_function = str(df_pesticides.loc[("Carbamates",1093 )]["CODE_FONCTION"])
print test_function

LB_PARAMETRE
Thiodicarbe    I,M
Name: CODE_FONCTION, dtype: object


In [210]:
# read pesticides list

pesticides_csv_filepath = os.path.join( stats_path, data_pesticides["files"] + _copy + _csv)
print pesticides_csv_filepath

df_pesticides = pd.read_csv( pesticides_csv_filepath, sep=_sep_csv, encoding=csv_encoding )
#df_pesticides = comas2points(df_pesticides, ["NORME_DCE"])
#df_pesticides = ints2floats (df_pesticides, ["NORME_DCE"])
df_pesticides.drop('Unnamed: 0', axis=1, inplace=True)

##### dates  : col "DATE_NA_USAGE"
df_pesticides["DATE_NA_USAGE"] = pd.to_datetime(df_pesticides["DATE_NA_USAGE"], infer_datetime_format=True, errors='coerce')

##### replace : col "CODE_FONCTION"
df_pesticides["CODE_FONCTION"].replace(functions_split, inplace=True)


### add explanations FUNCTIONS
def add_function(row):
    
    #print row["CODE_FONCTION"]
    
    functions = np.NaN
    
    if pd.isnull(row["CODE_FONCTION"]) == False :
    #if row["CODE_FONCTION"] != np.NaN :
        function_list = [ functions_light[f] for f in row["CODE_FONCTION"].split(",") ]
        functions = str.join(", ", function_list )
        #print functions
    
    return functions
    #print

df_pesticides["FONCTIONS"] = df_pesticides.apply( add_function, axis=1 )

# set index : CODE_PESTICIDE 
df_pesticides.set_index(["CODE_FAMILLE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
#df_pesticides.set_index(["CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
df_pesticides.sort_index(inplace=True) 

#df_pesticides["CODE_FONCTION"].head(10)
df_pesticides.head()

/Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats/pesticides_copy.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NOM_PARAM2,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,6276,Somme pesticides analyses,Pesticides totaux,,,,,,Non renseigné,NaT,,0.5,
,6824,"N,N-Dimet-tolylsulphamid",DMST,,,,,,66840-71-9,NaT,C9H14N2O2S,0.1,
,6856,Acetochlor ESA,t-sulfonic acid,,,oui,1903.0,Acétochlore,187022-11-3,NaT,C14H21NO5S,0.1,
,6862,Acetochlor OXA,t-oxanilic acid,,,oui,1903.0,Acétochlore,194992-44-4,NaT,C14H19NO4,0.1,
,7514,Thiophanate-ethyl,,F,PNA,,,,23564-06-9,NaT,C14H18N4O4S2,0.1,Fongicide


In [211]:
pest_famille_list = list(df_pesticides.index.levels[0])
print "-- len pest_famille_list", len(pest_famille_list)
print pest_famille_list
print

print " -- df_pesticides.index.names    : ", df_pesticides.index.names
print " -- df_pesticides.index.values   : ", df_pesticides.index.values
print " -- df_pesticides.columns.values : ", df_pesticides.columns.values
print " -- df_pesticides.columns        : ", df_pesticides.columns
print 

checkDTypes(df_pesticides)



-- len pest_famille_list 31
[u'Ald\xe9hydes et c\xe9tones', u'Amides', u'Amines', u'Autres \xe9l\xe9ments min\xe9raux', u'Azoles', u'Benz\xe8ne et d\xe9riv\xe9s', u'COHV, solvants chlor\xe9s, fr\xe9ons', u'Carbamate', u'Carbamates', u'Carbamates et thiocarbamates', u'Chloroacetamide ', u'Chloroac\xe9tamide', u'Chloroalcanes', u'Compos\xe9s ph\xe9noliques', u'Diazines', u'Divers (organiques)', u'Fongicides', u'Hydrocarbures et indices li\xe9s', u'Inconnu', u'Indices', u'Metaux et m\xe9tallo\xefdes', u'Organochlor\xe9s', u'Organom\xe9talliques', u'Organophosphor\xe9s', u'Pyridines', u'Pyr\xe9thrino\xefdes', u'Quinazolinones', u'Triazines et m\xe9tabolites', u'Triazoles', u'Triazolopyrimidines sulfonamides', u'Ur\xe9es']

 -- df_pesticides.index.names    :  [u'CODE_FAMILLE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_pesticides.index.values   :  [(nan, 6276, u'Somme pesticides analyses')
 (nan, 6824, u'N,N-Dimet-tolylsulphamid') (nan, 6856, u'Acetochlor ESA')
 ..., (u'Ur\xe9es', 9055, u'1-(

In [212]:
df_pesticides.shape

(1046, 11)

In [213]:
### test slicing
df_pesticides.loc[ idx[:,1130] , : ]["CODE_FONCTION"] #.head(3)


CODE_FAMILLE  CD_PARAMETRE  LB_PARAMETRE
Carbamates    1130          Carbofuran      I,N
Name: CODE_FONCTION, dtype: object

In [215]:
df_pesticides.loc[ df_pesticides["CODE_FONCTION"] == "PP"] #.head(3) 
### 7441 / Furilazole  / PP
### 7513 / Fenchlorazole-ethyl / PP


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NOM_PARAM2,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Inconnu,7441,Furilazole,Furilazole,PP,PA,,,,121776-33-8,NaT,C11H13Cl2NO3,0.1,No ref on 'PP'
Inconnu,7513,Fenchlorazole-ethyl,Fenchlorazole-ethyl,PP,,,,,103112-35-2,NaT,C12H8Cl5N3O2,0.1,No ref on 'PP'


In [120]:
### test slicing
df_pesticides.loc[ idx[:,1432:1474], :] #.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NOM_PARAM2,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE
CODE_FAMILLE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Carbamates,1463,Carbaryl,Carbaryl,"I,Reg",PNA,,,,63-25-2,2008-11-20,C12H11NO2,0.1
Carbamates,1474,Chlorprophame,Chlorprophame,Reg,PA,,,,101-21-3,NaT,C10H12ClNO2,0.1
Divers (organiques),1432,Pyriméthanil,Pyriméthanil,F,PA,,,,53112-28-0,NaT,C12H13N3,0.1
Divers (organiques),1473,Chlorothalonil,Chlorothalonil,F,PA,,,,1897-45-6,NaT,C8Cl4N2,0.1
Organochlorés,1472,Chloropicrine,Chloropicrine,"F,N",PNA,,,,76-06-2,NaT,CCl3NO2,0.1
Organophosphorés,1464,Chlorfenvinphos,Chlorfenvinphos,I,PNA,,,,470-90-6,2007-12-31,C12H14Cl3O4P,0.1


In [19]:
#df_pesticides.info()

In [20]:
#df_pesticides.memory_usage()

In [217]:

########################################################
########################################################
########################################################
### -- DF_STATIONS -- 
########################################################
########################################################
########################################################


In [218]:
### JUST DO IT ONCE
# copy original stations data

if copies_done == False :

    stations_original_data = os.path.join( data_stations["path"], data_stations["files"] + data_stations["ext"] )
    print stations_original_data

    df_stations_original_data = pd.read_excel( stations_original_data )

    excel_to_csv_temp( df_stations_original_data, data_stations["files"] )


In [219]:
### read stations.csv

#lab_stations = "INFOS"
#df_stations  = pd.read_csv( stat_file_path(datas_stations), sep=";", encoding=csv_encoding , na_values=[""] )

stations_csv_filepath = os.path.join( stats_path, data_stations["files"] + _copy + _csv)
print stations_csv_filepath

df_stations = pd.read_csv( stations_csv_filepath, sep=_sep_csv, encoding=csv_encoding, na_values=[""] )
df_stations.drop('Unnamed: 0', axis=1, inplace=True)

'''
IMPORTANT : 
name column to link to carto (.shp file) : 
"CD_ME_v2" | "CD_ME_niv1_surf"

for instance : 
"DG330" in column "CD_ME_v2" | "CD_ME_niv1_surf" in df_stations
... corresponds to :
"DG330" in column "CdMasseDEa" in gdf object (geopandas from .shp file)

''' 

# add columns CD_PARAMETRE, LB_PARAMETRE
#df_stations["CD_PARAMETRE"] = 99999
#df_stations["LB_PARAMETRE"] = "all pesticides"

# get columns labels
#col_labels_stations = list(df_stations.columns.values)
#print " -- col_labels :", df_stations[0:5]

# add multilevel hierarchy on columns
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations, "NO_DATE"])
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations])

#to_float = ["ALTITUDE", "PROFONDEUR_MAXI_POINT", "X_FICT_L93", "Y_FICT_L93"]
#df_stations = comas2points(df_stations, to_float)
#df_stations = ints2floats (df_stations, to_float)

#print "-- indices names :", df_stations.index.name

#print df_stations["Unnamed: 26"].unique()
#df_stations.drop('Unnamed: 26', axis=1, inplace=True)

#print df_stations.columns.values

df_stations.head(3)


/Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats/stations_copy.csv


Unnamed: 0,CD_STATION,NUM_COM,NOM_COM,NUM_DEP,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,...,reseau2013,reseau2014,fi_ma_2007,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014
0,06521X0019/SCE,1125,CORVEISSIAT,1,AERM&C,459.0,,01125_ _FRDG140,888869.860702,6577474.0,...,,,oui,oui,oui,,,,,
1,07015X0009/F,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.0,15.2,01133_FRDG511_FRDG330,916062.939502,6525298.0,...,,,oui,,,,,,,
2,07015X0010/P,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.8,16.0,01133_FRDG511_FRDG330,915390.033302,6524380.0,...,,,,oui,,,,,,


In [220]:
### FOR CARTO : add column for long lat in WSG84

def convertCoordinates(row):
    x1,y1 = row["X_FICT_L93"], row["Y_FICT_L93"]
    convertedCoord = transform(inProj,outProj, x1, y1)
    return list(convertedCoord)

def extractFromList(index):
    value = row[colName][index]
    print value
    return value

df_stations["COORD_WSG84"] = df_stations.apply(convertCoordinates,axis=1)
#df_stations["LAT_WSG84"]   = df_stations.apply(lambda row: extractFromList(row['COORD_WSG84'], 0), axis=1)
#df_stations["LONG_WSG84"]  = df_stations.apply(extractFromList(index=1),axis=1)

## cf : http://chrisalbon.com/python/pandas_expand_cells_containing_lists.html
# expand df.tags into its own dataframe
coord = df_stations['COORD_WSG84'].apply(pd.Series)
# rename each variable with its tag
#coord = coord.rename(columns = lambda x : 'COORD_' + str(x))
coord.columns = ["LAT_WSG84","LONG_WSG84"] 
#print coord.head()

print coord.head()
print 

# copy CD_STATION column for further uses
df_stations["CD_STATION_"] = df_stations["CD_STATION"]

# join the coord dataframe back to the original dataframe
df_stations = pd.concat( [df_stations, coord], axis=1, join="outer" )
#df_stations.head(3)


   LAT_WSG84  LONG_WSG84
0   5.452862   46.270740
1   5.781881   45.793046
2   5.772809   45.785001
3   5.788505   45.844201
4   5.074473   45.836095



In [221]:
print list(df_stations.columns)

[u'CD_STATION', u'NUM_COM', u'NOM_COM', u'NUM_DEP', u'codagence', u'ALTITUDE', u'PROFONDEUR_MAXI_POINT', u'Unit\xe9_coord_fictifs', u'X_FICT_L93', u'Y_FICT_L93', u'CD_ME_v2', u'CD_ME_niv1_surf', u'reseau2009', u'reseau2010', u'reseau2011', u'reseau2012', u'reseau2013', u'reseau2014', u'fi_ma_2007', u'fi_ma_2008', u'fi_ma_2009', u'fi_ma_2010', u'fi_ma_2011', u'fi_ma_2012', u'fi_ma_2013', u'fi_ma_2014', 'COORD_WSG84', 'CD_STATION_', 'LAT_WSG84', 'LONG_WSG84']


In [222]:
# set indexes for stations
#df_stations.set_index( ["CD_STATION"], inplace=True) 
df_stations.set_index(["NUM_DEP", "NOM_COM",  "CD_ME_niv1_surf", "CD_ME_v2", "CD_STATION"], inplace=True) 
df_stations.sort_index(inplace=True) 

print "-- df_stations.shape : ", df_stations.shape
checkDTypes(df_stations)


-- df_stations.shape :  (13039, 25)
---- index :  NUM_DEP
---- index :  NOM_COM
---- index :  CD_ME_niv1_surf
---- index :  CD_ME_v2
---- index :  CD_STATION
---- dtypes col :  NUM_COM / object
---- dtypes col :  codagence / object
---- dtypes col :  ALTITUDE / float64
---- dtypes col :  PROFONDEUR_MAXI_POINT / object
---- dtypes col :  Unité_coord_fictifs / object
---- dtypes col :  X_FICT_L93 / float64
---- dtypes col :  Y_FICT_L93 / float64
---- dtypes col :  reseau2009 / object
---- dtypes col :  reseau2010 / object
---- dtypes col :  reseau2011 / object
---- dtypes col :  reseau2012 / object
---- dtypes col :  reseau2013 / object
---- dtypes col :  reseau2014 / object
---- dtypes col :  fi_ma_2007 / object
---- dtypes col :  fi_ma_2008 / object
---- dtypes col :  fi_ma_2009 / object
---- dtypes col :  fi_ma_2010 / object
---- dtypes col :  fi_ma_2011 / object
---- dtypes col :  fi_ma_2012 / object
---- dtypes col :  fi_ma_2013 / object
---- dtypes col :  fi_ma_2014 / object
---- d

In [223]:
df_stations.shape

(13039, 25)

In [224]:
#df_stations.info()


In [225]:
### list of Masses d'Eau : 
### "CD_ME_niv1_surf" | "CD_ME_v2" in stats == "CdMasseDEa" in .shp

MEs_niv1_list = list(df_stations.index.get_level_values("CD_ME_niv1_surf").unique() )
MEs_niv1_list.sort()

MEs_niv2_list = list(df_stations.index.get_level_values("CD_ME_v2").unique() )
MEs_niv2_list.sort()


In [226]:
print len(MEs_niv1_list)
print MEs_niv1_list[:10]

550
[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [227]:
print len(MEs_niv2_list)
print MEs_niv2_list[:10]

566
[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [228]:
MEs_all_list = list(set(MEs_niv1_list + MEs_niv2_list))
MEs_all_list.sort()
print MEs_all_list[:10]


[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [229]:
#for ME in MEs_all_list[1:] : 
#    if ME.startswith("GG"):
#        print ME

In [230]:
df_stations.head(8)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
,,,CG004,01688X0034/AVAL,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
,,,CG004,01688X0039/F1,,AERM,170.0,80.0,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
1.0,AMBERIEU-EN-BUGEY,DG149,DG149,06758X0052/HY,1004.0,AERM&C,310.0,,01004_ _FRDG149,883079.012902,6544021.0,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[5.36469511404, 45.9712376427]",06758X0052/HY,5.364695,45.971238
1.0,AMBLEON,DG149,DG149,07007X0001/006A,1006.0,AERM&C,420.0,,01006_ _FRDG149,900470.478202,6520388.0,Hors RCS et RCO,,,...,,,,,,,"[5.57920725651, 45.7536265801]",07007X0001/006A,5.579207,45.753627
1.0,AMBRONAY,DG389,,06754X0040/007A,1007.0,AERM&C,243.0,12.5,01007_FRDG240_FRDG389,880412.777902,6547074.0,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[5.3314414812, 45.9994318369]",06754X0040/007A,5.331441,45.999432
1.0,AMBRONAY,DG389,DG389,06754X0065/P2,1007.0,AERM&C,243.0,21.0,01007_FRDG240_FRDG389,879122.807302,6546539.0,RCS,RCSseul,RCSseul,...,,,,,oui,,"[5.31457812135, 45.9949511342]",06754X0065/P2,5.314578,45.994951
1.0,AMBRONAY,DG389,DG389,06754X0071/P00060,1007.0,AERM&C,243.0,,01007_FRDG240_FRDG389,881453.657802,6545485.0,Hors RCS et RCO,,,...,oui,,,,,,"[5.34427492816, 45.9848443524]",06754X0071/P00060,5.344275,45.984844


In [34]:

########################################################
########################################################
########################################################
### -- DF_MCT (moy concentrations totales)  --
########################################################
########################################################
########################################################


In [35]:
### JUST DO IT ONCE
# copy original MCT datato CSV

if copies_done == False :

    for MCT_file in data_MCT["files"] : 

        MCT_original_data = os.path.join( data_MCT["path"], MCT_file + data_MCT["ext"] )
        print MCT_original_data

        df_MCT_original_data = pd.read_excel( MCT_original_data )

        excel_to_csv_temp( df_MCT_original_data, MCT_file)
    


In [36]:
#lab_MCT = "MCT"

## read datas MCT

df_mct_2007 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][0]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2008 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][1]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2009 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][2]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2010 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][3]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2011 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][4]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2012 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][5]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])

df_mct_2007.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2008.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2009.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2010.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2011.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2012.drop('Unnamed: 0', axis=1, inplace=True)



In [266]:
#df_mct_2007.shape
print " -- df_mct_2007.index.names : ", df_mct_2007.index.names
print " -- df_mct_2007.columns     : ", df_mct_2007.columns

df_mct_2007.head()

 -- df_mct_2007.index.names :  [None]
 -- df_mct_2007.columns     :  Index([u'ANNEE', u'CD_STATION', u'NBPREL', u'MOYPTOT', u'MAXPTOT',
       u'MINMOLRECH', u'MAXMOLRECH', u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')


Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ
0,2007,00054X0169/F1,4,0.0,0.0,18,96,0,0
1,2007,00057X0245/F1,2,0.0,0.0,18,96,0,0
2,2007,00057X0248/F4,2,0.02,0.04,61,96,0,1
3,2007,00061X0118/F8,4,0.0125,0.02,18,96,0,1
4,2007,00066X0042/SO,2,0.28,0.35,19,19,2,2


In [265]:
#checkDTypes(df_mct_2007)

---- index :  None
---- dtypes col :  ANNEE / int64
---- dtypes col :  CD_STATION / object
---- dtypes col :  NBPREL / int64
---- dtypes col :  MOYPTOT / float64
---- dtypes col :  MAXPTOT / float64
---- dtypes col :  MINMOLRECH / int64
---- dtypes col :  MAXMOLRECH / int64
---- dtypes col :  MINMOLQ / int64
---- dtypes col :  MAQMOLQ / int64


In [39]:
#checkDTypes(df_mct_2008)

In [40]:
#checkDTypes(df_mct_2009)

In [41]:
#checkDTypes(df_mct_2010)

In [42]:
#checkDTypes(df_mct_2011)

In [43]:
#checkDTypes(df_mct_2012)

In [44]:
#df_mct_2008.head() 

#df_ = df_mct_2010.dropna(how="all")
#df_.loc[:, ("ANNEE")] = df_.loc[:, ("ANNEE")].astype(int)
#df_.head() 

In [45]:
### merge all MCT datas with multiIndex
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-multiple-dataframe-or-panel-objects
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-with-two-multi-indexes
# options/alternatives : .merge .join .concat .append

frames_mct = [df_mct_2007,df_mct_2008, df_mct_2009, df_mct_2010, df_mct_2011, df_mct_2012]

# clean from NaN values if entire row is NaN
frames_mct_cleaned = dfCleanNa(frames_mct)
    
df_MCT = pd.concat(frames_mct_cleaned)

# convert all year column data to integers
df_MCT.loc[:, "ANNEE"] = df_MCT.loc[:, "ANNEE"].astype(int) 

'''
# convert all year column data to integers
df_MCT = ints2floats(df_MCT, ["ANNEE"], to="int")

# convert all weird "," to "." and then to float values
df_MCT   = comas2points(df_MCT)
to_float = ['NBPREL', 'MOYPTOT', 'MAXPTOT', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ']
df_MCT   = ints2floats(df_MCT, to_float)

'''

# add column CD_PARAMETRE, LB_PARAMETRE
df_MCT["CD_PARAMETRE"] = all_pesticides_code
df_MCT["LB_PARAMETRE"] = "all_pesticides"


# set index hierarchy
#df_MCT.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MCT.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

print " -- df_MCT.index.names    : ", df_MCT.index.names
print " -- df_MCT.index.values   : ", df_MCT.index.values
print " -- df_MCT.columns.values : ", df_MCT.columns.values
print " -- df_MCT.columns        : ", df_MCT.columns

df_MCT.sort_index(inplace=True) 


 -- df_MCT.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MCT.index.values   :  [(u'00054X0169/F1', 2007, 'XXXXXX', 'all_pesticides')
 (u'00057X0245/F1', 2007, 'XXXXXX', 'all_pesticides')
 (u'00057X0248/F4', 2007, 'XXXXXX', 'all_pesticides') ...,
 (u'11056X0123/FIGA', 2012, 'XXXXXX', 'all_pesticides')
 (u'11195X0147/FITTEL', 2012, 'XXXXXX', 'all_pesticides')
 (u'11221X0134/TRAVO', 2012, 'XXXXXX', 'all_pesticides')]
 -- df_MCT.columns.values :  [u'NBPREL' u'MOYPTOT' u'MAXPTOT' u'MINMOLRECH' u'MAXMOLRECH' u'MINMOLQ'
 u'MAQMOLQ']
 -- df_MCT.columns        :  Index([u'NBPREL', u'MOYPTOT', u'MAXPTOT', u'MINMOLRECH', u'MAXMOLRECH',
       u'MINMOLQ', u'MAQMOLQ'],
      dtype='object')


In [46]:
print df_MCT.shape 


(11144, 7)


In [47]:
df_MCT["MOYPTOT_YEAR"] = np.NaN

checkDTypes(df_MCT)


---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBPREL / float64
---- dtypes col :  MOYPTOT / float64
---- dtypes col :  MAXPTOT / float64
---- dtypes col :  MINMOLRECH / float64
---- dtypes col :  MAXMOLRECH / float64
---- dtypes col :  MINMOLQ / float64
---- dtypes col :  MAQMOLQ / float64
---- dtypes col :  MOYPTOT_YEAR / float64


In [267]:
df_MCT.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11195X0147/FITTEL,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11221X0134/TRAVO,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,0.094931
11221X0134/TRAVO,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.119668
11221X0134/TRAVO,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11233X0118/PUGNAC,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,0.094931
11233X0118/PUGNAC,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.119668
11234X0127/BARA,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,0.094931
11234X0127/BARA,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.119668
11282X0005/ARAGUI,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,0.094931
11282X0005/ARAGUI,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.119668


In [49]:

########################################################
########################################################
########################################################
### -- DF_MA (moy analyses)  --
########################################################
########################################################
########################################################


In [50]:
### JUST DO IT ONCE !! GREEDY FOR MA DATA
# copy original MA data to CSV

if copies_done == False :

    start_time = datetime.now()
    print "-- start_time for making csv copies of MA data : ", start_time

    for MA_file in data_MA["files"] : 

        MA_original_data = os.path.join( data_MA["path"], MA_file + data_MA["ext"] )
        print MA_original_data

        df_MA_original_data = pd.read_excel( MA_original_data )

        excel_to_csv_temp( df_MA_original_data, MA_file)

    print "-- FINISH / time spent for making csv copies of MA data : ", datetime.now() - start_time


In [51]:
#lab_MA = "MA"

### DEPRECATED

def multilevel_MA (df, year):

    # set indexes : STATION and CODE_PESTICIDE
    #df.set_index(["CD_STATION", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
    #df.set_index(["CD_STATION"], inplace=True)

    # get columns labels for df_ma
    #col_labels_df_ma = list(df.columns.values)
    #print " -- col_labels :", col_labels_df_ma

    # add multilevel hierarchy on columns
    #df.columns = pd.MultiIndex.from_product([lab_MA, col_labels_df_ma, year])
    
    # convert all weird "," to "." and then to float values
    #to_float = ["MA_MOY", "NORME_DCE"]
    df       = comas2points(df, to_float)
    df       = ints2floats (df, to_float)

    df["ANNEE"] = year
    
    return df


In [52]:
### WARNING : GREEDY IF READ .XLSX --> READ .CSV COPIES

## read datas MCT

df_ma_2007 = pd.read_csv( os.path.join(stats_path, data_MA["files"][0]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2008 = pd.read_csv( os.path.join(stats_path, data_MA["files"][1]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2009 = pd.read_csv( os.path.join(stats_path, data_MA["files"][2]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2010 = pd.read_csv( os.path.join(stats_path, data_MA["files"][3]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2011 = pd.read_csv( os.path.join(stats_path, data_MA["files"][4]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2012 = pd.read_csv( os.path.join(stats_path, data_MA["files"][5]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])

'''
df_ma_2007 = pd.read_excel( stat_file_path(datas_MA_excel[0]) )
#df_ma_2007 = pd.read_csv( stat_file_path(datas_MA_csv[0]), sep=";", encoding = csv_encoding )
    
df_ma_2008 = pd.read_excel( stat_file_path(datas_MA_excel[1]) )
#df_ma_2008 = pd.read_csv( stat_file_path(datas_MA_csv[1]), sep=";", encoding = csv_encoding)

df_ma_2009 = pd.read_excel( stat_file_path(datas_MA_excel[2]) )
#df_ma_2009 = pd.read_csv( stat_file_path(datas_MA_csv[2]), sep=";", encoding = csv_encoding)

df_ma_2010 = pd.read_excel( stat_file_path(datas_MA_excel[3]) )
#df_ma_2010 = pd.read_csv( stat_file_path(datas_MA_csv[3]), sep=";", encoding = csv_encoding)

df_ma_2011 = pd.read_excel( stat_file_path(datas_MA_excel[4]) )
#df_ma_2011 = pd.read_csv( stat_file_path(datas_MA_csv[4]), sep=";", encoding = csv_encoding)

df_ma_2012 = pd.read_excel( stat_file_path(datas_MA_excel[5]) )
#df_ma_2012 = pd.read_csv( stat_file_path(datas_MA_csv[5]), sep=";", encoding = csv_encoding)

'''

df_ma_2007.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2008.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2009.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2010.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2011.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2012.drop('Unnamed: 0', axis=1, inplace=True)

df_ma_2007["ANNEE"] = 2007
df_ma_2008["ANNEE"] = 2008
df_ma_2009["ANNEE"] = 2009
df_ma_2010["ANNEE"] = 2010
df_ma_2011["ANNEE"] = 2011
df_ma_2012["ANNEE"] = 2012



In [53]:
'''
### add multilevel on index + cleaning

df_ma_2007 = multilevel_MA(df_ma_2007, 2007)
df_ma_2008 = multilevel_MA(df_ma_2008, 2008)
df_ma_2009 = multilevel_MA(df_ma_2009, 2009)
df_ma_2010 = multilevel_MA(df_ma_2010, 2010)
df_ma_2011 = multilevel_MA(df_ma_2011, 2011)
df_ma_2012 = multilevel_MA(df_ma_2012, 2012)
'''

'\n### add multilevel on index + cleaning\n\ndf_ma_2007 = multilevel_MA(df_ma_2007, 2007)\ndf_ma_2008 = multilevel_MA(df_ma_2008, 2008)\ndf_ma_2009 = multilevel_MA(df_ma_2009, 2009)\ndf_ma_2010 = multilevel_MA(df_ma_2010, 2010)\ndf_ma_2011 = multilevel_MA(df_ma_2011, 2011)\ndf_ma_2012 = multilevel_MA(df_ma_2012, 2012)\n'

In [54]:
#df_ma_2010.head() 

#df_ma_2011.head() 

#df_ma_2012.head() 

In [55]:
### WARNING : GREEDY
### merge all MA datas 

frames_MA = [df_ma_2007, df_ma_2008, df_ma_2009, df_ma_2010, df_ma_2011, df_ma_2012]

# clean from NaN values if entire row is NaN
frames_MA_cleaned = dfCleanNa(frames_MA)

# concatenate datas MA
df_MA = pd.concat(frames_MA_cleaned)

# set index hierarchy
#df_MA.set_index(["CD_STATION"], inplace=True)
#df_MA.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MA.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

#df_MA.sort_index(inplace=True) 
df_MA.sortlevel(inplace=True) 

print " -- df_MA.index.names    : ", df_MA.index.names
print " -- df_MA.index.values   : ", df_MA.index.values
print " -- df_MA.columns.values : ", df_MA.columns.values
print " -- df_MA.columns        : ", df_MA.columns


 -- df_MA.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MA.index.values   :  [(u'00053X0002/SO1', 2007, 1102, u'Aldicarbe')
 (u'00053X0002/SO1', 2007, 1107, u'Atrazine')
 (u'00053X0002/SO1', 2007, 1108, u'Atrazine d\xe9s\xe9thyl') ...,
 (u'11282X0005/ARAGUI', 2007, 2924, u'Benfuracarbe')
 (u'11282X0005/ARAGUI', 2007, 2951, u'Iprovalicarb')
 (u'11282X0005/ARAGUI', 2007, 5475, u'Thiofanox sulfoxyde')]
 -- df_MA.columns.values :  [u'NBANASPERTS1' u'MA_MOY' u'NBQUANTIF' u'NORME_DCE']
 -- df_MA.columns        :  Index([u'NBANASPERTS1', u'MA_MOY', u'NBQUANTIF', u'NORME_DCE'], dtype='object')


In [56]:
### MA : add columns for averages and custom indicators
df_MA["MA_MOY_YEAR"] = np.NaN


In [57]:
print df_MA.shape

(2779684, 5)


In [58]:
### MA : add columns for averages and custom indicators
### is_MA_MOY_sup_to_NORME_DCE --> GREEDY : delta_time : 0:11:56


def is_MA_MOY_sup_to_NORME_DCE(row):
    moy_  = row["MA_MOY"]
    norm_ = row["NORME_DCE"]
    isSup = moy_ > norm_
    return isSup


start_time = datetime.now()
print ">>> start is_MA_MOY_sup_to_NORME_DCE --> %s" %(start_time)

df_MA["MAMOY_SUP_TO_NORME"] = df_MA.apply(is_MA_MOY_sup_to_NORME_DCE,axis=1)

delta_time = datetime.now() - start_time
print ">>> start is_MA_MOY_sup_to_NORME_DCE / delta_time : %s" %(delta_time)



>>> start is_MA_MOY_sup_to_NORME_DCE --> 2017-01-01 14:04:08.525918
>>> start is_MA_MOY_sup_to_NORME_DCE / delta_time : 0:11:56.408267


In [58]:
checkDTypes(df_MA)

---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBANASPERTS1 / int64
---- dtypes col :  MA_MOY / float64
---- dtypes col :  NBQUANTIF / int64
---- dtypes col :  NORME_DCE / float64
---- dtypes col :  MA_MOY_YEAR / float64


In [59]:

df_MA.head(25)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1,
00053X0002/SO1,2007,1136,Chlortoluron,1,0.01,0,0.1,
00053X0002/SO1,2007,1137,Cyanazine,1,0.01,0,0.1,
00053X0002/SO1,2007,1177,Diuron,1,0.01,0,0.1,
00053X0002/SO1,2007,1205,Ioxynil,1,0.025,0,0.1,
00053X0002/SO1,2007,1208,Isoproturon,1,0.01,0,0.1,
00053X0002/SO1,2007,1209,Linuron,1,0.01,0,0.1,


In [206]:
idx = pd.IndexSlice

df_MA.loc[ idx[ : , :, 1107 ], : ].head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,0.021479
00053X0004/F1,2007,1107,Atrazine,1,0.01,0,0.1,0.021479


In [60]:

########################################################
########################################################
########################################################
### --- QUERIES ON DFs
########################################################
########################################################
########################################################


In [61]:
## cf : http://pandas.pydata.org/pandas-docs/stable/indexing.html#the-query-method-experimental

def queryByIndexValue (df, indexName, indexLabelList):
    queryString = '%s in %s' %( indexLabelList, indexName)
    #print queryString
    result = df.query(queryString)
    return result
#df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

def queryByColValue (df, colName, comparator, colValue ):
    queryString = '(%s %s %s)' %( colName, comparator, colValue)
    #print queryString
    result = df.query(queryString)
    return result


def getIndexValuesList(df, indexName):
    result = df.index.get_level_values(indexName).unique()
    return list(result)

def getColValuesList(df, colName ) :
    result = df[colName].unique()
    return list(result)


def listIndexUniqueValues(df) :
    dictIndex = {}
    for indexName in df.index.names :
        listValues = getIndexValuesList(df, indexName)
        dictIndex[indexName] = listValues
    return dictIndex


In [62]:

########################################################
########################################################
########################################################
### --- DF_AV /// by : 
###           year - pesticides (levels rows)
###           year - departements (levels columns) 
########################################################
########################################################
########################################################


In [63]:
years_list = [2007, 2008, 2009, 2010, 2011, 2012 ] 
print "-- len years_list", len(years_list)

departements_list = list(df_stations.index.levels[0])
print "-- len departements_list", len(departements_list)
#print departements_list

pesticides_list = list(df_pesticides.index.levels[1])
pesticides_list.append(all_pesticides_code)
print "-- len pesticides_list", len(pesticides_list)
print pesticides_list

-- len years_list 6
-- len departements_list 95
-- len pesticides_list 1044
[2, 1083, 1092, 1093, 1094, 1100, 1101, 1102, 1103, 1104, 1105, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1119, 1120, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1159, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1192, 1193, 1194, 1197, 1198, 1200, 1201, 1202, 1203, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1236, 1237, 1238, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1277, 1279, 1280, 1281, 1282, 1287, 1288, 1289, 1290, 1291, 1298, 1308, 1310, 1329, 1333, 1336, 1341, 1353, 13

In [64]:
### create df_AV dataframe dummy

tuples   = list(itertools.product(years_list, pesticides_list))
len_rows = len(tuples)
list_    = [np.NaN]*len_rows
dict_    = {"test" : list_ }

index = pd.MultiIndex.from_tuples(tuples, names=['year', 'CD_PARAMETRE'])

df_AV = pd.DataFrame(np.asarray(list_), index=index)
df_AV.head()



Unnamed: 0_level_0,Unnamed: 1_level_0,0
year,CD_PARAMETRE,Unnamed: 2_level_1
2007,2,
2007,1083,
2007,1092,
2007,1093,
2007,1094,


In [65]:
df_AV_dpt = df_AV.copy()

for dpt in departements_list :
    df_AV_dpt[str(dpt)] = np.NaN
df_AV_dpt["TOT_FRANCE"] = np.NaN

df_AV_dpt.drop(0, axis=1, inplace=True)


In [231]:
print df_AV_dpt.shape
df_AV_dpt.tail()


(6264, 96)


Unnamed: 0_level_0,Unnamed: 1_level_0,01,02,03,04,05,06,07,08,09,10,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2012,99013,,,,,,,,,,,...,,,,,,,,,,
2012,99020,,,,,,,,,,,...,,,,,,,,,,
2012,99022,,,,,,,,,,,...,,,,,,,,,,
2012,99024,,,,,,,,,,,...,,,,,,,,,,
2012,XXXXXX,0.466632,0.159532,0.103263,0.708188,,0.022414,0.068667,0.135451,0.362,0.143224,...,0.0133,0.048241,0.258008,0.05766,0.264667,0.096042,,0.76975,0.148939,0.189392


In [67]:
df_AV_ME = df_AV.copy()

for ME in MEs_all_list[1:] :
    df_AV_ME[str(ME)] = np.NaN
df_AV_ME["TOT_FRANCE"] = np.NaN

df_AV_ME.drop(0, axis=1, inplace=True)


In [68]:
print df_AV_ME.shape
df_AV_ME.tail()


(6264, 585)


Unnamed: 0_level_0,Unnamed: 1_level_0,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,AG010,...,HG402,HG501,HG502,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2012,99013,,,,,,,,,,,...,,,,,,,,,,
2012,99020,,,,,,,,,,,...,,,,,,,,,,
2012,99022,,,,,,,,,,,...,,,,,,,,,,
2012,99024,,,,,,,,,,,...,,,,,,,,,,
2012,XXXXXX,,,,,,,,,,,...,,,,,,,,,,


In [69]:

########################################################
### fill df_AV_dpt + df_AV_ME & correspondinf MCT | MA
########################################################


In [70]:
### ---> optimization A --- dic_dpt_stations

### pre-store staions per dpt in dict
### iterate through departements

dic_dpt_stations = {}
dic_dpt_stations_count = {}

for dpt in departements_list : 

    # get list of CD_STATION within dpt 
    stations_list  = df_stations.query("NUM_DEP == '%s' " %(dpt) )
    stations_list_ = list(stations_list["CD_STATION_"])
    
    dic_dpt_stations[dpt]       = stations_list_
    dic_dpt_stations_count[dpt] = len(stations_list_)
    

#print dic_dpt_stations_count

test_1 = { k: dic_dpt_stations_count[k] for k in dic_dpt_stations_count.keys()[:1]}
print test_1

test_2 = { k: dic_dpt_stations[k] for k in dic_dpt_stations.keys()[:1]}
print test_2



{u'24': 128}
{u'24': [u'08085X0023/P', u'08076X0017/ERH', u'07596X0010/F', u'07842X0005/HY', u'07821X0001/SOURCE', u'08066X0047/F', u'08066X0019/F', u'08085X0040/HY', u'07595X0022/F', u'08326X0004/HY', u'08326X0006/F', u'07584X0007/F', u'07104X0501/HY', u'08088X0015/F', u'08087X0001/HY', u'08086X0031/S', u'08085X0032/HY', u'08322X0015/P', u'07346X0002/HY', u'07345X0018/F', u'07842X0007/F2', u'08311X0001/HY', u'07583X0003/HY', u'08067X0002/HY', u'07827X0007/SOURCE', u'08305X0002/F', u'08301X0002/F', u'08305X0030/F', u'08316X0016/HY', u'07582X0005/HY', u'07827X0017/HY', u'08087X0021/F', u'07826X0010/HY', u'07107X0031/F', u'08072X0010/HY', u'07361X0014/HY', u'07361X0002/HY', u'07361X0004/S', u'07841X0019/F', u'07847X0001/HY', u'07846X0012/HY', u'07846X0013/HY', u'07348X0010/HY', u'07811X0011/F', u'08075X0014/F', u'08075X0012/HY', u'08301X0015/P', u'08065X0025/F', u'08073X0017/HY', u'08077X0030/ERH', u'08077X0005/F', u'08077X0026/S1', u'07597X0007/A25', u'07346X0013/HY', u'08066X0005/F', u

In [71]:
### test slice by ME index

test_slice = df_stations #.head(5)
test_slice

ME_1 = "DG149"
#ME_2 = "CG004"
ME_2 = "DG149"

test_loc_ME_1 = test_slice.loc[ idx[:,:,  :  , ME_1 ], : ]
test_loc_ME_2 = test_slice.loc[ idx[:,:, ME_2,  :   ], : ]
test_concat = pd.concat([test_loc_ME_1, test_loc_ME_2])
#test_concat

In [72]:
#print list(test_concat["CD_STATION_"])

In [73]:
MEs_all_list[0:5]


[nan, u'AG001', u'AG002', u'AG003', u'AG004']

In [74]:
#df_empty = pd.DataFrame()
#df_empty
#df_test_concat = pd.concat([df_empty, test_concat])
#df_test_concat

In [75]:
### ---> optimization A --- dic_ME_stations

### pre-store staions per ME in dict
### iterate through ME
### "CD_ME_niv1_surf" | "CD_ME_v2" in stats == "CdMasseDEa" in .shp

df_empty = pd.DataFrame()

dic_ME_stations = {}
dic_ME_stations_count = {}

for ME in MEs_all_list[1:] :     

    # get list of CD_STATION within ME 
    try : 
        stations_list_niv1 = df_stations.loc[ idx [ :, :, ME, :  ], : ]
    except :
        stations_list_niv1 = df_empty
    try : 
        stations_list_niv2 = df_stations.loc[ idx [ :, :, : , ME ], : ]
    except :
        stations_list_niv2 = df_empty
        
    stations_list  = pd.concat( [ stations_list_niv1, stations_list_niv2 ] )
    stations_list_ = list(stations_list["CD_STATION_"])

    dic_ME_stations[ME] = stations_list_
    dic_ME_stations_count[ME] = len(stations_list_)
    

#print dic_ME_stations_count

test_1 = { k: dic_ME_stations_count[k] for k in dic_ME_stations_count.keys()[:1]}
print test_1

#test_2 = { k: dic_ME_stations[k] for k in dic_ME_stations.keys()[:1]}
#print test_2



{u'HG217': 30}


In [76]:
############ TEST ####################
### test 1/a on df_MCT vs ME


_years_list = [2007, 2008]
_ME_list    = ["DG149", "CG004"]
source      = "MCT" 
#source      = "MA"

if source == "MCT": 
    df_source = df_MCT
    _cd_parametre = all_pesticides_code
    _column_name  = "MOYPTOT"

elif source == "MA": 
    df_source = df_MA
    _cd_parametre = 1177 
    _column_name  = "MA_MOY"

    
print "TEST MEs / %s.shape" %("df_"+ source), df_source.shape
print 


for _year in _years_list :
               
    for _ME in _ME_list :

        print "_year : %s / _ME : %s" %( _year, _ME )
        print "++ %s.shape              " %("df_"+source), df_source.shape

        _df_moy_tot_year = df_source.loc[ idx[ :, _year, _cd_parametre ] , [_column_name] ]
        print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape

        _stations_list_temp = dic_ME_stations[_ME]
        print "-- len(_stations_list_temp)  ", len(_stations_list_temp)

        _df_moy_tot_year_ME = _df_moy_tot_year.loc[ idx[ _stations_list_temp ,:, :] , :]
        print "-- _df_moy_tot_year_dpt.shape", _df_moy_tot_year_ME.shape

        #print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape
        print


TEST MEs / df_MCT.shape (11144, 8)

_year : 2007 / _ME : DG149
++ df_MCT.shape               (11144, 8)
-- _df_moy_tot_year.shape     (1978, 1)
-- len(_stations_list_temp)   135
-- _df_moy_tot_year_dpt.shape (7, 1)

_year : 2007 / _ME : CG004
++ df_MCT.shape               (11144, 8)
-- _df_moy_tot_year.shape     (1978, 1)
-- len(_stations_list_temp)   430
-- _df_moy_tot_year_dpt.shape (17, 1)

_year : 2008 / _ME : DG149
++ df_MCT.shape               (11144, 8)
-- _df_moy_tot_year.shape     (1665, 1)
-- len(_stations_list_temp)   135
-- _df_moy_tot_year_dpt.shape (2, 1)

_year : 2008 / _ME : CG004
++ df_MCT.shape               (11144, 8)
-- _df_moy_tot_year.shape     (1665, 1)
-- len(_stations_list_temp)   430
-- _df_moy_tot_year_dpt.shape (17, 1)



In [77]:
############ TEST ####################
### test 1/b on df_MCT/df_MA vs dpt


_years_list = [2007, 2008]
_dpt_list   = ['24', '44']
#source     = "MCT" 
source      = "MA"

if source == "MCT": 
    df_source = df_MCT
    _cd_parametre = all_pesticides_code
    _column_name  = "MOYPTOT"

elif source == "MA": 
    df_source = df_MA
    _cd_parametre = 1177 
    _column_name  = "MA_MOY"

    
print "TEST DPTs / %s.shape" %("df_"+source), df_source.shape
print 

for _year in _years_list :
               
    for _dpt in _dpt_list :

        print "_year : %s / _dpt : %s" %( _year, _dpt )
        print "++ %s.shape               " %("df_"+source), df_source.shape

        _df_moy_tot_year = df_source.loc[ idx[:, _year, _cd_parametre ] , [_column_name] ]
        print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape

        _stations_list_temp = dic_dpt_stations[_dpt]
        print "-- len(_stations_list_temp)  ", len(_stations_list_temp)

        _df_moy_tot_year_dpt = _df_moy_tot_year.loc[ idx[ _stations_list_temp ,:, :] , :]
        print "-- _df_moy_tot_year_dpt.shape", _df_moy_tot_year_dpt.shape

        #print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape
        print


TEST DPTs / df_MA.shape (2779684, 5)

_year : 2007 / _dpt : 24
++ df_MA.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (7633, 1)
-- len(_stations_list_temp)   128
-- _df_moy_tot_year_dpt.shape (91, 1)

_year : 2007 / _dpt : 44
++ df_MA.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (7633, 1)
-- len(_stations_list_temp)   48
-- _df_moy_tot_year_dpt.shape (45, 1)

_year : 2008 / _dpt : 24
++ df_MA.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (4682, 1)
-- len(_stations_list_temp)   128
-- _df_moy_tot_year_dpt.shape (60, 1)

_year : 2008 / _dpt : 44
++ df_MA.shape                (2779684, 5)
-- _df_moy_tot_year.shape     (4682, 1)
-- len(_stations_list_temp)   48
-- _df_moy_tot_year_dpt.shape (43, 1)



In [78]:
############ TEST ####################
### test 1 on df_MCT

_station = "00057X0248/F4"
_dpt  = '24'
_year = 2007
_cd_parametre = all_pesticides_code

print "station : %s / dpt : %s / year : %s" %(_station, _dpt, _year)

# get list of CD_STATION within dpt 
_stations_list  = df_stations.query("NUM_DEP == '%s'" %(_dpt) )
_stations_list_ = list(_stations_list["CD_STATION_"])
print "len(_stations_list_)", len(_stations_list_)

#df_moy_dpt  = df_MCT.query('CD_STATION=="%s" and ANNEE==%s ' %(station, year) )
_df_moy_dpt_ = df_MCT.query('%s in CD_STATION and ANNEE==%s and CD_PARAMETRE == "%s" ' %( _stations_list_, _year, _cd_parametre ))

print "_df_moy_dpt_.shape", _df_moy_dpt_.shape


station : 00057X0248/F4 / dpt : 24 / year : 2007
len(_stations_list_) 128
_df_moy_dpt_.shape (36, 8)


In [79]:
############ TEST ####################
### test 2 on df_MA

_dpt  = '24'
_year = 2007
_cd_parametre = str(1177)
_query_cd_parametre = "CD_PARAMETRE==%s" %(_cd_parametre)

_df_moy_   = df_MA.query("ANNEE == %s and %s" %(_year, _query_cd_parametre) )
_mean_year = _df_moy_["MA_MOY"].mean()
print pd.isnull(_mean_year), ":", _mean_year

#df_moy_

False : 0.0192648886824


In [232]:

###################################################################
### MAIN AV FUNCTION
###################################################################

###################################################################
### add columns for averages and custom indicators
###################################################################

def MoyDF_YearPest_BY_DptME(dpt_ME, year, cd_parametre, start_time, _1stRd, debug=True ):
    
    # create slicers
    idx = pd.IndexSlice

    # variables : "MOYPTOT" on df_MCT / "MA_MOY" on df_MA
    
    if cd_parametre == all_pesticides_code :
        df = df_MCT
        column_name = "MOYPTOT"
        column_mean = "MOYPTOT_YEAR"
        
    else :
        df = df_MA
        column_name = "MA_MOY"
        column_mean = "MA_MOY_YEAR"
    
    #########################################
    
    if   dpt_ME == "dpt":
        df_AV = df_AV_dpt
        dic_dptME_stations = dic_dpt_stations
    
    elif dpt_ME == "ME" :
        df_AV = df_AV_ME
        dic_dptME_stations = dic_ME_stations
    
    #########################################
    
    try : 
        #df_moy_tot_year = df.query("ANNEE == %s and %s" %(year, query_cd_parametre) )
        df_moy_tot_year = df.loc[ idx[:,year, cd_parametre] , [column_name] ]

        #print "-- %s GLOBAL - debug / mean_year %s for %s / shape df_moy_tot_year = %s" %(column_mean, year , cd_parametre, df_moy_tot_year.shape ) 

        mean_year = df_moy_tot_year[column_name].mean()
    
    except : 
        # if no cd_parametre key for this year 
        mean_year = np.NaN
    
    #########################################
    
    if debug == True :
        delta_time = datetime.now() - start_time
        print "-- %s - mean_year %s for %s : %s (delta time : %s)" %(column_mean, year , cd_parametre, mean_year, delta_time) 
    
    #########################################
    
    ### escapes if mean_year == nan (leave df_Av NaN value)
    if pd.isnull(mean_year) == True :
        pass
    
    else :
        # cf : http://stackoverflow.com/questions/28002197/pandas-proper-way-to-set-values-based-on-condition-for-subset-of-multiindex-da
        # cf : http://pandas-docs.github.io/pandas-docs-travis/advanced.html#advanced-indexing-with-hierarchical-index
        
        
        ### just copy total mean values during first round
        if _1stRd :
            
            # copy mean_year in corresponding dataframe (df)
            df.loc[ idx[ :, year, cd_parametre ] , [ column_mean ] ] = mean_year

            # copy mean_year in df_AV_dpt|df_AV_ME
            df_AV.loc[ idx[year, cd_parametre] , ['TOT_FRANCE'] ] = mean_year
        
        
        ### iterate through departements|ME
        for dptME, stations_list in dic_dptME_stations.iteritems() : 
                        
            # compute mean for dpt|ME
            
            df_moy_tot_dptME_year = df_moy_tot_year.loc[ idx[stations_list,:, :], : ]
            
            #print "-- %s DPT - df_moy_tot_dpt|ME_year %s for %s - dpt|ME %s (%s stations) / shape df_moy_dptME_year = %s" %(column_mean, year, cd_parametre, dptME, len(stations_list), df_moy_tot_dptME_year.shape ) 
            
            mean_year_dptME = df_moy_tot_dptME_year[column_name].mean()
            
            if debug == True :
                print "-- %s ----- mean_year_dpt|ME %s for %s - dpt|ME %s (%s stations) : %s" %(column_mean, year, cd_parametre, dptME, len(stations_list), mean_year_dptME) 
                
            # copy mean_year_dpt|ME in df_AV
            df_AV.loc[ idx[year, cd_parametre] , [dptME] ] = mean_year_dptME
            
            

In [233]:
### compute for df_MCT - iterate through years and dpt --> delta_time : 0:00:04

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MCT = False

_1stRd = True
_df    = "MCT"
_vs    = "dpt"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    start_lap = datetime.now()
    
    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for %s >>>>>>>>" %(_df, _vs, year)
    MoyDF_YearPest_BY_DptME( _vs, year, all_pesticides_code, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MCT)
    
    if debug_MOYPTOT_YEAR_MCT == True :
        delta_lap = datetime.now() - start_lap
        print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
        print

print 
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 
   

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt >>>>>>>> 2017-01-01 18:25:09.563079 

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2007 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2008 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2009 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2010 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2011 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2012 >>>>>>>>

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs dpt --- FINISHED --- delta_time : 0:00:05.828078


In [234]:
### compute for df_MCT - iterate through years and ME --> delta_time : 0:00:16

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MCT = False

_1stRd = True
_df    = "MCT"
_vs    = "ME"

print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    start_lap = datetime.now()
    
    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for %s >>>>>>>>" %(_df, _vs, year)
    MoyDF_YearPest_BY_DptME( _vs, year, all_pesticides_code, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MCT)
    
    if debug_MOYPTOT_YEAR_MCT == True :
        delta_lap = datetime.now() - start_lap
        print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
        print

print 
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs ME >>>>>>>> 2017-01-01 18:25:16.980209 

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2007 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2008 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2009 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2010 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2011 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2012 >>>>>>>>

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs ME --- FINISHED --- delta_time : 0:00:18.273524


In [97]:
### WARNING : TAKES ++ TIME TO PROCESS !!! aprox 40 min
### compute for df_MA - - iterate through years and dpt

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MA = False
debug_MA              = False ### break after first year if True

_1stRd = False ## don't copy mean year / already done at 1st round
_df    = "MA"
_vs    = "dpt"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s " %(_df, _vs, year)

    for pesticide in pesticides_list[:-1] :
        
        if debug_MA == True : 
            start_lap = datetime.now()
            print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s / pesticide %s " %( _df, _vs, year, pesticide)
            
        MoyDF_YearPest_BY_DptME( _vs, year, pesticide, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MA )    
        
        if debug_MOYPTOT_YEAR_MA == True : 
            delta_lap = datetime.now() - start_lap
            print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %( _df, _vs, year, delta_lap)
            print
            
            # break after 1st pesticide
            break
        
    # break after 1st year : 2007 
    if debug_MA : 
        break

print
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 


>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt >>>>>>>> 2017-01-01 15:25:25.052799 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2007 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2008 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2009 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2010 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2011 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2012 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt --- FINISHED --- delta_time : 0:36:36.054669


In [256]:
### WARNING : TAKES ++++ TIME TO PROCESS !!! delta_time : 01:20:00
### compute for df_MA - - iterate through years and ME --> 01:30:00 approx

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MA = False  
debug_MA              = False  ### break after 1st year if True

_1stRd = False ## don't copy mean year / already done at 1st round
_df    = "MA"
_vs    = "ME"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list[1:] :

    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s " %(_df, _vs, year)

    for pesticide in pesticides_list[:-1] :
        
        if debug_MA == True : 
            start_lap = datetime.now()
            print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s / pesticide %s " %(_df, _vs, year, pesticide)
            
        MoyDF_YearPest_BY_DptME( _vs, year, pesticide, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MA )    
        
        if debug_MOYPTOT_YEAR_MA == True : 
            delta_lap = datetime.now() - start_lap
            print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
            print
            
            break
        
    #break after 1st year : 2007 
    if debug_MA : 
        break


print
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 


>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME >>>>>>>> 2017-01-01 19:59:21.390769 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2008 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2009 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2010 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2011 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2012 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME --- FINISHED --- delta_time : 1:04:23.460606


In [244]:
print df_AV_dpt.shape
df_AV_dpt.head()
#df_AV_dpt.tail()

(6264, 96)


Unnamed: 0_level_0,Unnamed: 1_level_0,01,02,03,04,05,06,07,08,09,10,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,2,,,,,,,,,,,...,,,,,,,,,,
2007,1083,0.023464,0.010379,0.019545,0.024118,0.024571,0.01,0.01,0.003418,,0.005305,...,0.009063,0.005,0.01,0.01,0.019323,0.01,0.01,0.01,0.022549,0.015277
2007,1092,0.024488,0.045,0.040705,0.024677,0.024857,0.02,0.02,0.04,,0.05,...,0.025857,0.01375,0.05,0.02,0.03023,0.0425,0.025,0.04375,0.0375,0.030659
2007,1093,,,,,,,,,,,...,,,,,,,,,,
2007,1094,0.023464,0.01,0.013269,0.024118,0.024571,0.01,0.01,0.008,,0.01,...,0.01,0.005,0.01,0.022273,0.005526,0.005,0.005,0.005,0.021732,0.013958


In [257]:
print df_AV_ME.shape
df_AV_ME.head()

(6264, 585)


Unnamed: 0_level_0,Unnamed: 1_level_0,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,AG010,...,HG402,HG501,HG502,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE
year,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,2,,,,,,,,,,,...,,,,,,,,,,
2007,1083,0.01,0.01,0.01,0.01,0.010909,0.0165,,0.01,0.01,0.012,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.00875,
2007,1092,0.045833,0.036667,0.04381,0.036667,0.040889,0.045254,0.05,0.048095,0.040909,0.045849,...,0.026914,0.05,0.030167,0.028571,0.032381,0.030139,0.029111,0.027232,0.05,
2007,1093,,,,,,,,,,,...,,,,,,,,,,
2007,1094,,,,,,,,,,,...,0.005,0.01,0.005,0.005,0.005625,0.005,0.005,0.005,0.01,


In [258]:
df_MCT.loc[ idx[:,2012,:,:], : ].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10982X0003/SEGRE,2012,XXXXXX,all_pesticides,4.0,0.0,0.0,409.0,409.0,0.0,0.0,0.189392
11013X0002/F,2012,XXXXXX,all_pesticides,5.0,0.0378,0.059,105.0,409.0,0.0,2.0,0.189392
11056X0123/FIGA,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11195X0147/FITTEL,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392
11221X0134/TRAVO,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,23.0,23.0,0.0,0.0,0.189392


In [271]:
df_MA.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,0.026375
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,0.021479
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,0.038045
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1,0.020551
00053X0002/SO1,2007,1136,Chlortoluron,1,0.01,0,0.1,0.018465


In [260]:

############################################
############################################
############################################
### EXPORTS FOR WEB CONSUMMING 
############################################
############################################
############################################


### df_pesticides --> CSV 
### df_stations   --> CSV + GEOJSON 
### df_MCT        --> CSV 
### df_MA         --> CSV 
### df_AV_dpt     --> CSV 
### df_AV_MA      --> CSV 



#csv_encoding = "latin-1"

#_csv     = ".csv"
#_sep_csv = ";"
#_web     = "_web" 



In [274]:
df_stations.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
,,,CG004,01688X0034/AVAL,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856


In [261]:

## drop useless columns for web use u'Unit\xe9_coord_fictifs'
drop_col_stations = [ u'Unité_coord_fictifs', 'X_FICT_L93','Y_FICT_L93', 'COORD_WSG84', 'CD_STATION_']
df_stations_web = df_stations.drop( drop_col_stations, axis=1 )

df_stations.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,reseau2011,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
,,,CG004,01688X0034/AVAL,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
,,,CG004,01688X0039/F1,,AERM,170.0,80.0,,0.0,0.0,Hors RCS et RCO,,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
1.0,AMBERIEU-EN-BUGEY,DG149,DG149,06758X0052/HY,1004.0,AERM&C,310.0,,01004_ _FRDG149,883079.012902,6544021.0,Hors RCS et RCO,horsRCSRCODRIRE,,...,,,,,,,"[5.36469511404, 45.9712376427]",06758X0052/HY,5.364695,45.971238
1.0,AMBLEON,DG149,DG149,07007X0001/006A,1006.0,AERM&C,420.0,,01006_ _FRDG149,900470.478202,6520388.0,Hors RCS et RCO,,,...,,,,,,,"[5.57920725651, 45.7536265801]",07007X0001/006A,5.579207,45.753627


In [273]:
print df_MCT#[["MINMOLRECH", "MAXMOLRECH"]].astype(int)

                                                     NBPREL  MOYPTOT  MAXPTOT  \
CD_STATION        ANNEE CD_PARAMETRE LB_PARAMETRE                               
00054X0169/F1     2007  XXXXXX       all_pesticides     4.0   0.0000     0.00   
                  2012  XXXXXX       all_pesticides     1.0   0.0000     0.00   
00057X0245/F1     2007  XXXXXX       all_pesticides     2.0   0.0000     0.00   
                  2008  XXXXXX       all_pesticides     1.0   0.0200     0.02   
                  2010  XXXXXX       all_pesticides     1.0   0.0000     0.00   
                  2012  XXXXXX       all_pesticides     1.0   0.0000     0.00   
00057X0248/F4     2007  XXXXXX       all_pesticides     2.0   0.0200     0.04   
                  2008  XXXXXX       all_pesticides     1.0   0.0000     0.00   
                  2009  XXXXXX       all_pesticides     1.0   0.0000     0.00   
                  2010  XXXXXX       all_pesticides     1.0   0.0000     0.00   
                  2012  XXXX

In [281]:
### round values in df_ to save space :

decimals = 3 ## round : 0.1234566 --> to : 0.123

integers = 0
#MCT : MINMOLRECH MAXMOLRECH MINMOLQ MAQMOLQ

# round values in MCT / MA
df_MCT_web = df_MCT.round( { 
                             'MA_MOY_YEAR' : decimals,
                             #'MINMOLRECH'  : integers,
                             #'MAXMOLRECH'  : integers,
                             #'MINMOLQ'     : integers,
                             #'MAQMOLQ'     : integers
                           } )
df_MCT_web = df_MCT_web[ ['MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ' ] ].astype(int)
#df_MCT_web  = df_MCT.index.droplevel( "LB_PARAMETRE" )

df_MA_web  = df_MA.round(  { 'MOYPTOT_YEAR': decimals } )
#df_MA_web  = df_MA.index.drop( "LB_PARAMETRE" )


# round all AV values 
df_AV_dpt_web = df_AV_dpt.round(decimals)
df_AV_ME_web  = df_AV_ME.round(decimals)


# drop NaN row in AV
df_AV_dpt_web = df_AV_dpt_web.dropna( axis=0, how="all") # on empty rows
df_AV_ME_web  = df_AV_ME_web.dropna(  axis=0, how="all") # on empty rows


In [None]:
df_MCT_web.index.droplevel(3)
#df_MA_web.index.droplevel(3)

df_MCT_web.head(3)

In [276]:
### list df to save for web appli : df, name

df_order_save = [
    "pest_functions",
    "pesticides",
    "stations",
    "MCT",
    "MA",
    "AV_dpt",
    "AV_ME"
]

df_to_web = {
    "pest_functions" : df_functions,
    "pesticides"     : df_pesticides,
    "stations"       : df_stations_web,
    "MCT"            : df_MCT_web,
    "MA"             : df_MA_web,
    "AV_dpt"         : df_AV_dpt_web,
    "AV_ME"          : df_AV_ME_web      
    }

#for k, df in df_to_web.iteritems() :
    #print k

In [277]:

csv_encoding_web = "utf-8"

def df_to_csv_web(df_, df_name):
    
    print ">>> df_to_csv_web / df_%s " %(df_name)
    outfilename = os.path.join( stats_web_path, df_name + _web + _csv )
    
    print "... outfilename : ", outfilename
    df_.to_csv(outfilename, sep=_sep_csv, encoding = csv_encoding_web )
    print ">>> df_to_csv_web finished for df_%s --> to %s " %(df_name, df_name+_web+_csv)
    print 


### save all pandas df_ to .csv for further uses
#for df_name, df in df_to_web.iteritems() :
for df_name in df_order_save :
    df_to_csv_web( df_to_web[df_name], df_name ) 


## test
#test_to_web_csv = "AV_ME"
#df_to_csv_web( df_to_web[test_to_web_csv], test_to_web_csv )


>>> df_to_csv_web / df_pest_functions 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/pest_functions_web.csv
>>> df_to_csv_web finished for df_pest_functions --> to pest_functions_web.csv 

>>> df_to_csv_web / df_pesticides 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/pesticides_web.csv
>>> df_to_csv_web finished for df_pesticides --> to pesticides_web.csv 

>>> df_to_csv_web / df_stations 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/stations_web.csv
>>> df_to_csv_web finished for df_stations --> to stations_web.csv 

>>> df_to_csv_web / df_MCT 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/MCT_web.csv
>>> df_to_csv_web finished for df_MCT --> to MCT_web.csv 

>>> df_to_csv_web / df_MA 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/MA_web.csv
>>> df_to_csv_web f

In [None]:

########################################################
########################################################
########################################################
### tests queries 
########################################################
########################################################
########################################################


In [None]:
'''main complete and clean DF :
    - df_pesticides
    - df_stations
    - df_MCT
    - df_MA
'''

df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )
df_sliced_02 = queryByIndexValue(df_stations, "NUM_DEP", ["44"] )
df_sliced_03 = queryByIndexValue(df_MCT, "ANNEE", [2009,2010] )
df_sliced_04 = queryByIndexValue(df_MA, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

df_sliced_05 = queryByColValue(df_MA, "MA_MOY", ">", "NORME_DCE")

print "-- listIndexUniqueValues : ", listIndexUniqueValues(df_sliced_02)
print
print "-- getIndexValuesList : ", getIndexValuesList(df_sliced_04, "CD_PARAMETRE") 
print
print "-- getColValuesList : ", getColValuesList(df_MA, "NORME_DCE") 
print 
#print "-- getColValuesList : ", getColValuesList(df_sliced_02, "NOM_COM")

In [None]:
df_sliced_01 

In [None]:
df_sliced_02.head(7)

In [None]:
df_sliced_03.head()

In [None]:
df_sliced_04

In [None]:
df_sliced_05.head()

In [None]:

########################################################
########################################################
########################################################
### -- MERGE DATAS ??? -- 
########################################################
########################################################


In [None]:
#df_stations_MCT_MA = pd.concat( [df_stations_MCT, df_MA] )
#df_stations_MCT_MA.head()


#print df_stations_MA_MCT.columns


# pivot tables
#df_mct_2008.T

In [None]:

########################################################
########################################################
########################################################
### -- analysis --
########################################################
########################################################

## selections : http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label
### TO DO 




In [None]:

########################################################
########################################################
########################################################
### -- exports --
########################################################
########################################################


### export functions

test_df = df_stations.head()


In [None]:
test_record = df_stations.loc[["44"], : ]
test_record.head()

In [None]:
df_stations.head(1)


In [None]:
### return json 


In [None]:
test_record_reset = test_record.reset_index()
test_record_reset.set_index("CD_STATION", inplace=True)
test_record_reset.head()

In [None]:
#json_stations = df_stations.head(2).to_json(orient="split")
json_stations = test_record_reset.to_json(orient="index") ### set unique index as first json key
#print json_stations

### pretty prints
parsed = json.loads(json_stations)
print json.dumps(parsed, indent=2, sort_keys=True)