In [1]:
'''
-----------------------------
STATIONS - PESTICIDES - STATS
-----------------------------

GOAL : notebook python functions to add at root (app initialization on run.py) 
create panda objects / implement query functions / export to JSON 
for data analysis and visualization

- READ .CSV AND .XLSX FILES (DATA) AND CONVERT IT TO PANDAS DATAFRAMES
- CHANGE COORD STATIONS TO WGS_84 (LAT/LONG)
- CLEAN AND MERGE DATA
- QUERY FUNCTIONS
- EXPORT FUNCTIONS (JSON)

AUTHOR : Julien Paris
DATE   : 01/01/2017

TO DO : 
- 
'''
print





In [2]:
### import standard libraries
import os
import itertools
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

# pyproj settings to convert coordinates
from pyproj import Proj, transform
inProj  = Proj(init='epsg:2154') # proj in  : Lambert 93
outProj = Proj(init='epsg:4326') # proj out : WSG 84

In [3]:
### basic folders addresses and names
cwd = os.getcwd()

data_folder      = "app/static/data"
stats_folder     = "stats"
_web             = "_web" 
stats_web_folder = stats_folder + _web

stats_path     = os.path.join(cwd, data_folder, stats_folder)
stats_web_path = os.path.join(cwd, data_folder, stats_web_folder)

print "-- cwd :", cwd
print "-- stats path : "    , stats_path
print "-- stats web path : ", stats_web_path

for file in os.listdir(stats_path):
    if file.endswith(".csv") or file.endswith(".xlsx") or file.endswith(".xls"):
        print "--- dataset in '/data' : ", file
        #print cwd+datas_folder+"/"+file

-- cwd : /Users/jpy/Dropbox/_FLASK/concours_pesticides
-- stats path :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats
-- stats web path :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web
--- dataset in '/data' :  hazardous_listing.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2010_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2011_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2012_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2013.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2013.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2013_copy.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2014.csv
--- dataset in '/data' :  ma_qp_fm_rcsrco_pesteso_2014.xlsx
--- dataset in '/data' :  ma_qp_fm_rcsrco_pes

In [4]:
# set encoding and variables for .csv (keep accents)

csv_encoding = "latin-1"

_csv     = ".csv"
_sep_csv = ";"
_xls     = ".xls"
_xlsx    = ".xlsx"
_copy    = "_copy"

### IF SET TO "False" REDO COPIES FROM ORGINAL XLSX
copies_done = True

def excel_to_csv_temp(df_from_excel, df_name):
    print "df_name : ", df_name
    outfilename = os.path.join( stats_path, df_name + _copy + _csv )
    print "outfilename : ", outfilename
    df_from_excel.to_csv(outfilename, sep=_sep_csv, encoding = csv_encoding )


In [5]:
### panda dataframes for every db + settings

# common code for all pesticcides
all_pesticides_code = "XXXXXX"


#np.array = time_frame

# root strings for datas names
root_mct = "df_mct_"
root_ma  = "df_ma_"

# list of datas filenames
data_stations   = {"files": "stations"  , "ext" : _xlsx, "path" : stats_path }
data_pesticides = {"files": "pesticides", "ext" : _xls , "path" : stats_path }


### good source in .xlsx
data_MCT = { 
    "ext" : _xlsx,
    "path": stats_path,
    "files": [
        "moy_tot_quantif_2007",
        "moy_tot_quantif_2008",
        "moy_tot_quantif_2009",  
        "moy_tot_quantif_2010",
        "moy_tot_quantif_2011",
        "moy_tot_quantif_2012",
        "moy_tot_quantif_2013",
        "moy_tot_quantif_2014",
    ]
}


### good source in .xlsx
data_MA = { 
    "ext" : _xlsx,
    "path": stats_path,
    "files": [
        "ma_qp_fm_ttres_pesteso_2007",
        "ma_qp_fm_ttres_pesteso_2008",
        "ma_qp_fm_ttres_pesteso_2009",
        "ma_qp_fm_rcsrco_pesteso_2010",
        "ma_qp_fm_rcsrco_pesteso_2011",
        "ma_qp_fm_rcsrco_pesteso_2012",
        "ma_qp_fm_rcsrco_pesteso_2013",
        "ma_qp_fm_rcsrco_pesteso_2014",
        ]
}



In [6]:
# set time frame
#years   = {"ANNEE" : [2007, 2008, 2009, 2010, 2011, 2012 ] }


In [7]:
### functions : cleaning operations on dataframes

idx = pd.IndexSlice

def stat_file_path(filename):
    path = os.path.join(stats_path, filename)
    return path 


def checkDTypes (df) :
    # check data type
    
    for index in df.index.names :
        print "---- index : ", index

    for col in df.columns :
        #label = col.values
        dtype = df[col].dtype
        
        print "---- dtypes col : ", col, "/", dtype
        

In [8]:
def comas2points(df, list_col_names="all_col"): 
    # convert all weird "," to "." and then to float values
    
    if list_col_names == "all_col" : 
        df.loc[:, :] = df.replace(to_replace=',', value='.', regex=True)
    else : 
        df.loc[:, list_col_names ] = df.loc[:,list_col_names].replace(to_replace=',', value='.', regex=True)
    return df


def ints2floats(df, list_col_names, to="float") :
    
    if to == "float":
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(float)
    elif to == "int" :
        df.loc[:, list_col_names] = df.loc[:, list_col_names].astype(int)        
    return df


In [9]:
def dfCleanNa(df_list): 
    # clean from NaN values if entire row is NaN
    
    df_list_clean = []
    for df in df_list :
        df_cleaned_01 = df.dropna(how="all") # on empty rows
        df_cleaned_02 = df_cleaned_01.dropna( axis=1, how="all") # on empty columns
        df_list_clean.append(df_cleaned_02)
    
    return df_list_clean


In [10]:

#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#
#   -- DATAS TO DATA FRAMES --                         #
#------------------------------------------------------#
#------------------------------------------------------#
#------------------------------------------------------#


In [11]:

########################################################
########################################################
########################################################
### -- DF_PESTICIDES --
########################################################
########################################################
########################################################


In [12]:
functions_cols= ["CODE_FONCTION","LIBELLE CODE_FONCTION"]

_missing = "no ref"

functions_split = {
    "A"   : "A",
    "B"   : "B",
    "BF"  : "B,F",
    "F"   : "F",
    "FA"  : "F,A",
    "FHM" : "F,H,M",
    "FN"  : "F,N",
    "H"   : "H",
    "I"   : "I",
    "IA"  : "I,A",
    "IAFH": "I,A,F,H",
    "IAM" : "I,A,M",
    "IAN" : "I,A,N",
    "IM"  : "I,M",
    "IN"  : "I,N",
    "Ireg": "I,Reg",
    "N"   : "N",
    "R"   : "Ro", #### twin with Ro
    "Reg" : "Reg",
    "RepO": "RepO",
    "Ro"  : "Ro", ####
    "HFNI": "H,F,N,I",
    "HG"  : "H,G",
    
    "PP"  : "PP",
    _missing : _missing
}

""" print df_pesticides.CODE_FONCTION.unique()
u'I' nan u'H' u'IM' u'IN' u'IA' u'F' u'Ireg' u'FN' u'Reg' u'N' u'IAFH'
 u'IAM' u'IAN' u'R' u'FA' u'RepO' u'Ro' u'FHM' u'BF' u'A' u'B' u'HG'
 u'HFNI' u'PP'
 """
functions_split_list = {
    "A"   : ["A"],
    "B"   : ["B"],
    "BF"  : ["B","F"],
    "F"   : ["F"],
    "FA"  : ["F","A"],
    "FHM" : ["F","H","M"],
    "FN"  : ["F","N"],
    "H"   : ["H"],
    "I"   : ["I"],
    "IA"  : ["I","A"],
    "IAFH": ["I","A","F","H"],
    "IAM" : ["I","A","M"],
    "IAN" : ["I","A","N"],
    "IM"  : ["I","M"],
    "IN"  : ["I","N"],
    "Ireg": ["I","Reg"],
    "N"   : ["N"],
    "R"   : ["Ro"], #### twin with Ro
    "Reg" : ["Reg"],
    "RepO": ["RepO"],
    "Ro"  : ["Ro"], ####
    "HFNI": ["H","F","N","I"],
    "HG"  : ["H","G"],
    
    "PP"  : ["PP"], 
    _missing : [_missing]
}

'''
functions_full = {
    "A"    : "Acaricide",
    "B"    : "Biocide",
    "BF"   : "Biocide, Fongicide",
    "F"    : "Fongicide",
    "FA"   : "Fongicide, Acaricide",
    "FHM"  : "Fongicide, Herbicide, Mollusticide",
    "FN"   : "Fongicide, Nématicide",
    "H"    : "Herbicide",
    "I"    : "Insecticide",
    "IA"   : "Insecticide, Acaricide",
    "IAFH" : "Insecticide, Acaricide, Fongicide, Herbicide",
    "IAM"  : "Insecticide, Acaricide, Mollusticide",
    "IAN"  : "Insecticide, Acaricide, Nématicide",
    "IM"   : "Insecticide, Mollusticide",
    "IN"   : "Insecticide, Nématicide",
    "Ireg" : "Insecticide, Régulateur de croissance",
    "N"    : "Nématicide",
    "R"    : "Rodenticide", ### twin with Ro
    "Reg"  : "Régulateur de croissance",
    "RepO" : "Répulsif",
    "Ro"   : "Rodenticide", ####
    "HFNI" : "Herbicide, Fongicide, Nématicide, Insecticide",
    "HG"   : "Herbicide, Graminicide"
}
'''

functions_list = [
    "A",
    "B",
    "F",
    "H",
    "I",
    "M",
    "N",
    #"R", ### twin with Ro
    "Reg",
    #"reg",
    "RepO",
    "Ro", ### twin with R
    "G",
    "PP", 
    _missing
]

### main reference dict
functions_light = {
    "A"   : "Acaricide",
    "B"   : "Biocide",
    "F"   : "Fongicide",
    "H"   : "Herbicide",
    "I"   : "Insecticide",
    "M"   : "Mollusticide",
    "N"   : "Nématicide",
    #"R"   : "Rodenticide", ### twin with Ro
    "Reg" : "Régulateur de croissance",
    #"reg" : "Régulateur de croissance",
    "RepO": "Répulsif",
    "Ro"  : "Rodenticide", ### twin with R
    "G"   : "Graminicide",
    "PP"  : "%s on 'PP'" %(_missing), #### unknown
    _missing : _missing
}

### 7441 / Furilazole  / PP
### 7513 / Fenchlorazole-ethyl / PP

### optional
df_functions = pd.Series(functions_light, name="LIBELLE_CODE_FONCTION")
df_functions.index.name = 'CODE_FONCTION'
df_functions.reset_index()
df_functions = df_functions.to_frame()

df_functions #["A"]


Unnamed: 0_level_0,LIBELLE_CODE_FONCTION
CODE_FONCTION,Unnamed: 1_level_1
A,Acaricide
B,Biocide
F,Fongicide
G,Graminicide
H,Herbicide
I,Insecticide
M,Mollusticide
N,Nématicide
PP,no ref on 'PP'
Reg,Régulateur de croissance


In [13]:
#functions_dict_index = dict(zip(df_functions['LIBELLE_CODE_FONCTION'], df_functions['LIBELLE_CODE_FONCTION']))
#functions_dict_index

In [14]:
pesticides_dang_csv_filepath = os.path.join( stats_path, "hazardous_listing" + _csv)
print pesticides_dang_csv_filepath

df_pesticides_dang = pd.read_csv( pesticides_dang_csv_filepath, sep=_sep_csv, encoding=csv_encoding )
#df_pesticides_dang.drop('Unnamed: 0', axis=1, inplace=True)

df_pesticides_dang.set_index("CAS", inplace=True, drop=False)
df_pesticides_dang.sort_index(inplace=True)

df_pesticides_dang

/Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats/hazardous_listing.csv


Unnamed: 0_level_0,CAS,Type
CAS,Unnamed: 1_level_1,Unnamed: 2_level_1
10004-44-1,10004-44-1,III
10071-13-3,10071-13-3,U
101-05-3,101-05-3,O
101-21-3,101-21-3,U
101-27-9,101-27-9,O
101-42-8,101-42-8,O
101007-06-1,101007-06-1,U
10112-91-1,10112-91-1,II
101205-02-1,101205-02-1,III
1014-69-3,1014-69-3,O


In [15]:
#df_pesticides_dang.loc["35256-85-0"]

In [16]:
# identify twin values in df_pesticides_dang

#list_CAS_unique = df_pesticides_dang["CAS"].unique
list_CAS        = df_pesticides_dang["CAS"].values
list_CAS_twins  = []
list_CAS_unique = []

for CAS in list(list_CAS):
    if CAS not in list_CAS_unique : 
        list_CAS_unique.append(CAS)
    else : 
        list_CAS_twins.append(CAS)

#print list_CAS_twins

In [17]:
### JUST DO IT ONCE
# copy original pesticides data

if copies_done == False :
    
    pesticides_original_data = os.path.join( data_pesticides["path"], data_pesticides["files"] + data_pesticides["ext"] )
    print pesticides_original_data

    df_pesticides_original_data = pd.read_excel( pesticides_original_data )

    excel_to_csv_temp( df_pesticides_original_data, data_pesticides["files"] )


In [18]:
# read pesticides list

pesticides_csv_filepath = os.path.join( stats_path, data_pesticides["files"] + _copy + _csv)
print pesticides_csv_filepath

df_pesticides = pd.read_csv( pesticides_csv_filepath, sep=_sep_csv, encoding=csv_encoding )
#df_pesticides = comas2points(df_pesticides, ["NORME_DCE"])
#df_pesticides = ints2floats (df_pesticides, ["NORME_DCE"])
df_pesticides.drop('Unnamed: 0', axis=1, inplace=True)

##### dates  : col "DATE_NA_USAGE"
df_pesticides["DATE_NA_USAGE"] = pd.to_datetime(df_pesticides["DATE_NA_USAGE"], infer_datetime_format=True, errors='coerce')

# fill NaN values / avoid bugs on .replace
df_pesticides['CODE_FONCTION'].fillna(_missing, inplace=True)
df_pesticides['CODE_CAS'].fillna(_missing, inplace=True)


/Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats/pesticides_copy.csv


In [19]:
#df_pesticides.head(100)

In [20]:

# replace CODE_FONCTION type "AI" --> "A,I" 
df_pesticides["CODE_FONCTION"].replace(functions_split, inplace=True)
df_pesticides["CODE_CAS"].replace( {u"Non renseigné": _missing} , inplace=True)


In [21]:
#df_pesticides.head(100)

In [22]:
#df_pesticides.iloc[0]

#print df_pesticides_dang.loc["76-44-8"]["Type"]

In [23]:
### add explanations FUNCTIONS
def add_danger(row):
    
    code_CAS = row["CODE_CAS"]
    
    try : 
        danger_level = df_pesticides_dang.loc[code_CAS]["Type"]
    except :
        danger_level = _missing
        
    return danger_level 

df_pesticides["Type"] = df_pesticides.apply(add_danger, axis=1)

In [24]:

### add explanations FUNCTIONS
def add_function(row):
    
    function_list_raw  = row["CODE_FONCTION"]
    #print function_list_raw, type(function_list_raw)
    
    if function_list_raw == _missing :
        functions_lb = _missing
        
    else :
        if "," in function_list_raw :
            function_list = [ functions_light[f] for f in row["CODE_FONCTION"].split(",") ]
            functions_lb = str.join(",", function_list)
        else : 
            function_list = functions_light[function_list_raw]
            functions_lb = str(function_list)

        
    """if  pd.isnull(function_list_raw) == False :
    #if  pd.isnull(row["CODE_FONCTION"]) == False :
        
        row["CODE_FONCTION"] = functions_split_list[row["CODE_FONCTION"]]
        
        function_list_lb = [ functions_light[f] for f in row["CODE_FONCTION"] ]
        #function_list = [ functions_light[f] for f in row["CODE_FONCTION"].split(",") ]
        #function_list = row["CODE_FONCTION_LIST"]
        #function_list = row["CODE_FONCTION"]
        
        if len(function_list_lb) > 1 :
            functions = str.join(", ", function_list_lb )
        else :
            functions = str(function_list_lb[0])
            
        '''if "," in function_list :
            #functions = str.join(", ", function_list )
            functions = str(function_list)
        else :
            functions = str(function_list)'''
    
    else :
        row["CODE_FONCTION"] = "missing"
        functions = np.NaN
    
    print "--3 %s" %(functions)"""
    
    #print row["CODE_FONCTION"], functions_lb, type(functions_lb)
    return functions_lb


df_pesticides["FONCTIONS"] = df_pesticides.apply( add_function, axis=1 )


In [25]:

# set index : CODE_PESTICIDE 
#df_pesticides.set_index(["CODE_FAMILLE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True, drop=False)
df_pesticides.set_index(["CODE_FAMILLE", "CD_PARAMETRE", "CODE_FONCTION"], inplace=True, drop=False)
#df_pesticides.set_index(["CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
df_pesticides.sort_index(inplace=True)


In [26]:
#print df_pesticides.loc[ (u"Amides", 1661) ]#["Type"]

In [27]:
#df_pesticides["CODE_FONCTION"].head(10)
df_pesticides.sample(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CD_PARAMETRE,LB_PARAMETRE,NOM_PARAM2,CODE_FAMILLE,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,Type,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,CODE_FONCTION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Urées,1897,I,1897,Téflubenzuron,Téflubenzuron,Urées,I,PNA,,,,83121-18-0,NaT,C14H6Cl2F4N2O2,0.1,U,Insecticide
Organochlorés,1200,I,1200,Hexachlorocyclohexane alpha,Hexachlorocyclohexane alpha,Organochlorés,I,PNA,,,,319-84-6,NaT,C6H6Cl6,0.1,no ref,Insecticide
Pyréthrinoïdes,1812,I,1812,Alpha-cyperméthrine/alphaméthrine,Alphaméthrine,Pyréthrinoïdes,I,PA,,,,67375-30-8,NaT,C22H19Cl2NO3,0.1,II,Insecticide
Diazines,1686,H,1686,Bromacil,Bromacil,Diazines,H,PNA,,,,314-40-9,2004-01-01,C9H13BrN2O2,0.1,U,Herbicide
Azoles,2860,Reg,2860,Imazaquine,,Azoles,Reg,PA,,,,81335-37-7,NaT,,0.1,U,Régulateur de croissance
Organophosphorés,6595,H,6595,Bensulide,,Organophosphorés,H,PNA,,,,741-58-2,NaT,,0.1,II,Herbicide
Organophosphorés,7149,no ref,7149,Phorate sulfone,Phorsul,Organophosphorés,no ref,PNA,,1525,Phorate,2588-04-7,NaT,C7H17O4PS3,0.1,no ref,no ref
Carbamates,5483,I,5483,Indoxacarbe,,Carbamates,I,PA,,,,173584-44-6,NaT,C22-H17-Cl-F3-N3-O7,0.1,no ref,Insecticide
Organochlorés,1197,I,1197,Heptachlore,Heptachlore,Organochlorés,I,PNA,,,,76-44-8,NaT,C10H5Cl7,0.03,O,Insecticide
Divers (organiques),5589,Reg,5589,Ancymidole,,Divers (organiques),Reg,PNA,,,,12771-68-5,NaT,,0.1,III,Régulateur de croissance


In [28]:
pest_famille_list = list(df_pesticides.index.levels[0])
print "-- len pest_famille_list", len(pest_famille_list)
print pest_famille_list
print

print " -- df_pesticides.index.names    : ", df_pesticides.index.names
print " -- df_pesticides.index.values   : ", df_pesticides.index.values
print " -- df_pesticides.columns.values : ", df_pesticides.columns.values
print " -- df_pesticides.columns        : ", df_pesticides.columns
print 

checkDTypes(df_pesticides)


-- len pest_famille_list 31
[u'Ald\xe9hydes et c\xe9tones', u'Amides', u'Amines', u'Autres \xe9l\xe9ments min\xe9raux', u'Azoles', u'Benz\xe8ne et d\xe9riv\xe9s', u'COHV, solvants chlor\xe9s, fr\xe9ons', u'Carbamate', u'Carbamates', u'Carbamates et thiocarbamates', u'Chloroacetamide ', u'Chloroac\xe9tamide', u'Chloroalcanes', u'Compos\xe9s ph\xe9noliques', u'Diazines', u'Divers (organiques)', u'Fongicides', u'Hydrocarbures et indices li\xe9s', u'Inconnu', u'Indices', u'Metaux et m\xe9tallo\xefdes', u'Organochlor\xe9s', u'Organom\xe9talliques', u'Organophosphor\xe9s', u'Pyridines', u'Pyr\xe9thrino\xefdes', u'Quinazolinones', u'Triazines et m\xe9tabolites', u'Triazoles', u'Triazolopyrimidines sulfonamides', u'Ur\xe9es']

 -- df_pesticides.index.names    :  [u'CODE_FAMILLE', u'CD_PARAMETRE', u'CODE_FONCTION']
 -- df_pesticides.index.values   :  [(nan, 6276, 'no ref') (nan, 6824, 'no ref') (nan, 6856, 'no ref') ...,
 (u'Ur\xe9es', 9055, 'no ref') (u'Ur\xe9es', 99011, 'H')
 (u'Ur\xe9es', 99

In [29]:
df_pesticides.shape

(1046, 15)

In [30]:
### test slicing
df_pesticides.loc[ idx[:,1130] , : ]["CODE_FONCTION"] #.head(3)


CODE_FAMILLE  CD_PARAMETRE  CODE_FONCTION
Carbamates    1130          I,N              I,N
Name: CODE_FONCTION, dtype: object

In [31]:
test_function = str(df_pesticides.loc[("Carbamates",1093 )]["CODE_FONCTION"])
print test_function

CODE_FONCTION
I,M    I,M
Name: CODE_FONCTION, dtype: object


In [32]:
df_pesticides.loc[ df_pesticides["CODE_FONCTION"] == "PP"] #.head(3) 
### 7441 / Furilazole  / PP
### 7513 / Fenchlorazole-ethyl / PP


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CD_PARAMETRE,LB_PARAMETRE,NOM_PARAM2,CODE_FAMILLE,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,Type,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,CODE_FONCTION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Inconnu,7441,PP,7441,Furilazole,Furilazole,Inconnu,PP,PA,,,,121776-33-8,NaT,C11H13Cl2NO3,0.1,no ref,no ref on 'PP'
Inconnu,7513,PP,7513,Fenchlorazole-ethyl,Fenchlorazole-ethyl,Inconnu,PP,,,,,103112-35-2,NaT,C12H8Cl5N3O2,0.1,U,no ref on 'PP'


In [33]:
### test slicing
df_pesticides.loc[ idx[:,1432:1474], :] #.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CD_PARAMETRE,LB_PARAMETRE,NOM_PARAM2,CODE_FAMILLE,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,Type,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,CODE_FONCTION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Carbamates,1463,"I,Reg",1463,Carbaryl,Carbaryl,Carbamates,"I,Reg",PNA,,,,63-25-2,2008-11-20,C12H11NO2,0.1,II,"Insecticide,Régulateur de croissance"
Carbamates,1474,Reg,1474,Chlorprophame,Chlorprophame,Carbamates,Reg,PA,,,,101-21-3,NaT,C10H12ClNO2,0.1,U,Régulateur de croissance
Divers (organiques),1432,F,1432,Pyriméthanil,Pyriméthanil,Divers (organiques),F,PA,,,,53112-28-0,NaT,C12H13N3,0.1,III,Fongicide
Divers (organiques),1473,F,1473,Chlorothalonil,Chlorothalonil,Divers (organiques),F,PA,,,,1897-45-6,NaT,C8Cl4N2,0.1,U,Fongicide
Organochlorés,1472,"F,N",1472,Chloropicrine,Chloropicrine,Organochlorés,"F,N",PNA,,,,76-06-2,NaT,CCl3NO2,0.1,FM,"Fongicide,Nématicide"
Organophosphorés,1464,I,1464,Chlorfenvinphos,Chlorfenvinphos,Organophosphorés,I,PNA,,,,470-90-6,2007-12-31,C12H14Cl3O4P,0.1,Ib,Insecticide


In [34]:
#df_pesticides.info()

In [126]:
df_pesticides.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CD_PARAMETRE,LB_PARAMETRE,NOM_PARAM2,CODE_FAMILLE,CODE_FONCTION,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,Type,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,CODE_FONCTION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Divers (organiques),5653,I,5653,Methoprene,,Divers (organiques),I,PNA,,,,40596-69-8,2003-12-31,,0.1,U,Insecticide
Carbamates,1281,H,1281,Triallate,Triallate,Carbamates,H,PA,,,,2303-17-5,NaT,C10H16Cl3NOS,0.1,III,Herbicide
Organophosphorés,1217,"I,A",1217,Méthidathion,Méthidathion,Organophosphorés,"I,A",PNA,,,,950-37-8,2007-12-31,C6H11N2O4PS3,0.1,Ib,"Insecticide,Acaricide"
Triazines et métabolites,6102,no ref,6102,Trietazine 2-hydroxy,,Triazines et métabolites,no ref,,probable,5842.0,Trietazine,15352-25-7,NaT,,0.1,no ref,no ref
Triazines et métabolites,5744,H,5744,Dipropetryn,,Triazines et métabolites,H,,,,,4147-51-7,NaT,,0.1,O,Herbicide
Amines,3334,Ro,3334,Crimidine,,Amines,Ro,PA,,,,535-89-7,NaT,C7-H10-Cl-N3,0.1,O,Rodenticide
Urées,2016,H,2016,Chlorbromuron,Chlorbromuron,Urées,H,PNA,,,,13360-45-7,NaT,C9H10BrClN2O2,0.1,O,Herbicide
Organochlorés,1203,I,1203,Hexachlorocyclohexane gamma,Lindane,Organochlorés,I,PNA,,,,58-89-9,1998-01-07,C6H6Cl6,0.1,II,Insecticide
Divers (organiques),5741,"I,A",5741,Dinoseb methyl ether,,Divers (organiques),"I,A",,,,,6099-79-2,NaT,,0.1,no ref,"Insecticide,Acaricide"
Divers (organiques),1213,H,1213,"2,4-MCPB","2,4-MCPB",Divers (organiques),H,PA,,,,94-81-5,NaT,C11H13ClO3,0.1,II,Herbicide


In [35]:
#df_pesticides.memory_usage()

In [36]:

########################################################
########################################################
########################################################
### -- DF_STATIONS -- 
########################################################
########################################################
########################################################


In [37]:
### JUST DO IT ONCE
# copy original stations data

if copies_done == False :

    stations_original_data = os.path.join( data_stations["path"], data_stations["files"] + data_stations["ext"] )
    print stations_original_data

    df_stations_original_data = pd.read_excel( stations_original_data )

    excel_to_csv_temp( df_stations_original_data, data_stations["files"] )


In [38]:
### read stations.csv

#lab_stations = "INFOS"
#df_stations  = pd.read_csv( stat_file_path(datas_stations), sep=";", encoding=csv_encoding , na_values=[""] )

stations_csv_filepath = os.path.join( stats_path, data_stations["files"] + _copy + _csv)
print stations_csv_filepath

df_stations = pd.read_csv( stations_csv_filepath, sep=_sep_csv, encoding=csv_encoding, na_values=[""] )


### OPTIMIZATION df_AV_MA / df_AV_MCT
### add numerical index for CODE_STATION#df_stations.drop('Unnamed: 0', axis=1, inplace=True)
df_stations.rename(index=str, columns={"Unnamed: 0": "INDEX_STATION"}, inplace=True)


'''
IMPORTANT : 
name column to link to carto (.shp file) : 
"CD_ME_v2" | "CD_ME_niv1_surf"

for instance : 
"DG330" in column "CD_ME_v2" | "CD_ME_niv1_surf" in df_stations
... corresponds to :
"DG330" in column "CdMasseDEa" in gdf object (geopandas from .shp file)

''' 

# add columns CD_PARAMETRE, LB_PARAMETRE
#df_stations["CD_PARAMETRE"] = 99999
#df_stations["LB_PARAMETRE"] = "all pesticides"

# get columns labels
#col_labels_stations = list(df_stations.columns.values)
#print " -- col_labels :", df_stations[0:5]

# add multilevel hierarchy on columns
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations, "NO_DATE"])
#df_stations.columns = pd.MultiIndex.from_product([lab_stations, col_labels_stations])

#to_float = ["ALTITUDE", "PROFONDEUR_MAXI_POINT", "X_FICT_L93", "Y_FICT_L93"]
#df_stations = comas2points(df_stations, to_float)
#df_stations = ints2floats (df_stations, to_float)

#print "-- indices names :", df_stations.index.name

#print df_stations["Unnamed: 26"].unique()
#df_stations.drop('Unnamed: 26', axis=1, inplace=True)

#print df_stations.columns.values

df_stations.head(3)


/Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats/stations_copy.csv


Unnamed: 0,INDEX_STATION,CD_STATION,NUM_COM,NOM_COM,NUM_DEP,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,...,reseau2013,reseau2014,fi_ma_2007,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014
0,0,06521X0019/SCE,1125,CORVEISSIAT,1,AERM&C,459.0,,01125_ _FRDG140,888869.860702,...,,,oui,oui,oui,,,,,
1,1,07015X0009/F,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.0,15.2,01133_FRDG511_FRDG330,916062.939502,...,,,oui,,,,,,,
2,2,07015X0010/P,1133,CRESSIN-ROCHEFORT,1,AERM&C,229.8,16.0,01133_FRDG511_FRDG330,915390.033302,...,,,,oui,,,,,,


In [39]:
stations_dict_index = dict(zip(df_stations['CD_STATION'], df_stations['INDEX_STATION']))

In [40]:
stations_dict_index["07015X0009/F"]

1

In [41]:
### FOR CARTO : add column for long lat in WSG84

def convertCoordinates(row):
    x1,y1 = row["X_FICT_L93"], row["Y_FICT_L93"]
    convertedCoord = transform(inProj,outProj, x1, y1)
    return list(convertedCoord)

def extractFromList(index):
    value = row[colName][index]
    print value
    return value

df_stations["COORD_WSG84"] = df_stations.apply(convertCoordinates,axis=1)
#df_stations["LAT_WSG84"]   = df_stations.apply(lambda row: extractFromList(row['COORD_WSG84'], 0), axis=1)
#df_stations["LONG_WSG84"]  = df_stations.apply(extractFromList(index=1),axis=1)

## cf : http://chrisalbon.com/python/pandas_expand_cells_containing_lists.html
# expand df.tags into its own dataframe
coord = df_stations['COORD_WSG84'].apply(pd.Series)
# rename each variable with its tag
#coord = coord.rename(columns = lambda x : 'COORD_' + str(x))
coord.columns = ["LAT_WSG84","LONG_WSG84"] 
#print coord.head()

print coord.head()
print 

# copy CD_STATION column for further uses
df_stations["CD_STATION_"] = df_stations["CD_STATION"]

# join the coord dataframe back to the original dataframe
df_stations = pd.concat( [df_stations, coord], axis=1, join="outer" )
#df_stations.head(3)


   LAT_WSG84  LONG_WSG84
0   5.452862   46.270740
1   5.781881   45.793046
2   5.772809   45.785001
3   5.788505   45.844201
4   5.074473   45.836095



In [42]:
print list(df_stations.columns)

['INDEX_STATION', u'CD_STATION', u'NUM_COM', u'NOM_COM', u'NUM_DEP', u'codagence', u'ALTITUDE', u'PROFONDEUR_MAXI_POINT', u'Unit\xe9_coord_fictifs', u'X_FICT_L93', u'Y_FICT_L93', u'CD_ME_v2', u'CD_ME_niv1_surf', u'reseau2009', u'reseau2010', u'reseau2011', u'reseau2012', u'reseau2013', u'reseau2014', u'fi_ma_2007', u'fi_ma_2008', u'fi_ma_2009', u'fi_ma_2010', u'fi_ma_2011', u'fi_ma_2012', u'fi_ma_2013', u'fi_ma_2014', 'COORD_WSG84', 'CD_STATION_', 'LAT_WSG84', 'LONG_WSG84']


In [43]:
# set indexes for stations
#df_stations.set_index( ["CD_STATION"], inplace=True) 
df_stations.set_index(["NUM_DEP", "NOM_COM",  "CD_ME_niv1_surf", "CD_ME_v2", "CD_STATION"], inplace=True) 
df_stations.sort_index(inplace=True) 

print "-- df_stations.shape : ", df_stations.shape
checkDTypes(df_stations)


-- df_stations.shape :  (13039, 26)
---- index :  NUM_DEP
---- index :  NOM_COM
---- index :  CD_ME_niv1_surf
---- index :  CD_ME_v2
---- index :  CD_STATION
---- dtypes col :  INDEX_STATION / int64
---- dtypes col :  NUM_COM / object
---- dtypes col :  codagence / object
---- dtypes col :  ALTITUDE / float64
---- dtypes col :  PROFONDEUR_MAXI_POINT / object
---- dtypes col :  Unité_coord_fictifs / object
---- dtypes col :  X_FICT_L93 / float64
---- dtypes col :  Y_FICT_L93 / float64
---- dtypes col :  reseau2009 / object
---- dtypes col :  reseau2010 / object
---- dtypes col :  reseau2011 / object
---- dtypes col :  reseau2012 / object
---- dtypes col :  reseau2013 / object
---- dtypes col :  reseau2014 / object
---- dtypes col :  fi_ma_2007 / object
---- dtypes col :  fi_ma_2008 / object
---- dtypes col :  fi_ma_2009 / object
---- dtypes col :  fi_ma_2010 / object
---- dtypes col :  fi_ma_2011 / object
---- dtypes col :  fi_ma_2012 / object
---- dtypes col :  fi_ma_2013 / object
----

In [44]:
df_stations.shape

(13039, 26)

In [45]:
#df_stations.info()


In [46]:
### list of Masses d'Eau : 
### "CD_ME_niv1_surf" | "CD_ME_v2" in stats == "CdMasseDEa" in .shp

MEs_niv1_list = list(df_stations.index.get_level_values("CD_ME_niv1_surf").unique() )
MEs_niv1_list.sort()

MEs_niv2_list = list(df_stations.index.get_level_values("CD_ME_v2").unique() )
MEs_niv2_list.sort()


In [47]:
print len(MEs_niv1_list)
print MEs_niv1_list[:10]

550
[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [48]:
print len(MEs_niv2_list)
print MEs_niv2_list[:10]

566
[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [49]:
MEs_all_list = list(set(MEs_niv1_list + MEs_niv2_list))
MEs_all_list.sort()
print MEs_all_list[:10]


[nan, u'AG001', u'AG002', u'AG003', u'AG004', u'AG005', u'AG006', u'AG007', u'AG008', u'AG009']


In [50]:
#for ME in MEs_all_list[1:] : 
#    if ME.startswith("GG"):
#        print ME

In [51]:
df_stations.head(5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,INDEX_STATION,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,31,,AEAG,10.0,33.0,,0.0,0.0,Hors RCS et RCO,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856
,,,CG004,01688X0034/AVAL,33,,AERM,235.0,,,0.0,0.0,Hors RCS et RCO,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0034/AVAL,-1.363081,-5.983856
,,,CG004,01688X0039/F1,32,,AERM,170.0,80.0,,0.0,0.0,Hors RCS et RCO,,...,oui,,,,,,"[-1.36308121012, -5.98385630921]",01688X0039/F1,-1.363081,-5.983856
1.0,AMBERIEU-EN-BUGEY,DG149,DG149,06758X0052/HY,34,1004.0,AERM&C,310.0,,01004_ _FRDG149,883079.012902,6544021.0,Hors RCS et RCO,horsRCSRCODRIRE,...,,,,,,,"[5.36469511404, 45.9712376427]",06758X0052/HY,5.364695,45.971238
1.0,AMBLEON,DG149,DG149,07007X0001/006A,35,1006.0,AERM&C,420.0,,01006_ _FRDG149,900470.478202,6520388.0,Hors RCS et RCO,,...,,,,,,,"[5.57920725651, 45.7536265801]",07007X0001/006A,5.579207,45.753627


In [52]:
idx = pd.IndexSlice

cd_station_test1 = "06784X0024"
cd_station_test2 = "10261X0039/F3"

df_stations.loc[ idx [ :,:,:,:, cd_station_test2 ] , : ]#["INDEX_STATION"].values

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,INDEX_STATION,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,31,,AEAG,10.0,33,,0.0,0.0,Hors RCS et RCO,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856


In [53]:

########################################################
########################################################
########################################################
### -- DF_MCT (moy concentrations totales)  --
########################################################
########################################################
########################################################


In [54]:
### JUST DO IT ONCE
# copy original MCT datato CSV

if copies_done == False :

    for MCT_file in data_MCT["files"] : 

        MCT_original_data = os.path.join( data_MCT["path"], MCT_file + data_MCT["ext"] )
        print MCT_original_data

        df_MCT_original_data = pd.read_excel( MCT_original_data )

        excel_to_csv_temp( df_MCT_original_data, MCT_file)
    


In [55]:
#lab_MCT = "MCT"

## read datas MCT

df_mct_2007 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][0]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2008 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][1]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2009 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][2]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2010 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][3]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2011 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][4]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2012 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][5]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2013 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][6]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_mct_2014 = pd.read_csv( os.path.join(stats_path, data_MCT["files"][7]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])

df_mct_2007.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2007 = df_mct_2007.dropna( axis=0, how="all") # on empty rows

df_mct_2008.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2008 = df_mct_2008.dropna( axis=0, how="all") # on empty rows

df_mct_2009.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2009 = df_mct_2009.dropna( axis=0, how="all") # on empty rows

df_mct_2010.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2010 = df_mct_2010.dropna( axis=0, how="all") # on empty rows

df_mct_2011.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2011 = df_mct_2011.dropna( axis=0, how="all") # on empty rows

df_mct_2012.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2012 = df_mct_2012.dropna( axis=0, how="all") # on empty rows

df_mct_2013.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2013 = df_mct_2013.dropna( axis=0, how="all") # on empty rows

df_mct_2014.drop('Unnamed: 0', axis=1, inplace=True)
df_mct_2014 = df_mct_2014.dropna( axis=0, how="all") # on empty rows


In [56]:
### add index station to df_MCT / df_MA 

idx = pd.IndexSlice

def add_indexStation(row) :
    
    cd_station    = row["CD_STATION"]
    
    try :
        #index_station = df_stations.loc[ idx[ :,:,:,:, cd_station ], : ].iloc[0]["INDEX_STATION"]
        index_station = stations_dict_index[cd_station]
    except : 
        print row["ANNEE"], cd_station, "--> ", _missing
        index_station = _missing
    
    return index_station


In [57]:
df_mct_2007["INDEX_STATION"] = df_mct_2007.apply(add_indexStation, axis=1)


2007 06784X0024 -->  no ref
2007 07735X0033 -->  no ref
2007 09978X0023 -->  no ref
2007 10207X0191 -->  no ref
2007 07735X0033 -->  no ref
2007 09978X0023 -->  no ref
2007 10207X0191 -->  no ref


In [58]:
df_mct_2007.sample(4)


Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION
869,2007,03784X0016/F,2,0.08,0.1,103,108,3,3,9735
221,2007,00887X1007/SAEP1,6,0.003333,0.02,103,108,0,1,901
1107,2007,05347X0017/P6,2,0.065,0.12,271,273,1,2,12062
494,2007,01936X0019/HY,6,0.066667,0.13,103,108,0,3,6991


In [59]:
#df_mct_2007.shape
print " -- df_mct_2007.index.names : ", df_mct_2007.index.names
print " -- df_mct_2007.columns     : ", df_mct_2007.columns

df_mct_2007.head()

 -- df_mct_2007.index.names :  [None]
 -- df_mct_2007.columns     :  Index([        u'ANNEE',    u'CD_STATION',        u'NBPREL',       u'MOYPTOT',
             u'MAXPTOT',    u'MINMOLRECH',    u'MAXMOLRECH',       u'MINMOLQ',
             u'MAQMOLQ', u'INDEX_STATION'],
      dtype='object')


Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION
0,2007,00054X0169/F1,4,0.0,0.0,18,96,0,0,8644
1,2007,00057X0245/F1,2,0.0,0.0,18,96,0,0,8527
2,2007,00057X0248/F4,2,0.02,0.04,61,96,0,1,8682
3,2007,00061X0118/F8,4,0.0125,0.02,18,96,0,1,8640
4,2007,00066X0042/SO,2,0.28,0.35,19,19,2,2,8418


In [60]:
df_mct_2008["INDEX_STATION"] = df_mct_2008.apply(add_indexStation, axis=1)

In [61]:
df_mct_2009["INDEX_STATION"] = df_mct_2009.apply(add_indexStation, axis=1)

In [62]:
df_mct_2010["INDEX_STATION"] = df_mct_2010.apply(add_indexStation, axis=1)

In [63]:
df_mct_2010.head()

Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION
5512,2010.0,00057X0245/F1,1.0,0.0,0.0,63.0,63.0,0.0,0.0,8527
5513,2010.0,00057X0248/F4,1.0,0.0,0.0,63.0,63.0,0.0,0.0,8682
5514,2010.0,00061X0118/F8,2.0,0.045,0.06,63.0,63.0,2.0,2.0,8640
5515,2010.0,00066X0042/SO,2.0,0.295,0.3,31.0,31.0,2.0,2.0,8418
5516,2010.0,00071X0015/F,2.0,0.0,0.0,31.0,31.0,0.0,0.0,8621


In [64]:
df_mct_2011["INDEX_STATION"] = df_mct_2011.apply(add_indexStation, axis=1)

In [65]:
df_mct_2012["INDEX_STATION"] = df_mct_2012.apply(add_indexStation, axis=1)

In [66]:
df_mct_2013["INDEX_STATION"] = df_mct_2013.apply(add_indexStation, axis=1)

In [67]:
df_mct_2013.head()

Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION
11144,2013.0,00054X0169/F1,2.0,0.0,0.0,93.0,226.0,0.0,0.0,8644
11145,2013.0,00057X0245/F1,2.0,0.0,0.0,95.0,97.0,0.0,0.0,8527
11146,2013.0,00057X0248/F4,2.0,0.0,0.0,93.0,226.0,0.0,0.0,8682
11147,2013.0,00061X0118/F8,3.0,0.027,0.034,95.0,392.0,1.0,3.0,8640
11148,2013.0,00066X0042/SO,2.0,0.6275,0.91,95.0,228.0,2.0,2.0,8418


In [68]:
df_mct_2014["INDEX_STATION"] = df_mct_2014.apply(add_indexStation, axis=1)

In [69]:
df_mct_2014.head()

Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION
13302,2014.0,00054X0169/F1,2.0,0.0,0.0,285.0,287.0,0.0,0.0,8644
13303,2014.0,00057X0245/F1,3.0,0.0,0.0,284.0,395.0,0.0,0.0,8527
13304,2014.0,00057X0248/F4,3.0,0.020333,0.031,284.0,395.0,0.0,2.0,8682
13305,2014.0,00061X0118/F8,3.0,0.017667,0.023,288.0,395.0,2.0,3.0,8640
13306,2014.0,00066X0042/SO,2.0,0.6155,0.849,286.0,288.0,3.0,6.0,8418


In [70]:
#checkDTypes(df_mct_2007)

In [71]:
#checkDTypes(df_mct_2008)

In [72]:
#checkDTypes(df_mct_2009)

In [73]:
#checkDTypes(df_mct_2010)

In [74]:
#checkDTypes(df_mct_2011)

In [75]:
#checkDTypes(df_mct_2012)

In [76]:
test_dict = {    "CODE_FAMILLE"  : { "count" : "df_A" }, 
                 "CODE_FONCTION" : { "count" : "df_B" }, 
                 "Type"          : { "count" : "df_C" }
            }

for k, v in test_dict.iteritems() :
    print v["count"]

df_A
df_C
df_B


In [77]:
#df_mct_2008.head() 

#df_ = df_mct_2010.dropna(how="all")
#df_.loc[:, ("ANNEE")] = df_.loc[:, ("ANNEE")].astype(int)
#df_.head() 

In [78]:
### merge all MCT datas with multiIndex
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-multiple-dataframe-or-panel-objects
# cf : http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-with-two-multi-indexes
# options/alternatives : .merge .join .concat .append

frames_mct = [df_mct_2007,df_mct_2008, df_mct_2009, df_mct_2010, df_mct_2011, df_mct_2012, df_mct_2013, df_mct_2014]

# clean from NaN values if entire row is NaN
frames_mct_cleaned = dfCleanNa(frames_mct)
    
df_MCT = pd.concat(frames_mct_cleaned)

# convert all year column data to integers
df_MCT.loc[:, "ANNEE"] = df_MCT.loc[:, "ANNEE"].astype(int) 

'''
# convert all year column data to integers
df_MCT = ints2floats(df_MCT, ["ANNEE"], to="int")

# convert all weird "," to "." and then to float values
df_MCT   = comas2points(df_MCT)
to_float = ['NBPREL', 'MOYPTOT', 'MAXPTOT', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ']
df_MCT   = ints2floats(df_MCT, to_float)

'''

# add column CD_PARAMETRE, LB_PARAMETRE
df_MCT["CD_PARAMETRE"] = all_pesticides_code
df_MCT["LB_PARAMETRE"] = "all_pesticides"



In [79]:
df_MCT.sample(10)

Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,CD_PARAMETRE,LB_PARAMETRE
11482,2013,01204X0045/FE1,4.0,0.100625,0.1275,274.0,276.0,2.0,4.0,1750,XXXXXX,all_pesticides
8477,2011,05282X0097/P1,4.0,0.02,0.05,409.0,409.0,0.0,1.0,5235,XXXXXX,all_pesticides
10915,2012,09615X0033/HY,1.0,0.0,0.0,23.0,23.0,0.0,0.0,1459,XXXXXX,all_pesticides
14462,2014,05264X0003/PUITS,1.0,0.296,0.296,281.0,281.0,6.0,6.0,2500,XXXXXX,all_pesticides
480,2009,02025X0004/HY,2.0,0.0,0.0,272.0,272.0,0.0,0.0,3414,XXXXXX,all_pesticides
9445,2012,01001B0153/HY,9.0,0.064333,0.112,277.0,277.0,2.0,5.0,10934,XXXXXX,all_pesticides
508,2007,01996X0023/F,2.0,0.07,0.08,103.0,108.0,1.0,2.0,9495,XXXXXX,all_pesticides
1362,2009,07235X0011/F,5.0,0.028,0.09,56.0,383.0,0.0,2.0,5052,XXXXXX,all_pesticides
11758,2013,02302X0097/HY,5.0,0.2286,0.34,380.0,400.0,2.0,7.0,6887,XXXXXX,all_pesticides
13053,2013,09771X0314/F,2.0,0.09,0.1,44.0,44.0,1.0,1.0,5409,XXXXXX,all_pesticides


In [80]:
df_MCT.tail(10)

Unnamed: 0,ANNEE,CD_STATION,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,CD_PARAMETRE,LB_PARAMETRE
15337,2014,10971X0202/F3,1.0,0.016,0.016,151.0,151.0,2.0,2.0,9413,XXXXXX,all_pesticides
15338,2014,10972X0084/111111,11.0,0.108091,0.25,1.0,405.0,0.0,2.0,9382,XXXXXX,all_pesticides
15339,2014,10972X0130/BARNIO,4.0,0.255,1.01,404.0,405.0,0.0,2.0,9358,XXXXXX,all_pesticides
15340,2014,10972X0151/F6,4.0,0.0,0.0,404.0,405.0,0.0,0.0,9429,XXXXXX,all_pesticides
15341,2014,10972X0181/ARMENG,4.0,0.04,0.13,404.0,405.0,1.0,2.0,9366,XXXXXX,all_pesticides
15342,2014,10972X0184/F3BIS,1.0,0.0,0.0,151.0,151.0,0.0,0.0,9427,XXXXXX,all_pesticides
15343,2014,10972X0203/CALMET,4.0,0.0,0.0,404.0,405.0,0.0,0.0,9363,XXXXXX,all_pesticides
15344,2014,10982X0003/SEGRE,4.0,0.0025,0.01,404.0,405.0,0.0,1.0,9341,XXXXXX,all_pesticides
15345,2014,11013X0002/F,4.0,0.06725,0.111,151.0,408.0,2.0,4.0,9334,XXXXXX,all_pesticides
15346,2014,11195X0147/FITTEL,1.0,0.0,0.0,23.0,23.0,0.0,0.0,3601,XXXXXX,all_pesticides


In [81]:

# set index hierarchy
#df_MCT.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MCT.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

print " -- df_MCT.index.names    : ", df_MCT.index.names
print " -- df_MCT.index.values   : ", df_MCT.index.values
print " -- df_MCT.columns.values : ", df_MCT.columns.values
print " -- df_MCT.columns        : ", df_MCT.columns

df_MCT.sort_index(inplace=True) 


 -- df_MCT.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MCT.index.values   :  [(u'00054X0169/F1', 2007, 'XXXXXX', 'all_pesticides')
 (u'00057X0245/F1', 2007, 'XXXXXX', 'all_pesticides')
 (u'00057X0248/F4', 2007, 'XXXXXX', 'all_pesticides') ...,
 (u'10982X0003/SEGRE', 2014, 'XXXXXX', 'all_pesticides')
 (u'11013X0002/F', 2014, 'XXXXXX', 'all_pesticides')
 (u'11195X0147/FITTEL', 2014, 'XXXXXX', 'all_pesticides')]
 -- df_MCT.columns.values :  [u'NBPREL' u'MOYPTOT' u'MAXPTOT' u'MINMOLRECH' u'MAXMOLRECH' u'MINMOLQ'
 u'MAQMOLQ' 'INDEX_STATION']
 -- df_MCT.columns        :  Index([       u'NBPREL',       u'MOYPTOT',       u'MAXPTOT',    u'MINMOLRECH',
          u'MAXMOLRECH',       u'MINMOLQ',       u'MAQMOLQ', u'INDEX_STATION'],
      dtype='object')


In [82]:
print df_MCT.shape 


(15347, 8)


In [83]:
df_MCT["MOYPTOT_YEAR"] = np.NaN

checkDTypes(df_MCT)


---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBPREL / float64
---- dtypes col :  MOYPTOT / float64
---- dtypes col :  MAXPTOT / float64
---- dtypes col :  MINMOLRECH / float64
---- dtypes col :  MAXMOLRECH / float64
---- dtypes col :  MINMOLQ / float64
---- dtypes col :  MAQMOLQ / float64
---- dtypes col :  INDEX_STATION / object
---- dtypes col :  MOYPTOT_YEAR / float64


In [84]:
df_MCT.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11221X0134/TRAVO,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3633,
11233X0118/PUGNAC,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,3570,
11233X0118/PUGNAC,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3570,
11233X0118/PUGNAC,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3570,
11234X0127/BARA,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,3575,
11234X0127/BARA,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3575,
11234X0127/BARA,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3575,
11282X0005/ARAGUI,2007,XXXXXX,all_pesticides,2.0,0.0,0.0,1.0,376.0,0.0,0.0,3539,
11282X0005/ARAGUI,2008,XXXXXX,all_pesticides,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3539,
11282X0005/ARAGUI,2013,XXXXXX,all_pesticides,2.0,0.07,0.1,408.0,408.0,1.0,1.0,3539,


In [85]:

########################################################
########################################################
########################################################
### -- DF_MA (moy analyses)  --
########################################################
########################################################
########################################################


In [86]:
### JUST DO IT ONCE !! GREEDY FOR MA DATA --> approx 30 min
# copy original MA data to CSV

if copies_done == False :

    start_time = datetime.now()
    print "-- start_time for making csv copies of MA data : ", start_time

    for MA_file in data_MA["files"] : 

        MA_original_data = os.path.join( data_MA["path"], MA_file + data_MA["ext"] )
        print MA_original_data

        df_MA_original_data = pd.read_excel( MA_original_data )

        excel_to_csv_temp( df_MA_original_data, MA_file)

    print "-- FINISH / time spent for making csv copies of MA data : ", datetime.now() - start_time


In [87]:
#lab_MA = "MA"

### DEPRECATED

def multilevel_MA (df, year):

    # set indexes : STATION and CODE_PESTICIDE
    #df.set_index(["CD_STATION", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)
    #df.set_index(["CD_STATION"], inplace=True)

    # get columns labels for df_ma
    #col_labels_df_ma = list(df.columns.values)
    #print " -- col_labels :", col_labels_df_ma

    # add multilevel hierarchy on columns
    #df.columns = pd.MultiIndex.from_product([lab_MA, col_labels_df_ma, year])
    
    # convert all weird "," to "." and then to float values
    #to_float = ["MA_MOY", "NORME_DCE"]
    df       = comas2points(df, to_float)
    df       = ints2floats (df, to_float)

    df["ANNEE"] = year
    
    return df


In [88]:
### WARNING : GREEDY IF READ .XLSX --> READ .CSV COPIES

## read datas MCT

df_ma_2007 = pd.read_csv( os.path.join(stats_path, data_MA["files"][0]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2008 = pd.read_csv( os.path.join(stats_path, data_MA["files"][1]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2009 = pd.read_csv( os.path.join(stats_path, data_MA["files"][2]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2010 = pd.read_csv( os.path.join(stats_path, data_MA["files"][3]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2011 = pd.read_csv( os.path.join(stats_path, data_MA["files"][4]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2012 = pd.read_csv( os.path.join(stats_path, data_MA["files"][5]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2013 = pd.read_csv( os.path.join(stats_path, data_MA["files"][6]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])
df_ma_2014 = pd.read_csv( os.path.join(stats_path, data_MA["files"][7]+_copy+_csv ), sep=_sep_csv, encoding = csv_encoding) #,  index_col=[1,0])

print "-- finish reading CSV"

'''
df_ma_2007 = pd.read_excel( stat_file_path(datas_MA_excel[0]) )
#df_ma_2007 = pd.read_csv( stat_file_path(datas_MA_csv[0]), sep=";", encoding = csv_encoding )
    
df_ma_2008 = pd.read_excel( stat_file_path(datas_MA_excel[1]) )
#df_ma_2008 = pd.read_csv( stat_file_path(datas_MA_csv[1]), sep=";", encoding = csv_encoding)

df_ma_2009 = pd.read_excel( stat_file_path(datas_MA_excel[2]) )
#df_ma_2009 = pd.read_csv( stat_file_path(datas_MA_csv[2]), sep=";", encoding = csv_encoding)

df_ma_2010 = pd.read_excel( stat_file_path(datas_MA_excel[3]) )
#df_ma_2010 = pd.read_csv( stat_file_path(datas_MA_csv[3]), sep=";", encoding = csv_encoding)

df_ma_2011 = pd.read_excel( stat_file_path(datas_MA_excel[4]) )
#df_ma_2011 = pd.read_csv( stat_file_path(datas_MA_csv[4]), sep=";", encoding = csv_encoding)

df_ma_2012 = pd.read_excel( stat_file_path(datas_MA_excel[5]) )
#df_ma_2012 = pd.read_csv( stat_file_path(datas_MA_csv[5]), sep=";", encoding = csv_encoding)

'''

df_ma_2007.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2007 = df_ma_2007.dropna( axis=0, how="all") # on empty rows

df_ma_2008.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2008 = df_ma_2008.dropna( axis=0, how="all") # on empty rows

df_ma_2009.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2009 = df_ma_2009.dropna( axis=0, how="all") # on empty rows

df_ma_2010.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2010 = df_ma_2010.dropna( axis=0, how="all") # on empty rows

df_ma_2011.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2011 = df_ma_2011.dropna( axis=0, how="all") # on empty rows

df_ma_2012.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2012 = df_ma_2012.dropna( axis=0, how="all") # on empty rows

df_ma_2013.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2013 = df_ma_2013.dropna( axis=0, how="all") # on empty rows

df_ma_2014.drop('Unnamed: 0', axis=1, inplace=True)
df_ma_2014 = df_ma_2014.dropna( axis=0, how="all") # on empty rows

print "-- finish dropping"

df_ma_2007["ANNEE"] = 2007
df_ma_2008["ANNEE"] = 2008
df_ma_2009["ANNEE"] = 2009
df_ma_2010["ANNEE"] = 2010
df_ma_2011["ANNEE"] = 2011
df_ma_2012["ANNEE"] = 2012
df_ma_2013["ANNEE"] = 2013
df_ma_2014["ANNEE"] = 2014

print "-- finish adding year"


-- finish reading CSV
-- finish dropping
-- finish adding year


In [89]:
df_ma_2007["INDEX_STATION"] = df_ma_2007.apply(add_indexStation, axis=1)

In [90]:
df_ma_2007.head(1)

Unnamed: 0,CD_STATION,CD_PARAMETRE,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,LB_PARAMETRE,ANNEE,INDEX_STATION
0,00053X0002/SO1,1102,1,0.05,0,0.1,Aldicarbe,2007,8687


In [91]:
df_ma_2008["INDEX_STATION"] = df_ma_2008.apply(add_indexStation, axis=1)

In [92]:
df_ma_2009["INDEX_STATION"] = df_ma_2009.apply(add_indexStation, axis=1)

In [93]:
df_ma_2010["INDEX_STATION"] = df_ma_2010.apply(add_indexStation, axis=1)

In [94]:
df_ma_2011["INDEX_STATION"] = df_ma_2011.apply(add_indexStation, axis=1)

In [95]:
df_ma_2012["INDEX_STATION"] = df_ma_2012.apply(add_indexStation, axis=1)

In [96]:
df_ma_2013["INDEX_STATION"] = df_ma_2013.apply(add_indexStation, axis=1)

In [97]:
df_ma_2014["INDEX_STATION"] = df_ma_2014.apply(add_indexStation, axis=1)

In [98]:
'''
### add multilevel on index + cleaning

df_ma_2007 = multilevel_MA(df_ma_2007, 2007)
df_ma_2008 = multilevel_MA(df_ma_2008, 2008)
df_ma_2009 = multilevel_MA(df_ma_2009, 2009)
df_ma_2010 = multilevel_MA(df_ma_2010, 2010)
df_ma_2011 = multilevel_MA(df_ma_2011, 2011)
df_ma_2012 = multilevel_MA(df_ma_2012, 2012)
'''
print




In [99]:
#df_ma_2010.head() 

#df_ma_2011.head() 

#df_ma_2012.head() 

In [100]:
### WARNING : GREEDY
### merge all MA datas 

frames_MA = [df_ma_2007, df_ma_2008, df_ma_2009, df_ma_2010, df_ma_2011, df_ma_2012, df_ma_2013, df_ma_2014]

# clean from NaN values if entire row is NaN
frames_MA_cleaned = dfCleanNa(frames_MA)

# concatenate datas MA
df_MA = pd.concat(frames_MA_cleaned)

# set index hierarchy
#df_MA.set_index(["CD_STATION"], inplace=True)
#df_MA.set_index(["CD_STATION", "ANNEE"], inplace=True)
df_MA.set_index(["CD_STATION", "ANNEE", "CD_PARAMETRE", "LB_PARAMETRE"], inplace=True)

#df_MA.sort_index(inplace=True) 
df_MA.sortlevel(inplace=True) 

print " -- df_MA.index.names    : ", df_MA.index.names
print " -- df_MA.index.values   : ", df_MA.index.values
print " -- df_MA.columns.values : ", df_MA.columns.values
print " -- df_MA.columns        : ", df_MA.columns


 -- df_MA.index.names    :  [u'CD_STATION', u'ANNEE', u'CD_PARAMETRE', u'LB_PARAMETRE']
 -- df_MA.index.values   :  [(u'00053X0002/SO1', 2007, 1102, u'Aldicarbe')
 (u'00053X0002/SO1', 2007, 1107, u'Atrazine')
 (u'00053X0002/SO1', 2007, 1108, u'Atrazine d\xe9s\xe9thyl') ...,
 (u'11282X0005/ARAGUI', 2013, 5526, u'Boscalid')
 (u'11282X0005/ARAGUI', 2013, 5617, u'Dimethenamid-P')
 (u'11282X0005/ARAGUI', 2013, 5654, u'Metrafenone')]
 -- df_MA.columns.values :  [u'NBANASPERTS1' u'MA_MOY' u'NBQUANTIF' u'NORME_DCE' 'INDEX_STATION']
 -- df_MA.columns        :  Index([u'NBANASPERTS1', u'MA_MOY', u'NBQUANTIF', u'NORME_DCE',
       u'INDEX_STATION'],
      dtype='object')


In [101]:
### MA : add columns for averages and custom indicators
df_MA["MA_MOY_YEAR"] = np.NaN


In [102]:
print df_MA.shape

(3492185, 6)


In [103]:
### MA : add columns for averages and custom indicators
### is_MA_MOY_sup_to_NORME_DCE --> GREEDY : delta_time : 0:11:56


def is_MA_MOY_sup_to_NORME_DCE(row):
    moy_  = row["MA_MOY"]
    norm_ = row["NORME_DCE"]
    isSup = moy_ > norm_
    return isSup


start_time = datetime.now()
print ">>> start is_MA_MOY_sup_to_NORME_DCE --> %s" %(start_time)

#df_MA["MAMOY_SUP_TO_NORME"] = df_MA.apply(is_MA_MOY_sup_to_NORME_DCE,axis=1)

delta_time = datetime.now() - start_time
print ">>> start is_MA_MOY_sup_to_NORME_DCE / delta_time : %s" %(delta_time)



>>> start is_MA_MOY_sup_to_NORME_DCE --> 2017-02-03 18:01:13.606196
>>> start is_MA_MOY_sup_to_NORME_DCE / delta_time : 0:00:00.000383


In [104]:
checkDTypes(df_MA)

---- index :  CD_STATION
---- index :  ANNEE
---- index :  CD_PARAMETRE
---- index :  LB_PARAMETRE
---- dtypes col :  NBANASPERTS1 / int64
---- dtypes col :  MA_MOY / float64
---- dtypes col :  NBQUANTIF / int64
---- dtypes col :  NORME_DCE / float64
---- dtypes col :  INDEX_STATION / int64
---- dtypes col :  MA_MOY_YEAR / float64


In [105]:
#list(df_MA.index.levels[2])

In [106]:

df_MA.head(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,8687,
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,8687,
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,8687,


In [107]:
idx = pd.IndexSlice

df_MA.loc[ idx[ : , :, 1107 ], : ].head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,8687,
00053X0004/F1,2007,1107,Atrazine,1,0.01,0,0.1,8448,


In [109]:
df_MA.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
09907X0402/PISTE,2008,1677,Meptyldinocap,1,0.025,0,0.1,4362,
02264X1009/P,2010,1137,Cyanazine,5,0.0085,0,0.1,6577,
08151X0006/C,2008,1403,Diméthomorphe,2,0.025,0,0.1,5675,
10292X0110/P,2011,1141,"2,4-D",2,0.005,0,0.1,9065,
04702X0034/SOURCE,2014,1506,Glyphosate,4,0.025,0,0.1,2426,
01318X0042/SAEP,2012,1176,Dinoterbe,3,0.0025,0,0.1,6494,
04288X0067/F,2011,1474,Chlorprophame,4,0.01,0,0.1,5455,
00845X0018/HY,2013,1281,Triallate,3,0.005,0,0.1,239,
08567X0039/F,2009,1289,Trifluraline,2,0.01,0,0.1,5988,
01082X0014/PAEP,2013,1668,Oryzalin,2,0.025,0,0.1,934,


In [110]:

########################################################
########################################################
########################################################
### --- DF_AV /// by : 
###           year - pesticides (levels rows)
###           year - departements (levels columns) 
########################################################
########################################################
########################################################


In [111]:
years_list = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 ] 
print "-- len years_list", len(years_list)


departements_list = list(df_stations.index.levels[0])
print "-- len departements_list", len(departements_list)
#print departements_list


'''
pesticides_familles_list = list(df_pesticides.index.levels[0])
print "-- len pesticides_famille_list", len(pesticides_familles_list)
print pesticides_familles_list


pesticides_fonctions_list = [k for k,v in functions_light.iteritems() ]
print "-- len pesticides_fonctions_list", len(pesticides_fonctions_list)
print pesticides_fonctions_list
'''


### only pesticides listed in df_MA
pesticides_list = list(df_MA.index.levels[2])
#pesticides_list = list(df_pesticides.index.levels[1])
pesticides_list.append(all_pesticides_code)
print "-- len pesticides_list", len(pesticides_list)
print pesticides_list[:10], "..."



-- len years_list 8
-- len departements_list 95
-- len pesticides_list 391
[1083, 1092, 1094, 1101, 1102, 1103, 1104, 1105, 1107, 1108] ...


In [112]:
#list_A = ["A", "B"]
#list_A * 3

In [113]:
### create df_AV dataframe dummy

tuples    = list(itertools.product(years_list, pesticides_list))
len_rows  = len(tuples)
#list_    = [np.NaN]*len_rows
list_pest = pesticides_list*len(years_list)
#dict_     = {"test" : list_pest }

index = pd.MultiIndex.from_tuples(tuples, names=['ANNEE', 'CD_PARAMETRE'])
#index = pd.MultiIndex.from_tuples(tuples, names=['year', 'CD_PARAMETRE'])

df_AV = pd.DataFrame(np.asarray(list_pest), index=index, columns=["CD_PARAMETRE"])


In [114]:
df_AV.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1
2007,1083,1083
2007,1092,1092
2007,1094,1094
2007,1101,1101
2007,1102,1102


In [115]:
df_AV_dpt = df_AV.copy()

for dpt in departements_list :
    df_AV_dpt[str(dpt)] = np.NaN
df_AV_dpt["TOT_FRANCE"] = np.NaN

#df_AV_dpt.drop(0, axis=1, inplace=True)


In [116]:
print df_AV_dpt.shape
df_AV_dpt.tail()


(3128, 97)


Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014,6856,6856,,,,,,,,,,...,,,,,,,,,,
2014,6862,6862,,,,,,,,,,...,,,,,,,,,,
2014,6894,6894,,,,,,,,,,...,,,,,,,,,,
2014,6895,6895,,,,,,,,,,...,,,,,,,,,,
2014,XXXXXX,XXXXXX,,,,,,,,,,...,,,,,,,,,,


In [117]:
df_AV_ME = df_AV.copy()

for ME in MEs_all_list[1:] :
    df_AV_ME[str(ME)] = np.NaN

df_AV_ME["TOT_FRANCE"] = np.NaN

#df_AV_ME.drop(0, axis=1, inplace=True)


In [118]:
print df_AV_ME.shape
df_AV_ME.tail()


(3128, 586)


Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,...,HG402,HG501,HG502,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014,6856,6856,,,,,,,,,,...,,,,,,,,,,
2014,6862,6862,,,,,,,,,,...,,,,,,,,,,
2014,6894,6894,,,,,,,,,,...,,,,,,,,,,
2014,6895,6895,,,,,,,,,,...,,,,,,,,,,
2014,XXXXXX,XXXXXX,,,,,,,,,,...,,,,,,,,,,


In [119]:
#df_pesticides.head(1)


In [120]:

#idx = pd.IndexSlice    
df_AV_dpt.loc[ idx[ :, 1109 ], : ]
#df_pesticides.loc[ idx[ :, 1109 ], : ]
#df_MA.loc[ idx[ :, :, 1109 ], : ]

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,87,88,89,90,91,92,93,94,95,TOT_FRANCE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,1109,1109,,,,,,,,,,...,,,,,,,,,,
2008,1109,1109,,,,,,,,,,...,,,,,,,,,,
2009,1109,1109,,,,,,,,,,...,,,,,,,,,,
2010,1109,1109,,,,,,,,,,...,,,,,,,,,,
2011,1109,1109,,,,,,,,,,...,,,,,,,,,,
2012,1109,1109,,,,,,,,,,...,,,,,,,,,,
2013,1109,1109,,,,,,,,,,...,,,,,,,,,,
2014,1109,1109,,,,,,,,,,...,,,,,,,,,,


In [121]:
### add columns for danger / by pesticide in AV dfs

idx = pd.IndexSlice

def add_danger_AV (row):
    
    CD_param = row["CD_PARAMETRE"]
    #print CD_param , type(CD_param)

    col_name = "Type"

    if CD_param == all_pesticides_code :
        danger = _missing
    
    else : 
        #print CD_param , type(CD_param) 
        try :
            danger = df_pesticides.loc[ idx [ :, int(CD_param) ] , : ].iloc[0][col_name]
        except :
            danger = _missing
    
    #print CD_param , danger, "..."
    return danger


# add danger types
df_AV_dpt["Type"] = df_AV_dpt.apply(add_danger_AV, axis=1)
df_AV_ME["Type"]  = df_AV_ME.apply( add_danger_AV, axis=1)



In [122]:
def add_fonction_AV (row):
    
    CD_param = row["CD_PARAMETRE"]
    #print CD_param , type(CD_param)
    
    col_name = "CODE_FONCTION"
    
    if CD_param == all_pesticides_code :
        fonction = _missing
    
    else : 
        #print CD_param , type(CD_param) 
        try :
            fonction = df_pesticides.loc[ idx [ :, int(CD_param) ] , : ].iloc[0][col_name]
        except :
            fonction = _missing
    
    #print CD_param , danger, "..."
    return fonction

# add danger types
df_AV_dpt["CODE_FONCTION"] = df_AV_dpt.apply(add_fonction_AV, axis=1)
df_AV_ME["CODE_FONCTION"]  = df_AV_ME.apply( add_fonction_AV, axis=1)

In [123]:
def add_famille_AV (row):
    
    CD_param = row["CD_PARAMETRE"]
    #print CD_param , type(CD_param)
    
    col_name = "CODE_FAMILLE"
    
    if CD_param == all_pesticides_code :
        fonction = _missing
    
    else : 
        #print CD_param , type(CD_param) 
        try :
            fonction = df_pesticides.loc[ idx [ :, int(CD_param) ] , : ].iloc[0][col_name]
        except :
            fonction = _missing
    
    #print CD_param , danger, "..."
    return fonction

# add danger types
df_AV_dpt["CODE_FAMILLE"] = df_AV_dpt.apply(add_famille_AV, axis=1)
df_AV_ME["CODE_FAMILLE"]  = df_AV_ME.apply( add_famille_AV, axis=1)

In [124]:
df_AV_dpt.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,1083,1083,,,,,,,,,,...,,,,,,,,II,I,Organophosphorés
2007,1092,1092,,,,,,,,,,...,,,,,,,,II,H,Carbamates
2007,1094,1094,,,,,,,,,,...,,,,,,,,no ref,I,Divers (organiques)
2007,1101,1101,,,,,,,,,,...,,,,,,,,II,H,Divers (organiques)
2007,1102,1102,,,,,,,,,,...,,,,,,,,Ia,"I,N",Carbamates
2007,1103,1103,,,,,,,,,,...,,,,,,,,O,I,Organochlorés
2007,1104,1104,,,,,,,,,,...,,,,,,,,II,H,Triazines et métabolites


In [127]:

########################################################
### fill df_AV_dpt + df_AV_ME & correspondinf MCT | MA
########################################################


In [128]:
### ---> optimization A --- dic_dpt_stations

### pre-store staions per dpt in dict
### iterate through departements

dic_dpt_stations = {}
dic_dpt_stations_count = {}

for dpt in departements_list : 

    # get list of CD_STATION within dpt 
    stations_list  = df_stations.query("NUM_DEP == '%s' " %(dpt) )
    stations_list_ = list(stations_list["CD_STATION_"])
    
    dic_dpt_stations[dpt]       = stations_list_
    dic_dpt_stations_count[dpt] = len(stations_list_)
    

#print dic_dpt_stations_count

test_1 = { k: dic_dpt_stations_count[k] for k in dic_dpt_stations_count.keys()[:1]}
print test_1

test_2 = { k: dic_dpt_stations[k] for k in dic_dpt_stations.keys()[:1]}
print test_2



{u'24': 128}
{u'24': [u'08085X0023/P', u'08076X0017/ERH', u'07596X0010/F', u'07842X0005/HY', u'07821X0001/SOURCE', u'08066X0047/F', u'08066X0019/F', u'08085X0040/HY', u'07595X0022/F', u'08326X0004/HY', u'08326X0006/F', u'07584X0007/F', u'07104X0501/HY', u'08088X0015/F', u'08087X0001/HY', u'08086X0031/S', u'08085X0032/HY', u'08322X0015/P', u'07346X0002/HY', u'07345X0018/F', u'07842X0007/F2', u'08311X0001/HY', u'07583X0003/HY', u'08067X0002/HY', u'07827X0007/SOURCE', u'08305X0002/F', u'08301X0002/F', u'08305X0030/F', u'08316X0016/HY', u'07582X0005/HY', u'07827X0017/HY', u'08087X0021/F', u'07826X0010/HY', u'07107X0031/F', u'08072X0010/HY', u'07361X0014/HY', u'07361X0002/HY', u'07361X0004/S', u'07841X0019/F', u'07847X0001/HY', u'07846X0012/HY', u'07846X0013/HY', u'07348X0010/HY', u'07811X0011/F', u'08075X0014/F', u'08075X0012/HY', u'08301X0015/P', u'08065X0025/F', u'08073X0017/HY', u'08077X0030/ERH', u'08077X0005/F', u'08077X0026/S1', u'07597X0007/A25', u'07346X0013/HY', u'08066X0005/F', u

In [129]:
#print list(test_concat["CD_STATION_"])

In [130]:
MEs_all_list[0:5]


[nan, u'AG001', u'AG002', u'AG003', u'AG004']

In [131]:
#df_empty = pd.DataFrame()
#df_empty
#df_test_concat = pd.concat([df_empty, test_concat])
#df_test_concat

In [132]:
### ---> optimization A --- dic_ME_stations

### pre-store staions per ME in dict
### iterate through ME
### "CD_ME_niv1_surf" | "CD_ME_v2" in stats == "CdMasseDEa" in .shp

df_empty = pd.DataFrame()

dic_ME_stations = {}
dic_ME_stations_count = {}

for ME in MEs_all_list[1:] :     

    # get list of CD_STATION within ME 
    try : 
        stations_list_niv1 = df_stations.loc[ idx [ :, :, ME, :  ], : ]
    except :
        stations_list_niv1 = df_empty
    try : 
        stations_list_niv2 = df_stations.loc[ idx [ :, :, : , ME ], : ]
    except :
        stations_list_niv2 = df_empty
        
    stations_list  = pd.concat( [ stations_list_niv1, stations_list_niv2 ] )
    stations_list_ = list(stations_list["CD_STATION_"])

    dic_ME_stations[ME] = stations_list_
    dic_ME_stations_count[ME] = len(stations_list_)
    

#print dic_ME_stations_count

test_1 = { k: dic_ME_stations_count[k] for k in dic_ME_stations_count.keys()[:1]}
print test_1

#test_2 = { k: dic_ME_stations[k] for k in dic_ME_stations.keys()[:1]}
#print test_2



{u'HG217': 30}


In [133]:

###################################################################
### MAIN AV FUNCTION
###################################################################

###################################################################
### add columns for averages and custom indicators
###################################################################

def MoyDF_YearPest_BY_DptME(dpt_ME, year, cd_parametre, start_time, _1stRd, debug=True ):
    
    # create slicers
    idx = pd.IndexSlice

    # variables : "MOYPTOT" on df_MCT / "MA_MOY" on df_MA
    
    if cd_parametre == all_pesticides_code :
        df = df_MCT
        column_name = "MOYPTOT"
        column_mean = "MOYPTOT_YEAR"
        
    else :
        df = df_MA
        column_name = "MA_MOY"
        column_mean = "MA_MOY_YEAR"
    
    #########################################
    
    if   dpt_ME == "dpt":
        df_AV = df_AV_dpt
        dic_dptME_stations = dic_dpt_stations
    
    elif dpt_ME == "ME" :
        df_AV = df_AV_ME
        dic_dptME_stations = dic_ME_stations
    
    #########################################
    
    try : 
        #df_moy_tot_year = df.query("ANNEE == %s and %s" %(year, query_cd_parametre) )
        df_moy_tot_year = df.loc[ idx[:,year, cd_parametre] , [column_name] ]

        #print "-- %s GLOBAL - debug / mean_year %s for %s / shape df_moy_tot_year = %s" %(column_mean, year , cd_parametre, df_moy_tot_year.shape ) 

        mean_year = df_moy_tot_year[column_name].mean()
    
    except : 
        # if no cd_parametre key for this year 
        mean_year = np.NaN
    
    #########################################
    
    if debug == True :
        delta_time = datetime.now() - start_time
        print "-- %s - mean_year %s for %s : %s (delta time : %s)" %(column_mean, year , cd_parametre, mean_year, delta_time) 
    
    #########################################
    
    ### escapes if mean_year == nan (leave df_Av NaN value)
    if pd.isnull(mean_year) == True :
        pass
    
    else :
        # cf : http://stackoverflow.com/questions/28002197/pandas-proper-way-to-set-values-based-on-condition-for-subset-of-multiindex-da
        # cf : http://pandas-docs.github.io/pandas-docs-travis/advanced.html#advanced-indexing-with-hierarchical-index
        
        
        ### just copy total mean values during first round
        if _1stRd :
            
            # copy mean_year in corresponding dataframe (df)
            df.loc[ idx[ :, year, cd_parametre ] , [ column_mean ] ] = mean_year

        # copy mean_year in df_AV_dpt|df_AV_ME
        df_AV.loc[ idx[year, cd_parametre] , ['TOT_FRANCE'] ] = mean_year
        
        
        ### iterate through departements|ME
        for dptME, stations_list in dic_dptME_stations.iteritems() : 
                        
            # compute mean for dpt|ME
            
            df_moy_tot_dptME_year = df_moy_tot_year.loc[ idx[stations_list,:, :], : ]
            
            #print "-- %s DPT - df_moy_tot_dpt|ME_year %s for %s - dpt|ME %s (%s stations) / shape df_moy_dptME_year = %s" %(column_mean, year, cd_parametre, dptME, len(stations_list), df_moy_tot_dptME_year.shape ) 
            
            mean_year_dptME = df_moy_tot_dptME_year[column_name].mean()
            
            if debug == True :
                print "-- %s ----- mean_year_dpt|ME %s for %s - dpt|ME %s (%s stations) : %s" %(column_mean, year, cd_parametre, dptME, len(stations_list), mean_year_dptME) 
                
            # copy mean_year_dpt|ME in df_AV
            df_AV.loc[ idx[year, cd_parametre] , [dptME] ] = mean_year_dptME
            
            

In [134]:
### compute for df_MCT - iterate through years and dpt --> delta_time : 0:00:04

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MCT = False

_1stRd = True
_df    = "MCT"
_vs    = "dpt"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    start_lap = datetime.now()
    
    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for %s >>>>>>>>" %(_df, _vs, year)
    MoyDF_YearPest_BY_DptME( _vs, year, all_pesticides_code, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MCT)
    
    if debug_MOYPTOT_YEAR_MCT == True :
        delta_lap = datetime.now() - start_lap
        print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
        print

print 
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 
   

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt >>>>>>>> 2017-02-03 18:06:48.230510 

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2007 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2008 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2009 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2010 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2011 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2012 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2013 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs dpt for 2014 >>>>>>>>

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs dpt --- FINISHED --- delta_time : 0:00:06.832183


In [135]:
### compute for df_MCT - iterate through years and ME --> delta_time : 0:00:16

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MCT = False

_1stRd = False
_df    = "MCT"
_vs    = "ME"

print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    start_lap = datetime.now()
    
    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for %s >>>>>>>>" %(_df, _vs, year)
    MoyDF_YearPest_BY_DptME( _vs, year, all_pesticides_code, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MCT)
    
    if debug_MOYPTOT_YEAR_MCT == True :
        delta_lap = datetime.now() - start_lap
        print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
        print

print 
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME MCT / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs ME >>>>>>>> 2017-02-03 18:06:56.621888 

>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2007 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2008 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2009 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2010 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2011 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2012 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2013 >>>>>>>>
>>>>>>>> MoyDF_YearPest_BY_DptME / MCT vs ME for 2014 >>>>>>>>

>>>>>>>> MoyDF_YearPest_BY_DptME MCT / MCT vs ME --- FINISHED --- delta_time : 0:00:24.881824


In [136]:
df_MCT.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00054X0169/F1,2007,XXXXXX,all_pesticides,4.0,0.0,0.0,18.0,96.0,0.0,0.0,8644,0.094931


In [137]:
df_MA.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,8687,


In [138]:
### WARNING : TAKES ++ TIME TO PROCESS !!! aprox 40 min + 15 min (2013-2014)
### compute for df_MA - - iterate through years and dpt

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MA = False
debug_MA              = False ### break after first year if True

_1stRd = True ## don't copy mean year / already done at 1st round
_df    = "MA"
_vs    = "dpt"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s " %(_df, _vs, year)

    for pesticide in pesticides_list[:-1] :
        
        if debug_MA == True : 
            start_lap = datetime.now()
            print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s / pesticide %s " %( _df, _vs, year, pesticide)
            
        MoyDF_YearPest_BY_DptME( _vs, year, pesticide, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MA )    
        
        if debug_MOYPTOT_YEAR_MA == True : 
            delta_lap = datetime.now() - start_lap
            print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %( _df, _vs, year, delta_lap)
            print
            
            # break after 1st pesticide
            break
        
    # break after 1st year : 2007 
    if debug_MA : 
        break

print
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 


>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt >>>>>>>> 2017-02-03 18:07:31.482527 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2007 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2008 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2009 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2010 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2011 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2012 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2013 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt for year 2014 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs dpt --- FINISHED --- delta_time : 1:26:24.497542


In [139]:
df_MA.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,8687,0.026375


In [140]:
print df_AV_dpt.shape
df_AV_dpt.head()

(3128, 100)


Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,1083,1083,0.023464,0.010379,0.019545,0.024118,0.024571,0.01,0.01,0.003418,,...,0.01,0.019323,0.01,0.01,0.01,0.022549,0.015277,II,I,Organophosphorés
2007,1092,1092,0.024488,0.045,0.040705,0.024677,0.024857,0.02,0.02,0.04,,...,0.02,0.03023,0.0425,0.025,0.04375,0.0375,0.030659,II,H,Carbamates
2007,1094,1094,0.023464,0.01,0.013269,0.024118,0.024571,0.01,0.01,0.008,,...,0.022273,0.005526,0.005,0.005,0.005,0.021732,0.013958,no ref,I,Divers (organiques)
2007,1101,1101,0.015512,0.004924,0.010705,0.015294,0.015143,0.006034,0.02,0.009032,0.008913,...,0.024091,0.008628,0.021136,0.02375,0.018889,0.021891,0.016568,II,H,Divers (organiques)
2007,1102,1102,0.023464,0.01,0.014348,0.024118,0.024571,0.008023,0.01,0.008,,...,0.01,0.05,0.05,0.05,0.05,0.05,0.026375,Ia,"I,N",Carbamates


In [141]:
### WARNING : TAKES ++++ TIME TO PROCESS !!! delta_time : 01:20:00
### compute for df_MA - - iterate through years and ME --> 01:30:00 approx

### check time deltas for eficiency 
start_time = datetime.now()
#print str(start_time)

debug_MOYPTOT_YEAR_MA = False  
debug_MA              = False  ### break after 1st year if True

_1stRd = False ## don't copy mean year / already done at 1st round
_df    = "MA"
_vs    = "ME"

print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s >>>>>>>> %s " %(_df, _vs, start_time) 
print 

for year in years_list :

    print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s " %(_df, _vs, year)

    for pesticide in pesticides_list[:-1] :
        
        if debug_MA == True : 
            start_lap = datetime.now()
            print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s for year %s / pesticide %s " %(_df, _vs, year, pesticide)
            
        MoyDF_YearPest_BY_DptME( _vs, year, pesticide, start_time, _1stRd, debug=debug_MOYPTOT_YEAR_MA )    
        
        if debug_MOYPTOT_YEAR_MA == True : 
            delta_lap = datetime.now() - start_lap
            print ">>>>>>>> finished MoyDF_YearPest_BY_DptME / %s vs %s for %s --- delta_lap : %s >>>>>>>>" %(_df, _vs, year, delta_lap)
            print
            
            break
        
    #break after 1st year : 2007 
    if debug_MA : 
        break


print
delta_time = datetime.now() - start_time
print ">>>>>>>> MoyDF_YearPest_BY_DptME / %s vs %s --- FINISHED --- delta_time : %s" %(_df, _vs, delta_time) 


>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME >>>>>>>> 2017-02-03 19:40:22.324011 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2007 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2008 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2009 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2010 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2011 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2012 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2013 
>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME for year 2014 

>>>>>>>> MoyDF_YearPest_BY_DptME / MA vs ME --- FINISHED --- delta_time : 2:39:00.782137


In [142]:
print df_AV_ME.shape
df_AV_ME.head()

(3128, 589)


Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,...,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2007,1083,1083,0.01,0.01,0.01,0.01,0.010909,0.0165,,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.00875,0.015277,II,I,Organophosphorés
2007,1092,1092,0.045833,0.036667,0.04381,0.036667,0.040889,0.045254,0.05,0.048095,0.040909,...,0.028571,0.032381,0.030139,0.029111,0.027232,0.05,0.030659,II,H,Carbamates
2007,1094,1094,,,,,,,,,,...,0.005,0.005625,0.005,0.005,0.005,0.01,0.013958,no ref,I,Divers (organiques)
2007,1101,1101,0.023438,0.02,0.022679,0.02,0.021708,0.023093,0.025,0.024286,0.022955,...,0.019762,0.015444,0.017778,0.019222,0.022054,0.005833,0.016568,II,H,Divers (organiques)
2007,1102,1102,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.031548,0.034048,0.034028,0.032222,0.029018,0.01,0.026375,Ia,"I,N",Carbamates


In [143]:
#df_AV_dpt.set_index[ ; inplace=True]


In [144]:
df_MCT.loc[ idx[:,2013,:,:], : ].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11195X0147/FITTEL,2013,XXXXXX,all_pesticides,3.0,0.0,0.0,23.0,408.0,0.0,0.0,3601,0.214001
11221X0134/TRAVO,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3633,0.214001
11233X0118/PUGNAC,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3570,0.214001
11234X0127/BARA,2013,XXXXXX,all_pesticides,2.0,0.0,0.0,408.0,408.0,0.0,0.0,3575,0.214001
11282X0005/ARAGUI,2013,XXXXXX,all_pesticides,2.0,0.07,0.1,408.0,408.0,1.0,1.0,3539,0.214001


In [145]:
df_MA.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00053X0002/SO1,2007,1102,Aldicarbe,1,0.05,0,0.1,8687,0.026375
00053X0002/SO1,2007,1107,Atrazine,1,0.01,0,0.1,8687,0.021479
00053X0002/SO1,2007,1108,Atrazine déséthyl,1,0.005,0,0.1,8687,0.038045
00053X0002/SO1,2007,1109,Atrazine déisopropyl,1,0.005,0,0.1,8687,0.020551
00053X0002/SO1,2007,1136,Chlortoluron,1,0.01,0,0.1,8687,0.018465


In [146]:

############################################
############################################
############################################
### EXPORTS FOR WEB CONSUMMING 
############################################
############################################
############################################


### df_pesticides --> CSV 
### df_stations   --> CSV (+ GEOJSON from different notebook)
### df_MCT        --> CSV 
### df_MA         --> CSV 
### df_AV_dpt     --> CSV 
### df_AV_MA      --> CSV 



#csv_encoding = "latin-1"

#_csv     = ".csv"
#_sep_csv = ";"
#_web     = "_web" 



In [147]:
df_stations.head(1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,INDEX_STATION,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,Unité_coord_fictifs,X_FICT_L93,Y_FICT_L93,reseau2009,reseau2010,...,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,COORD_WSG84,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
,,,,10261X0039/F3,31,,AEAG,10.0,33,,0.0,0.0,Hors RCS et RCO,,...,,,,,,,"[-1.36308121012, -5.98385630921]",10261X0039/F3,-1.363081,-5.983856


In [148]:
### work on copies
######################################


df_pest_dang_web = df_pesticides_dang.copy()
df_pest_func_web = df_functions.copy()

df_pest_web      = df_pesticides.copy()

#df_stations_web  = df_stations.copy()

#df_MCT_web       = df_MCT.copy()
#df_MA_web        = df_MA.copy()

df_AV_dpt_web    = df_AV_dpt.copy()
df_AV_ME_web     = df_AV_ME.copy()



In [149]:
### clean df_pest_dang_web
######################################


## drop useless columns for web use 
drop_col_dang = ['CAS']

df_pest_dang_web = df_pest_dang_web.drop( drop_col_dang, axis=1 )


In [150]:
#df_pest_dang_web.head(1)

In [151]:
### clean df_pest_web
######################################


## drop useless columns for web use 
drop_col_pesticides = [ 'CD_PARAMETRE', 'CODE_FONCTION']

df_pest_web = df_pest_web.drop( drop_col_pesticides, axis=1 )


In [152]:
df_pest_web.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LB_PARAMETRE,NOM_PARAM2,CODE_FAMILLE,STATUT,METABOLITE,PARENT,NOM_PARENT,CODE_CAS,DATE_NA_USAGE,FORMULEB,NORME_DCE,Type,FONCTIONS
CODE_FAMILLE,CD_PARAMETRE,CODE_FONCTION,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,6276,no ref,Somme pesticides analyses,Pesticides totaux,,,,,,no ref,NaT,,0.5,no ref,no ref


In [153]:
float_test = 0.54467
print int(float_test)

0


In [154]:
## clean df_stations_web
###########################################

df_stations_web  = df_stations.copy()

'''
NUM_DEP;NOM_COM;CD_ME_niv1_surf;CD_ME_v2;CD_STATION;INDEX_STATION;NUM_COM;
codagence;ALTITUDE;PROFONDEUR_MAXI_POINT;
reseau2009;reseau2010;reseau2011;reseau2012;reseau2013;reseau2014;
fi_ma_2007;fi_ma_2008;fi_ma_2009;fi_ma_2010;fi_ma_2011;fi_ma_2012;fi_ma_2013;fi_ma_2014;
LAT_WSG84;LONG_WSG84

'''
cols_stations_to_drop = [u"Unité_coord_fictifs","X_FICT_L93","Y_FICT_L93", "COORD_WSG84"]

df_stations_web = df_stations_web.drop( cols_stations_to_drop, axis=1 )


In [155]:
df_stations_web["ALTITUDE"] = df_stations_web["ALTITUDE"].apply(lambda x : np.round(x, 1) if not pd.isnull(x) else x )
#df_stations_web["ALTITUDE"] = df_stations_web["ALTITUDE"][df_stations_web["ALTITUDE"] != np.NaN ].astype(int, coerce=True)

In [156]:
#df_stations_web[ df_stations_web["ALTITUDE"] == "NA"]
df_stations_web.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,INDEX_STATION,NUM_COM,codagence,ALTITUDE,PROFONDEUR_MAXI_POINT,reseau2009,reseau2010,reseau2011,reseau2012,reseau2013,...,fi_ma_2008,fi_ma_2009,fi_ma_2010,fi_ma_2011,fi_ma_2012,fi_ma_2013,fi_ma_2014,CD_STATION_,LAT_WSG84,LONG_WSG84
NUM_DEP,NOM_COM,CD_ME_niv1_surf,CD_ME_v2,CD_STATION,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
85,LA TARDIERE,GG030,GG030,05638X0025/FORAGE,12086,85289,AELB,142.0,61.0,RCS,RCS/RCO,RCS/RCO,RCS/RCO,RCS/RCO,...,,oui,oui,oui,,oui,,05638X0025/FORAGE,-0.75402,46.663756
73,SAINT-PIERRE-D'ALBIGNY,DG144,DG144,07265X0024/HY,10570,73270,AERM&C,811.0,,Hors RCS et RCO,,,,,...,,,,,,,,07265X0024/HY,6.188101,45.57816
28,VIERVILLE,GG092,,02922X0010/PF,3384,28408,AESN,150.0,37.5,Hors RCS et RCO,,,,,...,oui,,,,,,,02922X0010/PF,1.927196,48.383104


In [157]:
df_MCT.head(2)#[["MINMOLRECH", "MAXMOLRECH"]].astype(int)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00054X0169/F1,2007,XXXXXX,all_pesticides,4.0,0.0,0.0,18.0,96.0,0.0,0.0,8644,0.094931
00054X0169/F1,2012,XXXXXX,all_pesticides,1.0,0.0,0.0,64.0,64.0,0.0,0.0,8644,0.189392


In [158]:
df_MCT_web.head(2)#[["MINMOLRECH", "MAXMOLRECH"]].astype(int)

NameError: name 'df_MCT_web' is not defined

In [159]:
### clean df_MCT_web / df_MA_web
######################################

df_MCT_web       = df_MCT.copy()

### round values in df_ to save space :

decimals = 3 ## round : 0.1234566 --> to : 0.123

#integers = 0
#MCT : MINMOLRECH MAXMOLRECH MINMOLQ MAQMOLQ

# round values in MCT / MA
#df_MCT_web.round( { 
#df_MCT_web = df_MCT_web.round( { 
                             #'MOYPTOT_YEAR' : decimals,
                             #'MINMOLRECH'  : integers,
                             #'MAXMOLRECH'  : integers,
                             #'MINMOLQ'     : integers,
                             #'MAQMOLQ'     : integers
                           #} )
                        
df_MCT_web["MOYPTOT_YEAR"] = df_MCT_web["MOYPTOT_YEAR"].apply(lambda x : np.round(x, decimals)  \
                                                                if not pd.isnull(x) else x)


In [160]:
df_MCT_web[ ['NBPREL', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ' ] ] \
          = df_MCT_web[ ['NBPREL', 'MINMOLRECH', 'MAXMOLRECH', 'MINMOLQ', 'MAQMOLQ' ] ].astype(int)


In [161]:
df_MCT.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
09411X0255/F,2014,XXXXXX,all_pesticides,4.0,0.1075,0.16,404.0,405.0,1.0,1.0,12006,0.217187
02958X0112/AEP,2011,XXXXXX,all_pesticides,2.0,0.0,0.0,107.0,229.0,0.0,0.0,12697,0.168051


In [162]:
df_MCT_web.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBPREL,MOYPTOT,MAXPTOT,MINMOLRECH,MAXMOLRECH,MINMOLQ,MAQMOLQ,INDEX_STATION,MOYPTOT_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
09781X0015/P219,2011,XXXXXX,all_pesticides,8,0.06125,0.24,19,42,1,3,5332,0.168
11013X0002/F,2010,XXXXXX,all_pesticides,5,0.1154,0.19,105,409,2,5,9334,0.186


In [163]:

df_MA_web        = df_MA.copy()

#df_MA_web.round(  { 'MA_MOY_YEAR': decimals }, inplace=True )
#df_MA_web.round(  { 'MA_MOY_YEAR': decimals } )
#df_MA_web["MA_MOY_YEAR"] = df_MA_web["MA_MOY_YEAR"].apply(lambda x:pd.Series.round(x, decimals))
df_MA_web["MA_MOY_YEAR"] = df_MA_web["MA_MOY_YEAR"].apply(lambda x : np.round(x, decimals)  \
                                                                if not pd.isnull(x) else x)

In [164]:
df_MA.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01824X0031/F3,2013,1668,Oryzalin,1,0.025,0,0.1,11410,0.020933
07226X0322/CPT,2011,2008,Flurtamone,5,0.01,0,0.1,9859,0.017169


In [165]:
df_MA_web.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
CD_STATION,ANNEE,CD_PARAMETRE,LB_PARAMETRE,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
07953X0006/S,2009,1208,Isoproturon,1,0.025,0,0.1,5113,0.012
07851X0106/F1,2007,1256,Propazine,1,0.02,0,0.1,2256,0.018


In [166]:
'''
## drop useless columns for web use 
drop_col_cd_stations = [ 'CD_STATION']

df_MCT_web = df_MCT_web.drop( drop_col_cd_stations, axis=1 )
df_MA_web  = df_MA_web.drop( drop_col_cd_stations, axis=1 )
'''
print




In [167]:
### drop index "LB_PARAMETRE" in df_MCT_web / df_MA_web

df_MCT_web.index = df_MCT_web.index.droplevel(3)
df_MA_web.index  = df_MA_web.index.droplevel(3)


### drop index "CD_STATION" in df_MCT_web / df_MA_web

df_MCT_web.index = df_MCT_web.index.droplevel(0)
df_MA_web.index  = df_MA_web.index.droplevel(0)


In [168]:
df_MA_web.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,NBANASPERTS1,MA_MOY,NBQUANTIF,NORME_DCE,INDEX_STATION,MA_MOY_YEAR
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007,1102,1,0.05,0,0.1,8687,0.026


In [169]:
### clean df_AV_dpt_web / df_AV_ME_web
######################################

# round all AV values 

#df_AV_dpt_web = df_AV_dpt.round(decimals)
#df_AV_ME_web  = df_AV_ME.round(decimals)
df_AV_dpt_web = df_AV_dpt_web.round(decimals)
df_AV_ME_web  = df_AV_ME_web.round(decimals)

# drop NaN row in AV
df_AV_dpt_web = df_AV_dpt_web.dropna( axis=0, how="all") # on empty rows
df_AV_ME_web  = df_AV_ME_web.dropna(  axis=0, how="all") # on empty rows


In [170]:
df_AV_dpt.sample(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2009,1214,1214,0.022535,0.009939,0.019063,0.012028,0.010053,0.01,0.0125,0.009163,0.005,...,0.014286,0.033077,0.01725,0.005,0.01,0.013433,0.012326,no ref,H,Divers (organiques)


In [171]:
df_AV_dpt_web.sample(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,01,02,03,04,05,06,07,08,09,...,90,91,92,93,94,95,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2012,2009,2009,0.011,,0.006,0.005,,0.008,,0.01,,...,0.005,0.05,,,0.05,,0.01,II,I,Azoles


In [172]:
df_AV_ME.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,...,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010,1407,1407,,,,,,,,,,...,,,,,,,,U,F,Carbamates
2012,6862,6862,,,,,,,,,,...,,,,,,,,no ref,no ref,
2010,5968,5968,,,,,,,,,,...,,,,,,,,no ref,H,Urées


In [173]:
df_AV_ME_web.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,CD_PARAMETRE,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,...,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,CD_PARAMETRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014,1253,1253,0.006,0.007,0.006,0.006,0.007,0.006,0.006,0.005,0.005,...,0.01,0.009,0.01,0.01,0.01,0.006,0.007,II,F,Azoles
2013,2987,2987,,,,,,,,,,...,,0.01,,,,0.002,0.009,no ref,F,Amides
2008,1812,1812,,,,,,,,,,...,0.005,0.005,0.005,0.005,0.005,,0.019,II,I,Pyréthrinoïdes


In [174]:
# drop level "CD_PARAMETRE" in AV

df_AV_dpt_web.index = df_AV_dpt_web.index.droplevel(1)
df_AV_ME_web.index  = df_AV_ME_web.index.droplevel(1)


In [175]:
df_AV_ME_web.sample(10)

Unnamed: 0_level_0,CD_PARAMETRE,AG001,AG002,AG003,AG004,AG005,AG006,AG007,AG008,AG009,...,HG503,HG504,HG505,HG506,HG507,HG508,TOT_FRANCE,Type,CODE_FONCTION,CODE_FAMILLE
ANNEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,1913,,,,,,,,,,...,,,,,,,,U,H,Urées
2014,2737,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,...,0.01,0.009,0.01,0.01,0.01,0.01,0.009,no ref,no ref,Divers (organiques)
2014,1937,,,,,,,,,,...,,,,,,,,U,H,Amides
2007,1529,,,,,,,,,,...,,,,,,,,U,F,Azoles
2012,1697,,,,,,,,,,...,,,,,,,,"CAS 584-79-2 II 584-79-2 II Name: Type, ...",I,Pyréthrinoïdes
2013,1719,,,,,,,,,,...,,,,,,,,U,F,Divers (organiques)
2009,1102,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.022,0.018,0.021,0.02,0.023,0.005,0.021,Ia,"I,N",Carbamates
2008,1157,0.025,0.025,0.025,0.025,0.025,0.024,0.025,0.025,0.025,...,0.01,0.01,0.01,0.01,0.01,,0.014,II,"I,A",Organophosphorés
2007,1877,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,...,0.029,0.031,0.03,0.029,0.029,0.025,0.023,II,I,Divers (organiques)
2012,1831,,,,,,,,,0.01,...,,,,,,,0.011,no ref,no ref,Triazines et métabolites


In [176]:
### WRITING .CSVs ###

### list df to save for web appli : df, name

df_order_save = [
    "pest_dang",
    "pest_functions",
    "pesticides",
    "stations",
    "MCT",
    "MA",
    "AV_dpt",
    "AV_ME"
]

df_to_web = {
    "pest_dang"      : df_pest_dang_web,
    "pest_functions" : df_pest_func_web,
    "pesticides"     : df_pest_web,
    "stations"       : df_stations_web,
    "MCT"            : df_MCT_web,
    "MA"             : df_MA_web,
    "AV_dpt"         : df_AV_dpt_web,
    "AV_ME"          : df_AV_ME_web      
    }

#for k, df in df_to_web.iteritems() :
    #print k

In [177]:

csv_encoding_web = "utf-8"

def df_to_csv_web(df_, df_name):
    
    print ">>> df_to_csv_web / df_%s " %(df_name)
    outfilename = os.path.join( stats_web_path, df_name + _web + _csv )
    
    print "... outfilename : ", outfilename
    df_.to_csv(outfilename, sep=_sep_csv, encoding = csv_encoding_web )
    print ">>> df_to_csv_web finished for df_%s --> to %s " %(df_name, df_name+_web+_csv)
    print 


### save all pandas df_ to .csv for further uses
#for df_name, df in df_to_web.iteritems() :
for df_name in df_order_save :
    df_to_csv_web( df_to_web[df_name], df_name ) 


## test writing
#test_to_web_csv = "AV_ME"
#df_to_csv_web( df_to_web[test_to_web_csv], test_to_web_csv )


>>> df_to_csv_web / df_pest_dang 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/pest_dang_web.csv
>>> df_to_csv_web finished for df_pest_dang --> to pest_dang_web.csv 

>>> df_to_csv_web / df_pest_functions 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/pest_functions_web.csv
>>> df_to_csv_web finished for df_pest_functions --> to pest_functions_web.csv 

>>> df_to_csv_web / df_pesticides 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/pesticides_web.csv
>>> df_to_csv_web finished for df_pesticides --> to pesticides_web.csv 

>>> df_to_csv_web / df_stations 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/stations_web.csv
>>> df_to_csv_web finished for df_stations --> to stations_web.csv 

>>> df_to_csv_web / df_MCT 
... outfilename :  /Users/jpy/Dropbox/_FLASK/concours_pesticides/app/static/data/stats_web/MCT_w

In [None]:

########################################################
########################################################
########################################################
### --- QUERIES ON DFs
########################################################
########################################################
########################################################


In [None]:
## cf : http://pandas.pydata.org/pandas-docs/stable/indexing.html#the-query-method-experimental

def queryByIndexValue (df, indexName, indexLabelList):
    queryString = '%s in %s' %( indexLabelList, indexName)
    #print queryString
    result = df.query(queryString)
    return result
#df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

def queryByColValue (df, colName, comparator, colValue ):
    queryString = '(%s %s %s)' %( colName, comparator, colValue)
    #print queryString
    result = df.query(queryString)
    return result


def getIndexValuesList(df, indexName):
    result = df.index.get_level_values(indexName).unique()
    return list(result)

def getColValuesList(df, colName ) :
    result = df[colName].unique()
    return list(result)


def listIndexUniqueValues(df) :
    dictIndex = {}
    for indexName in df.index.names :
        listValues = getIndexValuesList(df, indexName)
        dictIndex[indexName] = listValues
    return dictIndex


In [None]:
### test slice by ME index

test_slice = df_stations #.head(5)
test_slice

ME_1 = "DG149"
#ME_2 = "CG004"
ME_2 = "DG149"

test_loc_ME_1 = test_slice.loc[ idx[:,:,  :  , ME_1 ], : ]
test_loc_ME_2 = test_slice.loc[ idx[:,:, ME_2,  :   ], : ]
test_concat = pd.concat([test_loc_ME_1, test_loc_ME_2])
#test_concat

In [None]:
############ TEST ####################
### test 1 on df_MCT

_station = "00057X0248/F4"
_dpt  = '24'
_year = 2007
_cd_parametre = all_pesticides_code

print "station : %s / dpt : %s / year : %s" %(_station, _dpt, _year)

# get list of CD_STATION within dpt 
_stations_list  = df_stations.query("NUM_DEP == '%s'" %(_dpt) )
_stations_list_ = list(_stations_list["CD_STATION_"])
print "len(_stations_list_)", len(_stations_list_)

#df_moy_dpt  = df_MCT.query('CD_STATION=="%s" and ANNEE==%s ' %(station, year) )
_df_moy_dpt_ = df_MCT.query('%s in CD_STATION and ANNEE==%s and CD_PARAMETRE == "%s" ' %( _stations_list_, _year, _cd_parametre ))

print "_df_moy_dpt_.shape", _df_moy_dpt_.shape


In [None]:
############ TEST ####################
### test 2 on df_MA

_dpt  = '24'
_year = 2007
_cd_parametre = str(1177)
_query_cd_parametre = "CD_PARAMETRE==%s" %(_cd_parametre)

_df_moy_   = df_MA.query("ANNEE == %s and %s" %(_year, _query_cd_parametre) )
_mean_year = _df_moy_["MA_MOY"].mean()
print pd.isnull(_mean_year), ":", _mean_year

#df_moy_

In [None]:
############ TEST ####################
### test 1/a on df_MCT vs ME


_years_list = [2007, 2008]
_ME_list    = ["DG149", "CG004"]
source      = "MCT" 
#source      = "MA"

if source == "MCT": 
    df_source = df_MCT
    _cd_parametre = all_pesticides_code
    _column_name  = "MOYPTOT"

elif source == "MA": 
    df_source = df_MA
    _cd_parametre = 1177 
    _column_name  = "MA_MOY"

    
print "TEST MEs / %s.shape" %("df_"+ source), df_source.shape
print 


for _year in _years_list :
               
    for _ME in _ME_list :

        print "_year : %s / _ME : %s" %( _year, _ME )
        print "++ %s.shape              " %("df_"+source), df_source.shape

        _df_moy_tot_year = df_source.loc[ idx[ :, _year, _cd_parametre ] , [_column_name] ]
        print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape

        _stations_list_temp = dic_ME_stations[_ME]
        print "-- len(_stations_list_temp)  ", len(_stations_list_temp)

        _df_moy_tot_year_ME = _df_moy_tot_year.loc[ idx[ _stations_list_temp ,:, :] , :]
        print "-- _df_moy_tot_year_dpt.shape", _df_moy_tot_year_ME.shape

        #print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape
        print


In [None]:
############ TEST ####################
### test 1/b on df_MCT/df_MA vs dpt


_years_list = [2007, 2008]
_dpt_list   = ['24', '44']
#source     = "MCT" 
source      = "MA"

if source == "MCT": 
    df_source = df_MCT
    _cd_parametre = all_pesticides_code
    _column_name  = "MOYPTOT"

elif source == "MA": 
    df_source = df_MA
    _cd_parametre = 1177 
    _column_name  = "MA_MOY"

    
print "TEST DPTs / %s.shape" %("df_"+source), df_source.shape
print 

for _year in _years_list :
               
    for _dpt in _dpt_list :

        print "_year : %s / _dpt : %s" %( _year, _dpt )
        print "++ %s.shape               " %("df_"+source), df_source.shape

        _df_moy_tot_year = df_source.loc[ idx[:, _year, _cd_parametre ] , [_column_name] ]
        print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape

        _stations_list_temp = dic_dpt_stations[_dpt]
        print "-- len(_stations_list_temp)  ", len(_stations_list_temp)

        _df_moy_tot_year_dpt = _df_moy_tot_year.loc[ idx[ _stations_list_temp ,:, :] , :]
        print "-- _df_moy_tot_year_dpt.shape", _df_moy_tot_year_dpt.shape

        #print "-- _df_moy_tot_year.shape    ", _df_moy_tot_year.shape
        print


In [None]:

########################################################
########################################################
########################################################
### tests queries 
########################################################
########################################################
########################################################


In [None]:
'''main complete and clean DF :
    - df_pesticides
    - df_stations
    - df_MCT
    - df_MA
'''

df_sliced_01 = queryByIndexValue(df_stations, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )
df_sliced_02 = queryByIndexValue(df_stations, "NUM_DEP", ["44"] )
df_sliced_03 = queryByIndexValue(df_MCT, "ANNEE", [2009,2010] )
df_sliced_04 = queryByIndexValue(df_MA, "CD_STATION", ["00066X0042/SO", "00053X0002/SO1"] )

df_sliced_05 = queryByColValue(df_MA, "MA_MOY", ">", "NORME_DCE")

print "-- listIndexUniqueValues : ", listIndexUniqueValues(df_sliced_02)
print
print "-- getIndexValuesList : ", getIndexValuesList(df_sliced_04, "CD_PARAMETRE") 
print
print "-- getColValuesList : ", getColValuesList(df_MA, "NORME_DCE") 
print 
#print "-- getColValuesList : ", getColValuesList(df_sliced_02, "NOM_COM")

In [None]:
df_sliced_01 

In [None]:
df_sliced_02.head(7)

In [None]:
df_sliced_03.head()

In [None]:
df_sliced_04

In [None]:
df_sliced_05.head()

In [None]:

########################################################
########################################################
########################################################
### -- MERGE DATAS ??? -- 
########################################################
########################################################


In [None]:
#df_stations_MCT_MA = pd.concat( [df_stations_MCT, df_MA] )
#df_stations_MCT_MA.head()


#print df_stations_MA_MCT.columns


# pivot tables
#df_mct_2008.T

In [None]:

########################################################
########################################################
########################################################
### -- analysis --
########################################################
########################################################

## selections : http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label
### TO DO 




In [None]:

########################################################
########################################################
########################################################
### -- exports --
########################################################
########################################################


### export functions

test_df = df_stations.head()


In [None]:
test_record = df_stations.loc[["44"], : ]
test_record.head()

In [None]:
df_stations.head(1)


In [None]:
### return json 


In [None]:
test_record_reset = test_record.reset_index()
test_record_reset.set_index("CD_STATION", inplace=True)
test_record_reset.head()

In [None]:
#json_stations = df_stations.head(2).to_json(orient="split")
json_stations = test_record_reset.to_json(orient="index") ### set unique index as first json key
#print json_stations

### pretty prints
parsed = json.loads(json_stations)
print json.dumps(parsed, indent=2, sort_keys=True)