In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

from tools import detect_anomalies, posOutlier, Low_Variance_QC

pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")

In [2]:
# Load the harvest statistics table; the first CSV column is used as the index.
df = pd.read_csv('../public/hvstat_africa_data_v1.0.csv', index_col=0)

# Build the per-(country, crop) QC summary table: one row for every distinct
# country/product pair, with the count and percentage columns initialised to 0.
# (Presumably the QC helpers called later fill these in -- confirm in `tools`.)
outlierCount = (
    df.reset_index()[['country', 'product']]
    .drop_duplicates()
    .assign(
        outlier_cnt=0,
        low_variance_cnt=0,
        outlier_pct=0,
        low_variance_pct=0,
    )
    .rename(columns={'product': 'crop'})
)

In [3]:
# Run both QC passes from the local `tools` module. Each call takes the summary
# table and the data table and hands back updated versions of both.
outlierCount, df = posOutlier(outlierCount, df)       # positive outlier QC
outlierCount, df = Low_Variance_QC(outlierCount, df)  # low variance QC

# A row is flagged when either QC column marks it as 'outlier'.
# NOTE(review): the 'outlier' and 'low_variance' columns are not created in this
# notebook; presumably the two helpers above add them -- confirm in `tools`.
is_flagged = (df['low_variance'] == 'outlier') | (df['outlier'] == 'outlier')

# Reset every QC flag to 0 first (this overwrites any legacy manual flags),
# then raise the flag on rows caught by either check.
df['qc_flag'] = 0
df.loc[is_flagged, 'qc_flag'] = 1

# Persist the summary table and the flagged data table.
outlierCount.to_csv('../public/qcFlags_hvstat_africa_data_v1.0.csv')
df.to_csv('../public/hvstat_africa_data_wQCflags_v1.0.csv')

In [4]:
# Plot the yield time series of every region that contains at least one
# QC-flagged point and save it as a PNG. One figure is written per
# (country, region, crop, crop production system, season) combination:
#   grey line/markers -> full yield series
#   orange markers    -> rows where df['outlier'] == 'outlier'
#   purple markers    -> rows where df['low_variance'] == 'outlier'
QC_PLOT_DIR = '../figures/QCplts/'
os.makedirs(QC_PLOT_DIR, exist_ok=True)  # savefig does not create missing folders

for country in df['country'].unique():
    counDF = df[df['country'] == country]
    for crop in counDF['product'].unique():
        crDF = counDF[counDF['product'] == crop]
        for crop_sys in crDF['crop_production_system'].unique():
            sysDF = crDF[crDF['crop_production_system'] == crop_sys]
            for season_name in sysDF['season_name'].unique():
                csDF = sysDF[sysDF['season_name'] == season_name].drop_duplicates()

                # If any row in this subset has admin_2 == 'none', label every
                # region by admin_1; otherwise label by admin_2.
                if (csDF['admin_2'] == 'none').any():
                    csDF['name'] = csDF['admin_1']
                else:
                    csDF['name'] = csDF['admin_2']

                for name in csDF['name'].unique():
                    # NOTE(review): rows are ordered by 'year' while the x-axis
                    # uses 'harvest_year' -- confirm the two columns agree,
                    # otherwise the connecting line may zig-zag.
                    rDF = csDF[csDF['name'] == name].sort_values(by='year', axis=0)
                    if rDF['yield'].isna().all():
                        continue

                    # Compute each mask once; skip series with nothing flagged.
                    pos_mask = rDF['outlier'] == 'outlier'
                    lowvar_mask = rDF['low_variance'] == 'outlier'
                    if not (pos_mask.any() or lowvar_mask.any()):
                        continue

                    # The production system is part of both the title and the
                    # file name: without it, figures for different systems of
                    # the same region/crop/season overwrite each other.
                    label_parts = [str(part) for part in
                                   (country, name, crop, crop_sys, season_name)]
                    # A path separator inside a label would be read as a folder.
                    file_stem = '_'.join(label_parts).replace('/', '-').replace(os.sep, '-')

                    fig1 = plt.figure()
                    plt.title(', '.join(label_parts))
                    plt.plot(rDF['harvest_year'], rDF[['yield']], '-o', color='grey')
                    plt.plot(rDF.loc[pos_mask, 'harvest_year'],
                             rDF.loc[pos_mask, 'yield'], 'o', color='orange')
                    plt.plot(rDF.loc[lowvar_mask, 'harvest_year'],
                             rDF.loc[lowvar_mask, 'yield'], 'o', color='purple')
                    fig1.savefig(QC_PLOT_DIR + file_stem + '.png')
                    # Close this specific figure so long runs do not pile up
                    # open figures in memory.
                    plt.close(fig1)