In [1]:
%matplotlib notebook

In [2]:
import numpy as np
import datetime
from datetime import timedelta
import pandas as pd
import matplotlib as matplotlib
import matplotlib.pyplot as plt
from platform import python_version

# Definitions of certain constants
# One-day step, used to look up the neighbouring date of a timestamp.
DAY = datetime.timedelta(days=1)

# Flag descriptions that are "unredeemable": kbv_imputer will not reset the
# flag of a date whose description contains any of these strings.
RAIN_DESC = "Rain perturbing etcp"
SIMUL_DESC = "Software simulation"
IRR_DESC = "Irrigation perturbing etcp"
NULL_PROFILE_DESC = "Null profile value"
DATA_BLIP_DESC = "Profile data blip"
LARGE_PROFILE_DIP_DESC = "Large profile dip"
ETCP_POS_DESC = "Etcp is positive"
ETCP_OUTLIERS_DESC = "Etcp outliers"
LUX_DESC = "Luxurious water uptake"
BAD_KCP_DESC = "Unacceptable kcp"
UNREDEEMABLE = [RAIN_DESC, SIMUL_DESC, IRR_DESC, NULL_PROFILE_DESC, DATA_BLIP_DESC,
                LARGE_PROFILE_DIP_DESC, ETCP_POS_DESC, ETCP_OUTLIERS_DESC, LUX_DESC, BAD_KCP_DESC]

# Flag descriptions that are "redeemable" (stuck weather-station readings).
HU_STUCK_DESC = "Heat Units `stuck`"
ETO_STUCK_DESC = "Eto `stuck`"
ETC_STUCK_DESC = "Stuck etc due to stuck eto"
REDEEMABLE = [HU_STUCK_DESC, ETO_STUCK_DESC, ETC_STUCK_DESC]

# Upper bounds; ETCP_MAX is derived from the other two.
# NOTE(review): presumably eto is in mm/day and kcp is dimensionless -- confirm.
ETO_MAX = 12
KCP_MAX = 0.8
ETCP_MAX = ETO_MAX * KCP_MAX

In [3]:
# Print the author's library versions (hard-coded in the strings) next to the
# versions found in the running kernel, so that mismatches are easy to spot.
print("Henri is using Python version 3.6.8.  You are using Python version {}.".format(python_version()))
print("Henri is using numpy version 1.15.4.  You are using numpy version {}.".format(np.__version__))
print("Henri is using pandas version 0.24.0.  You are using pandas version {}.".format(pd.__version__))
print("Henri is using matplotlib version 2.2.3.  You are using matplotlib version {}.".format(matplotlib.__version__))

Henri is using Python version 3.6.8.  You are using Python version 3.6.8.
Henri is using numpy version 1.15.4.  You are using numpy version 1.15.4.
Henri is using pandas version 0.24.0.  You are using pandas version 0.24.0.
Henri is using matplotlib version 2.2.3.  You are using matplotlib version 2.2.3.


# Define a helper function called `"flagger"`

This `"flagger"` function will set the flag value equal to 1 for faulty data.  The dates for which the flag value is equal to 1 will not be used in the new calculation of $k_{cp} = \frac{\mathrm{ET}_{cp}}{\mathrm{ET}_o}$.

`"flagger"` will also add a description in the `"description"` column about why a particular date has been flagged.

The `"flagger"` function only operates on the `"df_flag"` DataFrame (instantiated later in this notebook) which only has two columns:
1. `"binary_value"`
2. `"description"`  
`"df_flag"` also has a DateTime Index.

Later on, if desired, we can then merge the `"df_flag"` DataFrame with the main DataFrame containing all our data.  Our merging will take place in such a fashion that we merge entries corresponding to identical dates. (In short, we merge on the Index of our DataFrame).

In [4]:
def flagger(bad_dates, brief_desc, bin_value=0, flag_dataframe=None):
    """
    Flag bad_dates and record, once per date, a brief description of why they were flagged.

    Parameters:
    bad_dates (pandas.core.indexes.datetimes.DatetimeIndex):  Dates whose readings were perturbed.  Duplicate dates are processed once.  An empty collection is a silent no-op.
    brief_desc (str):  A very short description about why bad_dates were flagged.  It is matched literally (not as a regular expression) against the existing descriptions.
    bin_value (int):  The binary value.  Pass 1 to set `binary_value` to 1 for bad_dates.  Pass 0 (default) for redeemable events (e.g. imputed Eto, stuck Etc or heat_units): only the description is appended and an existing flag of 1 is never lowered.
    flag_dataframe (pandas.DataFrame, optional):  DataFrame with the columns `binary_value` and `description`.  Defaults to the notebook-level `df_flag`.

    Returns:
    None.  The flag DataFrame is updated in place.
    """
    flags = df_flag if flag_dataframe is None else flag_dataframe

    # De-duplicate so that the aligned assignments below are unambiguous.
    dates = pd.Index(bad_dates).unique()
    if len(dates) == 0:
        return

    # Literal substring match: descriptions may contain regex metacharacters.
    already_described = flags.loc[dates, "description"].str.contains(brief_desc, regex=False, na=False)
    # A date that already carries the description may still need its flag raised to 1.
    must_raise_flag = (bin_value == 1) and bool((flags.loc[dates, "binary_value"] != 1).any())

    if already_described.all() and not must_raise_flag:
        # Nothing would change: avoid duplicating brief_desc in the description column.
        print("You have already flagged these dates for the reason given in `brief_desc`; No flagging has taken place.")
        return

    if bin_value == 1:
        flags.loc[dates, "binary_value"] = 1

    # Append the description only where it is not present yet.
    new_dates = already_described.index[~already_described.values]
    if len(new_dates) > 0:
        flags.loc[new_dates, "description"] = flags.loc[new_dates, "description"] + " " + brief_desc + "."

    # The very first description of a date starts with a separator space; remove it.
    flags.loc[dates, "description"] = flags.loc[dates, "description"].str.strip()

# Define a helper function named `"reporter"`:

This function prints statements regarding:
1. How much data was lost due to a specific flagging operation.
2. (Optional, `Default=False`) How much data remains useful after all the flagging that has taken place in the entire notebook session.

In [5]:
def reporter(brief_desc, remaining=False, flag_dataframe=None):
    """
    Print how much of the data carries a given flag description.

    Parameters:
    brief_desc (str):  The description to count.  It is matched literally (not as a regular expression) against the `description` column.
    remaining (bool):  If True, also print the percentage of entries whose `binary_value` is still 0.
    flag_dataframe (pandas.DataFrame, optional):  DataFrame with the columns `binary_value` and `description`.  Defaults to the notebook-level `df_flag`.

    Returns:
    None.  The report is printed.
    """
    flags = df_flag if flag_dataframe is None else flag_dataframe

    n_tot_entries = len(flags.index)
    if n_tot_entries == 0:
        # Avoid a division by zero on an empty flag table.
        print("The flag DataFrame is empty; there is nothing to report.")
        return

    tally = flags["description"].str.contains(brief_desc, regex=False).sum()
    perc = tally / n_tot_entries * 100
    print("{:.1f}% of data is affected due to [{}].".format(perc, brief_desc))

    if remaining:
        calc = 100 - flags["binary_value"].sum() / n_tot_entries * 100
        print("After all the flagging that has taken place in this entire notebook, only {:.0f}% of your data is useful.".format(calc))

# The _Kouebokkeveld_ longterm data, and imputer definition:

In the following, we:
1. Define a DataFrame storing the _Kouebokkeveld_ long-term data.
2. Define the `kbv_imputer` function that tries to impute the long-term data into stuck/repeating $\mathrm{ET}_o$ values.

In [6]:
# Week numbers used as the index of the long-term table.  The sequence runs
# 25..49 and then 1..24, i.e. 49 entries: weeks 50-53 are NOT present.
# kbv_imputer looks weeks up by ISO week number and writes NaN when the week
# is missing from this table.
# NOTE(review): confirm whether weeks 50-53 were omitted on purpose.
calendar_week = np.array([25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                          35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 
                          45, 46, 47, 48, 49,  1,  2,  3,  4,  5,
                           6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                          16, 17, 18, 19, 20, 21, 22, 23, 24])

# Long-term ETo value for each entry of calendar_week (same order, 49 values).
kbv_eto = np.array([2.30, 2.30, 2.30, 2.30, 2.30, 2.40, 2.50, 2.65, 2.80, 3.10,
                    3.40, 3.65, 4.00, 4.40, 4.80, 5.30, 5.80, 6.30, 6.70, 7.10,
                    7.60, 8.00, 8.30, 8.60, 8.80, 8.90, 8.90, 8.90, 8.80, 8.70,
                    8.50, 8.30, 8.00, 7.50, 7.00, 6.50, 5.80, 5.20, 4.70, 4.30,
                    3.70, 3.40, 3.10, 2.80, 2.50, 2.45, 2.40, 2.35, 2.30])

# Lookup table: index = week number, single column "kbv_eto".
df_kbv = pd.DataFrame(data=kbv_eto, index=calendar_week, columns=["kbv_eto"])

In [7]:
def kbv_imputer(flagged_dates, dataframe, column_to_be_imputed, flag_dataframe):
    """
    Overwrite a column with the long-term `df_kbv` value of the matching ISO week.

    Parameters:
    flagged_dates:  Iterable of timestamps whose value must be imputed.
    dataframe (pandas.DataFrame):  Data to impute into; modified in place.
    column_to_be_imputed (str):  Name of the column that receives the long-term value.
    flag_dataframe (pandas.DataFrame):  Flag table with the columns `binary_value` and `description`; modified in place.

    Returns:
    tuple:  (dataframe, flag_dataframe), the same objects that were passed in.

    For every date the long-term value is written first.  If the description of
    that date contains none of the UNREDEEMABLE reasons, the flag is reset to 0
    and ETO_STUCK_DESC is replaced by "Imputed eto" in the description.  If a
    lookup raises KeyError (for instance a week that is missing from `df_kbv`),
    the column is set to NaN for that date and the flag table is left untouched.
    """
    IMPUTED_ETO = "Imputed eto"
    for stamp in flagged_dates:
        iso_week = stamp.isocalendar()[1]
        try:
            dataframe.loc[stamp, [column_to_be_imputed]] = df_kbv.loc[iso_week, "kbv_eto"]
            current_description = flag_dataframe.loc[stamp, "description"]
            is_unredeemable = any(reason in current_description for reason in UNREDEEMABLE)
            if not is_unredeemable:
                # The entry has been 'salvaged': clear the flag and relabel the reason.
                flag_dataframe.loc[stamp, "binary_value"] = 0
                flag_dataframe.loc[stamp, "description"] = current_description.replace(ETO_STUCK_DESC, IMPUTED_ETO)
        except KeyError:
            dataframe.loc[stamp, column_to_be_imputed] = np.nan
    return dataframe, flag_dataframe

# Master Crop Coefficients & $k_{cp}$ flagging function:

In [8]:
# Month numbers, listed from July to June.
calendar_month = np.array([7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6])
# Two monthly kcp curves aligned with calendar_month.  Only row_4 is used below.
# NOTE(review): the origin of the name `row_4` is not visible here -- confirm its source.
penman_kcp = np.array([0.10, 0.30, 0.40, 0.60, 0.73, 0.88, 0.95, 0.95, 0.95, 0.70, 0.40, 0.20])
row_4 =      np.array([0.61, 0.73, 0.88, 0.95, 0.95, 0.95, 0.95, 0.90, 0.80, 0.40, 0.30, 0.10])

# Lookup table: index = calendar month, single column "norm_kcp".
accepted_kcp_norm = pd.DataFrame(index=calendar_month, data=row_4, columns=["norm_kcp"])
accepted_kcp_norm.index.name = "calendar_month"

def calculate_kcp_deviation(dataframe):
    """
    Compute the absolute percentage deviation of `kcp` from the monthly norm.

    Parameters:
    dataframe (pandas.DataFrame):  Must have a DatetimeIndex and a `kcp` column.  A `kcp_perc_deviation` column is added (or overwritten) in place.

    Returns:
    pandas.Series:  The new `kcp_perc_deviation` column, i.e. |kcp - norm| / norm * 100 where norm is the `norm_kcp` of the month of each row.
    """
    # Norm of each row's month, looked up for all rows at once.
    months = np.asarray(dataframe.index.month, dtype=int)
    norms = accepted_kcp_norm.loc[months, "norm_kcp"].values
    empirical_kcp = dataframe["kcp"].values.astype(float)
    dataframe["kcp_perc_deviation"] = np.abs((empirical_kcp - norms) / norms) * 100.0
    return dataframe["kcp_perc_deviation"]

# Specify the Probe-ID you are interested in:

Valid options are:
* `"P-370"`
* `"P-371"`
* `"P-372"`
* `"P-384"`
* `"P-391"`
* `"P-392"`
* `"P-891"`

You can change the Probe-ID in the following code-cell, and please remember to specify it in string format:

In [9]:
# Probe to analyse; it is used as the Excel sheet name when the data is loaded.
probe_id = "P-392"
assert isinstance(probe_id, str), "variable probe_id must be of type string!"

# Extract the data from our Excel file and store it in a `pandas DataFrame`.

- Notice in the following code cell, we extract the daily data for the probe of interest.

- While cleaning the column names, we replace every digit `0` with the letter `o` (so that, for example, `et0` becomes `eto`) and remove the unnecessary leading white-space at the beginning of each column name.  The cleaned column names are then assigned back to the dataframe.

In [10]:
# Read the daily sheet of the selected probe; the first column becomes a
# date-parsed index.
data = pd.read_excel(
    "Golden_Delicious_daily_data.xlsx",
    sheet_name=probe_id,
    index_col=0,
    parse_dates=True,
)
# Normalise the headers: every "0" becomes "o" and leading white-space is removed.
new_columns = [c.replace("0", "o").lstrip() for c in data.columns]
data.columns = new_columns
print(data.columns)

Index(['heat_units', 'rain', 'erain', 'total_irrig', 'tot_eff_irrig', 'etc',
       'ety', 'eto', 'etcp', 'rzone', 'available', 'days_left',
       'deficit_current', 'rzm', 'rzm_source', 'fcap', 'profile',
       'deficit_want', 'refill', 'eto_forecast_yr', 'original_unit_system'],
      dtype='object')


The following data columns are not of interest to us for our analysis:
* `rzone`
* `available`
* `days_left`
* `deficit_current`
* `rzm`
* `fcap`
* `deficit_want`
* `refill`
* `eto_forecast_yr`

Therefore, we are going to drop these columns from the `data` DataFrame:

In [11]:
# Remove the columns that are not needed for the analysis.
# NOTE: the drop is done in place, so running this cell a second time raises a
# KeyError because the columns are already gone.
data.drop(labels=["rzone", "available", "days_left", "deficit_current", "rzm", 
                  "fcap", "deficit_want", "refill", "eto_forecast_yr"], axis=1, inplace=True)

In [12]:
# Show which columns remain after the drop.
data.columns

Index(['heat_units', 'rain', 'erain', 'total_irrig', 'tot_eff_irrig', 'etc',
       'ety', 'eto', 'etcp', 'rzm_source', 'profile', 'original_unit_system'],
      dtype='object')

In [13]:
# ISO week number of every date (second element of isocalendar()).
data["calendar_week"] = data.index.to_series().apply(lambda d: d.isocalendar()[1])

In [14]:
# Preview the first rows of the cleaned data.
data.head()

Unnamed: 0_level_0,heat_units,rain,erain,total_irrig,tot_eff_irrig,etc,ety,eto,etcp,rzm_source,profile,original_unit_system,calendar_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-08-01,0.0,0.0,0.0,0.0,0.0,0.82,0.68,6.8,218.81,Electronic Probe,218.81,si,31
2017-08-02,0.0,1.0,0.0,0.0,0.0,0.82,0.68,6.8,3.18,Electronic Probe,221.99,si,31
2017-08-03,0.0,0.2,0.0,0.0,0.0,0.82,0.68,6.8,-1.3,Electronic Probe,220.69,si,31
2017-08-04,0.0,0.0,0.0,0.0,0.0,0.82,0.68,6.8,-0.46,Electronic Probe,220.23,si,31
2017-08-05,0.0,0.0,0.0,0.0,0.0,0.82,0.46,6.8,-0.51,Electronic Probe,219.72,si,31


# Create standalone `df_flag` dataframe

In the following code we create a standalone dataframe which will store flag values as well as descriptions on why a particular date was flagged as being inappropriate for our new calculation of $k_{cp}$.  At first, we initialise all our dates to a flag value of 0.  If during our analysis we realise that there are dates with junk data, we will update the flag value to be 1, and also provide a brief description.

In [15]:
# Flag table sharing the index of `data`: every date starts unflagged
# (binary_value 0) with an empty description.
df_flag = pd.DataFrame(index=data.index, columns=["binary_value", "description"])
df_flag["binary_value"] = 0
df_flag["description"] = str()  # we initialise this column with an empty string `""`
df_flag.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 550 entries, 2017-08-01 to 2019-02-01
Data columns (total 2 columns):
binary_value    550 non-null int64
description     550 non-null object
dtypes: int64(1), object(1)
memory usage: 12.9+ KB


# `["etc", "eto"]` versus time

Notice that `"etc"` is column G in the Excel spreadsheet.  
`"eto"` is column I in the Excel spreadsheet.  
`"etcp"` lives in column J of the Excel spreadsheet.

In [16]:
fig, ax = plt.subplots()
fig.set_size_inches(9, 3.5)

# Raw strings are used for the LaTeX labels: "\m" is not a valid Python escape
# sequence and triggers a warning in a normal string literal.
ax.plot(data.index, data["etc"], color="blue", label=r"$\mathrm{ET}_c$")
ax.plot(data.index, data["eto"], color="green", label=r"$\mathrm{ET}_o$")
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), linestyle="--", color="red", alpha=0.6, label="2018-08-01")
ax.set_ylabel(r"$\mathrm{ET}_c$ and $\mathrm{ET}_o$")
ax.set_xlabel("Date")
ax.set_title("Evapotranspiration versus Time")
ax.legend(loc=0)
plt.show()

<IPython.core.display.Javascript object>

In [17]:
# Difference with the value of one and of two days earlier.
data["eto_diff1"] = data["eto"].diff(periods=1)
data["eto_diff2"] = data["eto"].diff(periods=2)
# A date is "stuck" when its eto equals the value of the previous day or of two
# days before.  The first date of a repeating run is therefore not selected.
condition = (data["eto_diff1"] == 0.0) | (data["eto_diff2"] == 0)  # element-wise OR
bad_eto_days = data[condition].index

# Blank out the stuck readings; they are imputed later on.
# NOTE: because eto is overwritten here, re-running this cell selects different dates.
data.loc[bad_eto_days, ["eto"]] = np.nan

flagger(bad_dates=bad_eto_days, brief_desc=ETO_STUCK_DESC, bin_value=1)
reporter(brief_desc=ETO_STUCK_DESC)

53.5% of data is affected due to [Eto `stuck`].


In [18]:
# Inspect the flag table after the stuck eto dates have been flagged.
print(df_flag)

            binary_value   description
date                                  
2017-08-01             0              
2017-08-02             1  Eto `stuck`.
2017-08-03             1  Eto `stuck`.
2017-08-04             1  Eto `stuck`.
2017-08-05             1  Eto `stuck`.
2017-08-06             1  Eto `stuck`.
2017-08-07             1  Eto `stuck`.
2017-08-08             1  Eto `stuck`.
2017-08-09             1  Eto `stuck`.
2017-08-10             1  Eto `stuck`.
2017-08-11             1  Eto `stuck`.
2017-08-12             1  Eto `stuck`.
2017-08-13             1  Eto `stuck`.
2017-08-14             1  Eto `stuck`.
2017-08-15             1  Eto `stuck`.
2017-08-16             1  Eto `stuck`.
2017-08-17             1  Eto `stuck`.
2017-08-18             0              
2017-08-19             0              
2017-08-20             0              
2017-08-21             0              
2017-08-22             0              
2017-08-23             0              
2017-08-24             0 

### Impute _Kouebokkeveld_ data into stuck `eto` values.

In [19]:
# Impute the long-term ETo into the stuck dates.  kbv_imputer modifies both
# frames in place and returns them.
data, df_flag = kbv_imputer(flagged_dates=bad_eto_days, dataframe=data,
                    column_to_be_imputed="eto", flag_dataframe=df_flag)

In [20]:
# Print the flag value and description of every date (one line per date).
for d in df_flag.index:
    print("{} <---> {}".format(df_flag.loc[d, "binary_value"], df_flag.loc[d, "description"]))

0 <---> 
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> Imputed eto.
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> Imputed eto.
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <---> 
0 <-

### Flag `etc` values that are stuck

In [21]:
# etc is blanked out on the dates where eto was stuck.
data.loc[bad_eto_days, ["etc"]] = np.nan

# bin_value=0: only the description is added, the flag value is not raised.
flagger(bad_dates=bad_eto_days, brief_desc=ETC_STUCK_DESC, bin_value=0)
reporter(brief_desc=ETC_STUCK_DESC)

53.5% of data is affected due to [Stuck etc due to stuck eto].


In [22]:
# Inspect the flag table after the stuck etc description has been added.
print(df_flag)

            binary_value                               description
date                                                              
2017-08-01             0                                          
2017-08-02             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-03             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-04             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-05             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-06             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-07             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-08             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-09             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-10             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-11             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-12             0  Imputed eto. Stuck etc due to stuck eto.
2017-08-13             0  Imputed eto. Stuck etc due to stuck 

There are time intervals for which `"eto"` (and also `"heat_units"`) are stuck: they repeat identical values over a long period of time.  Such exact repetition is not plausible because weather data varies from day to day.  The data entries associated with these repeating values are flagged.

It is rather unfortunate that a relatively large volume of data is lost when flagging these "stuck" values.

In [23]:
fig, ax = plt.subplots()
fig.set_size_inches(9, 3.5)

# Raw strings are used for the LaTeX labels: "\m" is not a valid Python escape
# sequence and triggers a warning in a normal string literal.
ax.plot(data.index, data["etc"], color="blue", label=r"Remaining $\mathrm{ET}_c$")
ax.plot(data.index, data["eto"], color="green", label=r"Remaining $\mathrm{ET}_o$")
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), linestyle="--", color="red", alpha=0.6, label="2018-08-01")
ax.set_ylabel(r"$\mathrm{ET}_c$ and $\mathrm{ET}_o$")
ax.set_xlabel("Date")
ax.set_title(r"$\mathrm{ET}_c$ and $\mathrm{ET}_o$ after flagging and imputation.")
ax.legend(loc=0)
plt.show()

<IPython.core.display.Javascript object>

In [24]:
# Series.max() skips NaN.  The builtin max() compares element by element, so
# its result depends on where the NaN values sit (a leading NaN yields NaN).
eto_max = data["eto"].max()
print("The maximum (valid) eto is equal to: {:.1f}.".format(eto_max))

The maximum (valid) eto is equal to: 8.9.


# `"rain"` versus time

In the code cell below, we flag rain events in which:
* `"rain"` > 2 mm.

In [25]:
# Select the dates with more than 2 (mm, per the text above) of rain.
condition = (data["rain"] > 2)
flagged_rain_dates = data[condition].index

# bin_value=1: these dates are marked as unusable.
flagger(bad_dates=flagged_rain_dates, brief_desc=RAIN_DESC, bin_value=1)
reporter(brief_desc=RAIN_DESC)

14.9% of data is affected due to [Rain perturbing etcp].


In [26]:
# Inspect the flag table after the rain dates have been flagged.
print(df_flag)

            binary_value                                        description
date                                                                       
2017-08-01             0                                                   
2017-08-02             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-03             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-04             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-05             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-06             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-07             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-08             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-09             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-10             0           Imputed eto. Stuck etc due to stuck eto.
2017-08-11             1  Imputed eto. Stuck etc due to stuck eto. Rain ...
2017-08-12  

In [27]:
# Bar chart of daily rain; the flagged rain dates are marked on top of the bars.
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

ax.bar(x=data.index, height=data["rain"], color="magenta", label="rain")
ax.set_xlabel("Date")
ax.set_ylabel("rain [mm]")
ax.set_title("rain versus time")
# Markers at the rain value of each flagged date.
ax.scatter(flagged_rain_dates, data.loc[flagged_rain_dates, ["rain"]], label="Rain > 2 mm (flagged)", 
           c="black", marker=6, s=10, alpha=1)
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", ls="--", alpha=0.4, label="2018-08-01")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

The great majority of rain events are flagged.

There appear to be few rain events with rain ≤ 2 mm, i.e. few rain events that escape the flag.

# `"Total Irrigation"`

We need to flag data entries corresponding to irrigation events because it distorts our `"profile"` and `"etcp"` waterbalance readings.  This is somewhat complicated by the possibility of a **farmer logging an irrigation event on the wrong date.**

Flag an irrigation data entry:
1. If the farmer logged irrigation taking place for that day, **AND**
2. If the corresponding $\mathrm{ET}_{cp} > 0.5\cdot\mathrm{ET}_c$, **AND**
3. If there is no rain for that particular day: rain == 0.

Let us implement this in the following code cell:

In [28]:
# An irrigation date is flagged when all three hold: irrigation was logged,
# etcp exceeds half of etc, and there was no rain on that date.
# NOTE(review): "etc" was set to NaN on bad_eto_days in an earlier cell.  A
# comparison with NaN is False, so irrigation on those dates can never satisfy
# the second condition and is never flagged here -- confirm this is intended.
conditions = (data["total_irrig"] > 0) & (data["etcp"] > 0.5*data["etc"]) & (data["rain"] == 0)
flag_irrigation_dates = data[conditions].index

# bin_value=1: these dates are marked as unusable.
flagger(bad_dates=flag_irrigation_dates, brief_desc=IRR_DESC, bin_value=1)
reporter(brief_desc=IRR_DESC)

3.8% of data is affected due to [Irrigation perturbing etcp].


In [29]:
# Bar chart of total irrigation; flagged irrigation dates and flagged rain
# dates are marked at their irrigation value.
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

ax.bar(data.index, data["total_irrig"], color="magenta", label="Irrigation")
ax.scatter(flag_irrigation_dates, data.loc[flag_irrigation_dates, ["total_irrig"]], label="Flagged Irr events", 
           c="black", marker="o", s=5, alpha=1)
ax.scatter(flagged_rain_dates, data.loc[flagged_rain_dates, ["total_irrig"]], label="rain > 2 mm",
           c="orange", marker="^", s=10, alpha=1)
ax.set_xlabel("Date")
ax.set_ylabel("Total Irrigation [mm]")
ax.set_title("Total Irrigation versus Time")
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", ls="--", alpha=0.4, label="2018-08-01")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

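This figure looks wrong.  There appear to be many irrigation events that are not flagged.  A likely cause: `"etc"` was set to `NaN` on all dates with a stuck `"eto"` (cell 21), and a comparison against `NaN` is always `False`, so condition 2 ($\mathrm{ET}_{cp} > 0.5\cdot\mathrm{ET}_c$) can never be satisfied on those dates.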

# Investigate `"rzm_source"`:  "`software`" versus "`Electronic Probe`".

Basically we want to flag the entries for which the column `"rzm_source"` contains the description `software`.  We do not want to build our model from simulated data, but rather from actual probe readings.

In [30]:
# Literal, NaN-safe match: a missing rzm_source counts as "not software", so
# the mask stays purely boolean and can be used for indexing.
condition = data["rzm_source"].str.contains("software", regex=False, na=False)
flag_software_dates = data[condition].index

# bin_value=1: simulated readings are marked as unusable.
flagger(bad_dates=flag_software_dates, brief_desc=SIMUL_DESC, bin_value=1)
# Report with the same constant that was used for flagging.
reporter(brief_desc=SIMUL_DESC)

3.6% of data is affected due to [Software simulation].


# `"profile"` versus time

Notice that `"profile"` corresponds to column R in the Excel spreadsheet.

In the `"profile"` column, there are certain entries containing 0.0; these entries correspond to missing data.  For these missing `"profile"` entries, we replace the 0.0's with `NaN`'s (Not a Number).  We also flag these missing `"profile"` entries.

In [31]:
# Replace missing entries (stored as 0.0) with NaN.  The result is assigned
# back explicitly instead of using inplace=True on a column selection, which
# does not reliably update the parent DataFrame.
data["profile"] = data["profile"].replace(0.0, np.nan)

condition = data["profile"].isnull()
bad_profile_days = data[condition].index

# bin_value=1: dates without a profile reading are marked as unusable.
flagger(bad_dates=bad_profile_days, brief_desc=NULL_PROFILE_DESC, bin_value=1)
reporter(brief_desc=NULL_PROFILE_DESC)

3.6% of data is affected due to [Null profile value].


In [32]:
# Day-to-day change of the profile reading.
data["profile_difference"] = data["profile"].diff()

# A "data blip" is a date on which the profile dropped while the profile of
# the next calendar day is missing.
data_blip_days = []
for d in data.index:
    try:
        if (data.loc[d, "profile_difference"] < 0) and pd.isnull(data.loc[d + DAY, "profile"]):
            data_blip_days.append(d)
    except KeyError:
        # d + DAY is not in the index (e.g. the last date): nothing to compare with.
        pass

data_blip_days = pd.to_datetime(data_blip_days)
# bin_value=1: these dates are marked as unusable.
flagger(bad_dates=data_blip_days, brief_desc=DATA_BLIP_DESC, bin_value=1)
reporter(brief_desc=DATA_BLIP_DESC)

1.1% of data is affected due to [Profile data blip].


In [33]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

ax.set_xlabel('Date')
ax.set_ylabel('Profile')
ax.set_title("Profile versus Time.")
# Line width is passed as a number.
ax.plot(data.index, data["profile"], color="blue", label="Profile", lw=1)
ax.scatter(x=data_blip_days, y=data.loc[data_blip_days, ["profile"]], s=50, color="black",
           marker="*", label="Data blips", edgecolors="red")
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), color="black", alpha=0.4, linestyle="--", label="2018-08-01")

# One thin vertical line per flagged date.  Only the first line of each family
# carries a legend label ("_nolegend_" hides the others), which also works
# when a family has no dates at all.
for i, d in enumerate(flag_irrigation_dates):
    ax.axvline(x=d, color="pink", alpha=1.0, linestyle="-", linewidth=0.3,
               label="Flagged Irr" if i == 0 else "_nolegend_")

for i, d in enumerate(flagged_rain_dates):
    ax.axvline(x=d, color="lime", alpha=1.0, linestyle="-", linewidth=0.3,
               label="rain > 2 mm" if i == 0 else "_nolegend_")

ax.legend()
plt.show()

<IPython.core.display.Javascript object>

Notice that for the dates adjacent to missing data gaps, there is always a strange slanting in the `"profile"` value.  These slanting dips are indicated by the red stars in the above plot.  The data entries associated with these slanting dips are also flagged.  These are data blips and imply that the apple tree had a massive water uptake via absorption through its roots, but physiologically this is not possible.

There are also other profile readings that appear suspicious because of the following pattern: profile(t-1) is some value, say $k$; profile(t) dips very low from profile(t-1); profile(t+1) is close to the value of profile(t-1).  Up next, we try to flag these "large dips" that appear suspicious.

In [34]:
# The data blips are already flagged; exclude them from the dip statistics.
data.loc[data_blip_days, ["profile_difference"]] = np.nan

# Threshold: the 5% quantile of all negative day-to-day profile changes.
negative_differences = data.loc[data["profile_difference"] < 0, "profile_difference"].values
percentile_value = np.quantile(negative_differences, q=0.05)

# A "large dip" is a drop below the threshold that is followed by a rise on
# the next calendar day.
large_dip_days = []
for d in data.index:
    try:
        if (data.loc[d, "profile_difference"] < percentile_value) and (data.loc[d + DAY, "profile_difference"] > 0):
            large_dip_days.append(d)
    except KeyError:
        # d + DAY is not in the index (e.g. the last date): nothing to compare with.
        pass

large_dip_days = pd.to_datetime(large_dip_days)
# bin_value=1: these dates are marked as unusable.
flagger(bad_dates=large_dip_days, brief_desc=LARGE_PROFILE_DIP_DESC, bin_value=1)
reporter(brief_desc=LARGE_PROFILE_DIP_DESC)
print(len(large_dip_days))

2.2% of data is affected due to [Large profile dip].
12


In [35]:
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

ax.set_xlabel('Date')
ax.set_ylabel('Profile')
ax.set_title("Profile versus Time.")
ax.plot(data.index, data["profile"], color="blue", label="Profile", lw=1)
ax.scatter(x=data_blip_days, y=data.loc[data_blip_days, ["profile"]], s=50, color="black",
           marker="*", label="Data blips", edgecolors="red")
ax.scatter(x=large_dip_days, y=data.loc[large_dip_days, ["profile"]], s=50, color="black",
           marker="X", label="'Large' Dips", edgecolors="green")
# Dashed vertical marker at 2018-08-01.
ax.axvline(x=datetime.datetime(2018, 8, 1), color="black", alpha=0.5, linestyle="--", label="2018-08-01")

# One thin vertical line per flagged date.  Only the first line of each family
# carries a legend label ("_nolegend_" hides the others), which also works
# when a family has no dates at all.
for i, d in enumerate(flag_irrigation_dates):
    ax.axvline(x=d, color="pink", alpha=1.0, linestyle="-", linewidth=0.3,
               label="Flagged Irr" if i == 0 else "_nolegend_")

for i, d in enumerate(flagged_rain_dates):
    ax.axvline(x=d, color="lime", alpha=1.0, linestyle="-", linewidth=0.3,
               label="rain > 2 mm" if i == 0 else "_nolegend_")

ax.legend()
plt.show()

<IPython.core.display.Javascript object>

# `"heat_units"` versus Time

Notice that `"heat_units"` corresponds to column B of the Excel spreadsheet.

There are many dates for which the value of `"heat_units"` is jammed and repeats for a long time interval.  This is due to faulty weatherstation data.  Entries for which `"heat_units"` values repeat are flagged, and the actual `"heat_units"` values are replaced with 0.

In [36]:
# Difference of heat_units with the value of one and of two days earlier.
data["hu_diff1"] = data["heat_units"].diff(periods=1)
data["hu_diff2"] = data["heat_units"].diff(periods=2)

In [37]:
# A date is "stuck" when its heat_units equal the value of the previous day or
# of two days before.
condition = (data["hu_diff1"] == 0.0) | (data["hu_diff2"] == 0)  # element-wise OR
bad_hu_days = data[condition].index

# bin_value=0: only the description is added, the flag value is not raised.
flagger(bad_dates=bad_hu_days, brief_desc=HU_STUCK_DESC, bin_value=0)
reporter(brief_desc=HU_STUCK_DESC)

53.8% of data is affected due to [Heat Units `stuck`].


In [38]:
# Inspect the flag table after the stuck heat-unit description has been added.
df_flag

Unnamed: 0_level_0,binary_value,description
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-01,0,
2017-08-02,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-03,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-04,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-05,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-06,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-07,0,Imputed eto. Stuck etc due to stuck eto.
2017-08-08,0,Imputed eto. Stuck etc due to stuck eto.
2017-08-09,0,Imputed eto. Stuck etc due to stuck eto.
2017-08-10,0,Imputed eto. Stuck etc due to stuck eto.


In [39]:
# Stuck heat-unit readings are replaced with 0 so that they add nothing to the
# cumulative total computed further down.
data.loc[bad_hu_days, "heat_units"] = 0.0

In [40]:
def cumulative(s):
    """
    Accumulate a daily series into a running total that restarts each season.

    A season starts on 1 August.  The running total is pinned to 0.0 on
    1 August (the value recorded on that day is not counted), restarts from
    the day's own value on 2 August, and on every other date equals the
    previous row's total plus the day's own value.

    Parameters
    ----------
    s : pandas.Series
        Daily values indexed by a chronologically sorted DatetimeIndex.
        The Series may carry any name.

    Returns
    -------
    pandas.Series
        Float series named "cumulative" that shares the index of `s`.

    Notes
    -----
    The accumulation is a vectorised grouped cumulative sum instead of a
    row-by-row lookup of "yesterday".  As a consequence:

    * the input no longer has to be named "heat_units";
    * the index may begin on any date and may contain gaps (the total simply
      continues from the previous available row);
    * a missing value yields NaN on its own row only and does not poison the
      rest of the season.
    """
    months = np.asarray(s.index.month)
    days = np.asarray(s.index.day)
    is_aug1 = (months == 8) & (days == 1)
    is_aug2 = (months == 8) & (days == 2)

    # The value recorded on 1 August never enters the total.
    contributions = s.astype(float).where(~is_aug1, 0.0)

    # Both 1 August and 2 August open a new accumulation block: the first
    # holds the pinned 0.0, the second restarts from the 2 August value even
    # when 1 August is absent from the index.
    block_id = (is_aug1 | is_aug2).cumsum()

    totals = contributions.groupby(block_id).cumsum()
    totals.name = "cumulative"
    return totals


# Seasonal running total of the (already zero-patched) heat units.
data["cumul_heat_units"] = cumulative(data["heat_units"])

In [41]:
# Daily heat units (bars, left axis) together with the cumulative seasonal
# total (line, right axis), both against the date.
fig = plt.figure(figsize=(8, 3))
ax1 = fig.add_subplot(1, 1, 1)

color = "blue"
ax1.set(title="GDD versus Time", xlabel="Date")
ax1.set_ylabel("Heat Units (GDD)", color=color)
pl1 = ax1.bar(data.index, data["heat_units"], color=color, label="Heat Units")
ax1.tick_params(axis="y", labelcolor=color)
ax1.axvline(x=datetime.datetime(2018, 8, 1), color="red", linestyle="--",
            alpha=0.6, label="2018-08-01")

# Second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
color = "green"
ax2.set_ylabel("Cumulative GDD", color=color)
pl2 = ax2.plot(data.index, data["cumul_heat_units"], color=color,
               label="Cumulative GDD", linewidth=1)
ax2.tick_params(axis="y", labelcolor=color)

# Merge the legend entries of both axes into a single legend box.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0)

fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

As can be seen in the above plot, there are many gaps due to faulty weather station data.  Therefore the green curve representing cumulative growing-degree-days is also affected and not accurate.

# `"etcp"` versus time

Notice that `"etcp"` belongs to column J in the Excel file.  
`"etcp"` is defined as the difference between consecutive `"profile"` readings:

$$
\mathrm{ET}_{cp}(t) = \mathrm{Profile}(t) - \mathrm{Profile}(t-1)
$$

We are mostly interested in the $\mathrm{ET}_{cp}$ entries for which $\mathrm{ET}_{cp} < 0$.  These negative entries reflect incidents in which water was lost from the soil due to: (1) water drainage, (2) luxurious water uptake, (3) normal water uptake, and (4) drought stress.

We are not interested in dates in which $\mathrm{ET}_{cp}$ is perturbed by irrigation and/or rain.  For such dates where $\mathrm{ET}_{cp}$ is perturbed, we expect $\mathrm{ET}_{cp} \ge 0$.  Therefore, data entries corresponding to $\mathrm{ET}_{cp} \ge 0$ are flagged.  Furthermore, all $\mathrm{ET}_{cp} \ge 0$ entries are set to `NaN` values (Not a Number).

In [42]:
# Non-negative etcp means the profile did not drop (rain/irrigation
# perturbed the reading), so those dates are flagged with bin_value=1.
condition = data["etcp"].ge(0.0)
bad_etcp_days = data.loc[condition].index

flagger(bad_dates=bad_etcp_days, brief_desc=ETCP_POS_DESC, bin_value=1)
reporter(brief_desc=ETCP_POS_DESC)

33.3% of data is affected due to [Etcp is positive].


Let us set the $\mathrm{ET}_{cp}$ values of _all_ the dates that are flagged in the `df_flag` DataFrame (i.e. `binary_value == 1`), for whatever reason, to `np.nan`.

In [43]:
# Every date whose flag is raised (binary_value == 1), whatever the reason,
# loses its etcp value.
condition = df_flag["binary_value"].eq(1)
flagged_dates = df_flag.loc[condition].index

data.loc[flagged_dates, "etcp"] = np.nan
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 550 entries, 2017-08-01 to 2019-02-01
Data columns (total 19 columns):
heat_units              550 non-null float64
rain                    550 non-null float64
erain                   550 non-null float64
total_irrig             550 non-null float64
tot_eff_irrig           550 non-null float64
etc                     256 non-null float64
ety                     550 non-null float64
eto                     545 non-null float64
etcp                    304 non-null float64
rzm_source              550 non-null object
profile                 530 non-null float64
original_unit_system    550 non-null object
calendar_week           550 non-null int64
eto_diff1               549 non-null float64
eto_diff2               548 non-null float64
profile_difference      517 non-null float64
hu_diff1                549 non-null float64
hu_diff2                548 non-null float64
cumul_heat_units        550 non-null float64
dtypes: float64(16), int6

In [44]:
# Summary statistics of the numeric columns; after the masking above, the
# etcp column is expected to contain negative values only (check its max).
data.describe()

Unnamed: 0,heat_units,rain,erain,total_irrig,tot_eff_irrig,etc,ety,eto,etcp,profile,calendar_week,eto_diff1,eto_diff2,profile_difference,hu_diff1,hu_diff2,cumul_heat_units
count,550.0,550.0,550.0,550.0,550.0,256.0,550.0,545.0,304.0,530.0,550.0,549.0,548.0,517.0,549.0,548.0,550.0
mean,3.072909,3.281091,1.027455,1.547091,1.107255,1.419453,6.040182,4.589706,-3.669507,200.761208,29.274545,-0.000182,-0.001661,0.091489,0.023497,0.043066,475.74773
std,4.361954,13.083497,4.062032,3.186438,2.259269,0.982894,12.935759,2.021064,5.415617,53.255505,15.800414,1.09477,1.361479,12.089465,2.198385,3.129725,372.521795
min,0.0,0.0,0.0,0.0,0.0,0.07,0.08,0.71,-32.58,95.67,1.0,-5.38,-5.26,-71.49,-9.75,-14.2,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.4775,0.7625,2.65,-3.4625,140.59,15.25,0.0,0.0,-2.38,0.0,-0.05,94.312501
50%,0.0,0.0,0.0,0.0,0.0,1.37,1.49,4.4,-1.485,228.53,33.0,0.0,0.0,-0.88,0.0,0.0,462.475004
75%,6.137501,0.2,0.0,0.0,0.0,2.0375,4.55,6.16,-0.805,243.8225,43.0,0.09,0.0925,0.76,0.2,0.5625,914.300005
max,17.650001,157.2,25.0,10.7,8.0,3.68,76.31,8.9,-0.03,301.62,52.0,5.26,5.26,74.49,7.4,10.149999,921.000005


At this stage, we are now only left with the $\mathrm{ET}_{cp} < 0$ entries (i.e. entries associated with water drainage, luxurious/normal water uptake, and drought-stress).  For simplicity, we multiply these remaining $\mathrm{ET}_{cp}$ values with -1 so that henceforth we only work with positive values of $\mathrm{ET}_{cp}$ (which is a little bit more convenient for programming purposes).

In [45]:
# All remaining etcp values are negative at this point (non-negative ones were
# flagged and set to NaN above), so taking the absolute value is the same as
# multiplying by -1.  Unlike the multiplication, it is idempotent: re-running
# this cell cannot flip the sign back.
data["etcp"] = data["etcp"].abs()

In [46]:
# Scatter plot of the etcp values that survived the flagging so far.
fig, ax = plt.subplots()
fig.set_size_inches(8.6, 3)

# Raw strings keep the LaTeX backslashes literal; "\m" inside a normal string
# literal is an invalid escape sequence and triggers a warning.
ax.scatter(data.index, data["etcp"], color="red", label=r"Remaining $\mathrm{ET}_{cp}$",
           marker="o", s=10, edgecolors="black", linewidth=1, alpha=0.6)
ax.set_xlabel("Date")
ax.set_ylabel(r"$\mathrm{ET}_{cp}$")
ax.set_title(r"$\mathrm{ET}_{cp}$ versus time")
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", alpha=0.6, ls="--", label="2018-08-01")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

# Remove $\mathrm{ET}_{cp}$ outliers

As can be seen in the figure above, there are some outliers still present in the remaining $\mathrm{ET}_{cp}$ dataset.

These outliers are most likely associated with phases of water-drainage and luxurious water uptake.  We, on the other hand, are only interested in phases corresponding to normal water uptake.

In [47]:
# Upper-tail quantiles of the remaining etcp values, used to judge where the
# outliers start.
s = data["etcp"].quantile(
    q=[
        0.50, 0.60, 0.70, 0.75,
        0.80, 0.85, 0.90, 0.91,
        0.92, 0.94, 0.95, 0.96,
        0.97, 0.98, 0.99, 1.0,
    ]
)
s

0.50     1.4850
0.60     2.0200
0.70     2.9420
0.75     3.4625
0.80     4.3900
0.85     7.2735
0.90    10.8340
0.91    11.0819
0.92    13.2188
0.94    14.8826
0.95    15.7325
0.96    17.9704
0.97    18.2200
0.98    20.3698
0.99    25.5446
1.00    32.5800
Name: etcp, dtype: float64

From an educated guess, let us accept a maximum $\mathrm{ET}_o$ of 12.0 mm.  Let us make another educated guess and allow for a maximum $k_{cp}$ value of 0.8 (of course, these educated guesses vary from cultivar to cultivar).  This implies that the maximum allowed value for $\mathrm{ET}_{cp}$ is as follows:

$$
\mathrm{max}(\mathrm{ET}_{cp}) = 0.8 \times 12 = 9.6\,\mathrm{mm}
$$

Consequently, we flag all $\mathrm{ET}_{cp} > 9.6\, \mathrm{mm}$ data entries.

Let us perform a filtering process to get rid of all $\mathrm{ET}_{cp}$ values higher than $9.6\,\mathrm{mm}$:

In [48]:
# ETCP_MAX (= ETO_MAX * KCP_MAX) is the largest etcp value that is still
# accepted, so only values strictly above it are outliers.  This matches the
# text above ("flag all ETcp > 9.6 mm") and the strict comparison used for
# the luxurious-uptake check in the next section.
condition = data["etcp"] > ETCP_MAX
bad_high_etcp_dates = data[condition].index

# Blank the outliers, then record why the dates were dropped.
data.loc[bad_high_etcp_dates, ["etcp"]] = np.nan

flagger(bad_dates=bad_high_etcp_dates, brief_desc=ETCP_OUTLIERS_DESC, bin_value=1)
reporter(brief_desc=ETCP_OUTLIERS_DESC)

7.1% of data is affected due to [Etcp outliers].


# Remove dates for which luxurious water uptake is present

But remember, we only tolerate $\mathrm{ET}_{cp}$ values for which $k_{cp} \le 0.8$.  Therefore, to ensure that all luxurious water-uptake phases are completely flagged, we perform another flagging operation that flags data entries for which $\mathrm{ET}_{cp} > 0.8\cdot\mathrm{ET}_o$ (for extra insurance).

In [49]:
# Luxurious uptake: etcp exceeds KCP_MAX times the reference eto of that day.
# Rows where either side is NaN compare as False and are left untouched.
condition = data["etcp"].gt(data["eto"] * KCP_MAX)
luxurious_dates = data.loc[condition].index

data.loc[luxurious_dates, "etcp"] = np.nan

flagger(bad_dates=luxurious_dates, brief_desc=LUX_DESC, bin_value=1)
reporter(brief_desc=LUX_DESC, remaining=True)

6.2% of data is affected due to [Luxurious water uptake].
After all the flagging that has taken place in this entire notebook, only 42% of your data is useful.


# Evaluate the remaining $\mathrm{ET}_{cp}$ entries which are _hopefully_ valid data points

In [50]:
# Dates whose flag is still 0, i.e. not discarded by any check so far.
condition = df_flag["binary_value"].eq(0)
useful_dates = data.loc[condition].index

In [51]:
# Reference eto and remaining etcp on the dates that are still unflagged.
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

# Select each column as a Series (1-D) rather than a one-column DataFrame,
# so the y-values have the same shape as the x-values.  Raw strings keep the
# LaTeX backslashes literal ("\m" is an invalid escape in a normal string).
ax.scatter(useful_dates, data.loc[useful_dates, "eto"], color="green", label=r"$\mathrm{ET}_o$",
           marker="o", s=10, edgecolors="black", linewidth=1, alpha=0.6)
ax.scatter(useful_dates, data.loc[useful_dates, "etcp"], color="red", label=r"$\mathrm{ET}_{cp}$",
           marker="s", s=10, edgecolors="black", linewidth=1, alpha=0.6)
ax.set_xlabel("Date")
ax.set_ylabel(r"$\mathrm{ET}_o$ and $\mathrm{ET}_{cp}$")
ax.set_title(r"$\mathrm{ET}_o$ and $\mathrm{ET}_{cp}$ versus Time")
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", alpha=0.6, ls="--", label="2018-08-01")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

# How must we flag events of drought-stress?

One line of reasoning is that during phases of drought-stress we expect very small changes in the water-balance "Profile" readings.  According to this logic, we expect for drought-stress that:

$$
\mathrm{Profile}(t) - \mathrm{Profile}(t-1) = \mathrm{ET}_{cp}(t) < \varepsilon
$$

where $\varepsilon$ is a relatively small value, such as, for example, $\varepsilon = 0.1$.

It was suggested that we evaluate $k_{cp}$ and compare it to the accepted norm (on file).  If $k_{cp}$ deviates from the norm by more than $\pm 50\%$, then flagging should take place.

# Rough calculation of $k_{cp}$

Basically, after all the necessary flagging has been carried out, we can proceed to calculate $k_{cp}$ as follows:

$$
k_{cp} = \frac{\mathrm{ET}_{cp}}{\mathrm{ET}_o}
$$

In the plot below, we show the calculated $k_{cp}$ for entries containing both valid $\mathrm{ET}_{cp}$ and $\mathrm{ET}_o$ entries.

In [59]:
# kcp = etcp / eto, row by row; the result is NaN wherever either input is NaN.
data["kcp"] = data["etcp"] / data["eto"]
# Number of dates for which a kcp value could be computed.
print(data["kcp"].count())

fig, ax = plt.subplots(figsize=(8, 3))

ax.scatter(
    data.index,
    data["kcp"],
    color="purple",
    label="$k_{cp}$",
    marker="D",
    s=10,
    edgecolors="black",
    linewidth=1,
    alpha=0.6,
)
ax.set(xlabel="Date", ylabel="$k_{cp}$", title="$k_{cp}$ versus time")
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", alpha=0.6, linestyle="--", label="2018-08-01")
ax.legend()
plt.show()

231


<IPython.core.display.Javascript object>

# Flag $k_{cp}$ values that deviate by more than +/- 50% from the accepted norm:

This is achieved with the help of the `calculate_kcp_deviation` and `flagger` functions defined above.

In [53]:
# A date is rejected when its percentage deviation is missing (NaN) or
# larger than 50.
perc_series = calculate_kcp_deviation(data)
condition = perc_series.isnull() | perc_series.gt(50)
bad_calc_kcp_dates = data.loc[condition].index

# Blank the rejected kcp values, then record the reason in the flag table.
data.loc[bad_calc_kcp_dates, "kcp"] = np.nan
flagger(bad_dates=bad_calc_kcp_dates, brief_desc=BAD_KCP_DESC, bin_value=1)
reporter(brief_desc=BAD_KCP_DESC, remaining=True)

89.5% of data is affected due to [Unacceptable kcp].
After all the flagging that has taken place in this entire notebook, only 11% of your data is useful.


# Replot kcp versus time.  Evaluate after bad kcp values have been flagged.

In [54]:
# kcp values that are left after the deviation-based flagging.
fig, ax = plt.subplots(figsize=(8, 3))

ax.scatter(
    data.index,
    data["kcp"],
    color="gold",
    label="Remaining $k_{cp}$",
    marker="D",
    s=10,
    edgecolors="black",
    linewidth=1,
    alpha=0.6,
)
ax.set(xlabel="Date", ylabel="$k_{cp}$", title="$k_{cp}$ versus time")
ax.axvline(x=datetime.datetime(2018, 8, 1), color="blue", alpha=0.6, linestyle="--", label="2018-08-01")
ax.legend()
plt.show()

<IPython.core.display.Javascript object>

# Merge the original `data` DataFrame with the `df_flag` DataFrame

In [55]:
# Put the measurements and their flags side by side, keeping only the dates
# present in both frames.
master_data = pd.concat(objs=[data, df_flag], axis="columns", join="inner")
master_data.head(100)

Unnamed: 0_level_0,heat_units,rain,erain,total_irrig,tot_eff_irrig,etc,ety,eto,etcp,rzm_source,...,eto_diff1,eto_diff2,profile_difference,hu_diff1,hu_diff2,cumul_heat_units,kcp,kcp_perc_deviation,binary_value,description
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-01,0.000000,0.0,0.0,0.0,0.00,0.82,0.68,6.80,,Electronic Probe,...,,,,,,0.000000,,,1,Etcp is positive. Unacceptable kcp.
2017-08-02,0.000000,1.0,0.0,0.0,0.00,,0.68,2.50,,Electronic Probe,...,0.00,,3.18,0.000000,,0.000000,,,1,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-03,0.000000,0.2,0.0,0.0,0.00,,0.68,2.50,1.30,Electronic Probe,...,0.00,0.00,-1.30,0.000000,0.000000,0.000000,0.520000,28.767123,0,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-04,0.000000,0.0,0.0,0.0,0.00,,0.68,2.50,0.46,Electronic Probe,...,0.00,0.00,-0.46,0.000000,0.000000,0.000000,,74.794521,1,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-05,0.000000,0.0,0.0,0.0,0.00,,0.46,2.50,0.51,Electronic Probe,...,0.00,0.00,-0.51,0.000000,0.000000,0.000000,,72.054795,1,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-06,0.000000,0.0,0.0,0.0,0.00,,0.48,2.50,0.24,Electronic Probe,...,0.00,0.00,-0.24,0.000000,0.000000,0.000000,,86.849315,1,Imputed eto. Stuck etc due to stuck eto. Heat ...
2017-08-07,0.800000,0.0,0.0,0.0,0.00,,0.40,2.65,0.31,Electronic Probe,...,0.00,0.00,-0.31,0.800000,0.800000,0.800000,,83.975187,1,Imputed eto. Stuck etc due to stuck eto. Unacc...
2017-08-08,2.750000,0.0,0.0,0.0,0.00,,0.38,2.65,0.24,Electronic Probe,...,0.00,0.00,-0.24,1.950000,2.750000,3.550000,,87.593693,1,Imputed eto. Stuck etc due to stuck eto. Unacc...
2017-08-09,1.250000,0.0,0.0,0.0,0.00,,0.32,2.65,0.50,Electronic Probe,...,0.00,0.00,-0.50,-1.500000,0.450000,4.800000,,74.153528,1,Imputed eto. Stuck etc due to stuck eto. Unacc...
2017-08-10,0.000000,0.0,0.0,0.0,0.00,,0.32,2.65,0.29,Electronic Probe,...,0.00,0.00,-0.30,-1.250000,-2.750000,4.800000,,85.009046,1,Imputed eto. Stuck etc due to stuck eto. Unacc...


In [56]:
# Number of non-null entries in each column of the merged frame.
master_data.count()

heat_units              550
rain                    550
erain                   550
total_irrig             550
tot_eff_irrig           550
etc                     256
ety                     550
eto                     545
etcp                    231
rzm_source              550
profile                 530
original_unit_system    550
calendar_week           550
eto_diff1               549
eto_diff2               548
profile_difference      517
hu_diff1                549
hu_diff2                548
cumul_heat_units        550
kcp                      58
kcp_perc_deviation      231
binary_value            550
description             550
dtype: int64

# Only retain month and day; discard year.  Replot $k_{cp}$.

In [57]:
# Collapse every unflagged date onto one reference season so that different
# years overlay on a shared month/day axis: August-December are placed in
# `starting_year`, January-July in the year after it.
condition = master_data["binary_value"] == 0
useful_dates = master_data[condition].index
starting_year = useful_dates[0].year

new_dates = []
for d in useful_dates:
    target_year = starting_year if 8 <= d.month <= 12 else starting_year + 1
    try:
        new_dates.append(datetime.datetime(year=target_year, month=d.month, day=d.day))
    except ValueError:
        # Only 29 February can fail here: it has no counterpart when the
        # target year is not a leap year, so it is plotted on 28 February.
        new_dates.append(datetime.datetime(year=target_year, month=2, day=28))

In [58]:
import matplotlib.dates as mdates
from matplotlib.ticker import MaxNLocator

# Remaining kcp values plotted on the collapsed (single-season) date axis.
fig, ax = plt.subplots()
fig.set_size_inches(8, 3)

ax.scatter(new_dates, master_data.loc[useful_dates, "kcp"], color="gold",
           label="Remaining $k_{cp}$", marker="D", s=10, edgecolors="black",
           linewidth=1, alpha=0.6)
ax.set_xlabel("Month")
ax.set_ylabel("$k_{cp}$")
ax.set_title("$k_{cp}$ versus time")

# One major tick per month.  This replaces the former
# `plt.locator_params(axis="x", n_bins=12)` call, which misspelled `nbins`
# and ran before the date data installed its own locator, so it never took
# effect.  The locator is set after plotting so that it is not overridden.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b/%d'))
ax.set_xlim(left=datetime.datetime(year=starting_year, month=8, day=1),
            right=datetime.datetime(year=starting_year+1, month=7, day=31))
ax.legend()
fig.autofmt_xdate()
plt.show()

<IPython.core.display.Javascript object>