## pH Mapping

- W. Mutatu
- A. Marungudze
- S. G. Mampengesi

In [26]:
import pandas as pd
# Read the 2023 soil-analysis workbook into a DataFrame.
data = pd.read_excel(io="Estates Growers & ZSAES Soil 2023.xls")
# List every column label so the ones needed for the pH map can be picked out.
data.columns

Index(['Date Received', 'Unnamed: 1', 'Remarks', 'Lab Ref ', 'Batch',
       'SOURCE/OWNER', 'farm /sec', 'Field #', 'Plot #', 'Rep #', 'Trial #',
       'Sampling Date', 'Depth (cm)', 'Colour', 'Texture', 'Clay (%)',
       'Silt (%)', 'Sand (%)', 'OM (%)', 'pH (CaCl2)', 'EC (uS/cm)',
       'P2O5 (ppm)', 'K (ppm)', 'Ca (ppm)', 'Mg (ppm)', 'Na (ppm)', 'SO4',
       'Initial N', 'Incubated N', 'Paste EC', 'Extract EC', 'Percentage',
       'K me/L', 'Ca me/L', 'Mg me/L', 'Na me/L', 'Cu (ppm)', 'Zn (ppm)',
       'Fe (ppm)', 'Mn (ppm)', 'Al (ppm)', 'TOTAL N', 'TOTAL N.1', 'K (me/L)',
       'Ca (me/L)', 'Mg (me/L)', 'Na (me/L)', '%', 'N(initial)',
       'N(incubated)'],
      dtype='object')

In [27]:
# Narrow the frame to the sample identifiers plus the pH reading, and make
# sure pH is stored as a float. Note that the 'Lab Ref ' label carries a
# trailing space in the workbook.
data = (
    data[['Lab Ref ', 'SOURCE/OWNER', 'farm /sec', 'Field #', 'pH (CaCl2)']]
    .astype({'pH (CaCl2)': float})
)
data.head()

Unnamed: 0,Lab Ref,SOURCE/OWNER,farm /sec,Field #,pH (CaCl2)
0,HS01,HVE,25,2504 A,6.13
1,HS02,HVE,8,834 A,7.1
2,HS03,HVE,8,834 E,7.04
3,HS04,HVE,22,2223 A,7.09
4,HS05,HVE,2,235,6.99


In [28]:
# Summarise the dtype and non-null count of each retained column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5648 entries, 0 to 5647
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Lab Ref       5644 non-null   object 
 1   SOURCE/OWNER  5641 non-null   object 
 2   farm /sec     5376 non-null   object 
 3   Field #       5068 non-null   object 
 4   pH (CaCl2)    5600 non-null   float64
dtypes: float64(1), object(4)
memory usage: 220.8+ KB


In [29]:
# Number of distinct values recorded in the SOURCE/OWNER column.
data['SOURCE/OWNER'].nunique()

326

In [30]:
# Label every sample with a zone based on the two-letter prefix of its lab
# reference: Triangle (TRI), Hippo Valley (HVE), Mkwasine (MKW) or ZSAES.
# Any other reference -- including a missing one, whose str() is 'nan' --
# is labelled 'Private'.
zone_by_prefix = {'TS': 'TRI', 'HS': 'HVE', 'MS': 'MKW', 'ZS': 'ZSAES'}
cats = [
    zone_by_prefix.get(str(lab_ref)[:2], 'Private')
    for lab_ref in data['Lab Ref ']
]

In [31]:
# Attach the zone labels (one per row, in row order) as a new 'zone' column.
data = data.assign(zone=cats)
data.head()

Unnamed: 0,Lab Ref,SOURCE/OWNER,farm /sec,Field #,pH (CaCl2),zone
0,HS01,HVE,25,2504 A,6.13,HVE
1,HS02,HVE,8,834 A,7.1,HVE
2,HS03,HVE,8,834 E,7.04,HVE
3,HS04,HVE,22,2223 A,7.09,HVE
4,HS05,HVE,2,235,6.99,HVE


In [32]:
# Number of samples assigned to each zone label.
data['zone'].value_counts()

zone
HVE        2239
ZSAES      1493
TRI        1146
MKW         561
Private     209
Name: count, dtype: int64

In [33]:
# Count the missing values in each column.
data.isna().sum()

Lab Ref           4
SOURCE/OWNER      7
farm /sec       272
Field #         580
pH (CaCl2)       48
zone              0
dtype: int64

In [34]:
# Keep only the samples that have both a pH reading and a farm/section value;
# rows missing either one are discarded.
data = data[data[['pH (CaCl2)', 'farm /sec']].notna().all(axis=1)]


In [35]:
# Re-check row count and non-null counts after dropping incomplete rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5334 entries, 0 to 5644
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Lab Ref       5334 non-null   object 
 1   SOURCE/OWNER  5332 non-null   object 
 2   farm /sec     5334 non-null   object 
 3   Field #       4782 non-null   object 
 4   pH (CaCl2)    5334 non-null   float64
 5   zone          5334 non-null   object 
dtypes: float64(1), object(5)
memory usage: 291.7+ KB


In [36]:
# Exclude the samples that were labelled 'Private'.
data = data.loc[data['zone'].ne('Private')]

In [37]:
# Mean pH of the samples in each zone.
grouped = data.groupby('zone')['pH (CaCl2)'].mean()

In [38]:
# Display the per-zone mean pH values.
grouped.head()

zone
HVE      7.019249
MKW      6.463987
TRI      6.737655
ZSAES    6.626129
Name: pH (CaCl2), dtype: float64

In [39]:
# Write the per-zone mean pH to an Excel workbook in the working directory.
grouped.to_excel('summarised_by_zone.xlsx')

In [40]:
# Re-inspect the frame before building the per-section grouping
# (zone combined with 'farm /sec') in the following cells.
data.head()

Unnamed: 0,Lab Ref,SOURCE/OWNER,farm /sec,Field #,pH (CaCl2),zone
0,HS01,HVE,25,2504 A,6.13,HVE
1,HS02,HVE,8,834 A,7.1,HVE
2,HS03,HVE,8,834 E,7.04,HVE
3,HS04,HVE,22,2223 A,7.09,HVE
4,HS05,HVE,2,235,6.99,HVE


In [41]:
# to be removed when adding individual farm data
# Convert 'farm /sec' to numbers; because of errors='coerce', every value that
# cannot be parsed as a number becomes NaN instead of raising.
data = data.assign(
    **{'farm /sec': pd.to_numeric(data['farm /sec'], errors='coerce')}
)

In [42]:
# Build a combined "<zone> - <farm/sec>" label for each sample. The section is
# rendered with str(), so numeric sections appear as e.g. '25.0' and missing
# ones as 'nan'.
data = data.assign(
    zone_1=data['zone'].str.cat(data['farm /sec'].astype(str), sep=' - ')
)
data.head()

Unnamed: 0,Lab Ref,SOURCE/OWNER,farm /sec,Field #,pH (CaCl2),zone,zone_1
0,HS01,HVE,25.0,2504 A,6.13,HVE,HVE - 25.0
1,HS02,HVE,8.0,834 A,7.1,HVE,HVE - 8.0
2,HS03,HVE,8.0,834 E,7.04,HVE,HVE - 8.0
3,HS04,HVE,22.0,2223 A,7.09,HVE,HVE - 22.0
4,HS05,HVE,2.0,235,6.99,HVE,HVE - 2.0


In [43]:
# Number of samples per combined zone/section label.
# Stored under a descriptive name: binding the result to `all` would shadow
# Python's builtin all() for every cell executed afterwards in this kernel.
zone_1_counts = data['zone_1'].value_counts()
print(zone_1_counts)

zone_1
ZSAES - nan     1399
MKW - nan        451
HVE - nan        334
HVE - 8.0        271
TRI - nan        192
                ... 
HVE - 308.0        1
HVE - 41.0         1
TRI - 5.0          1
MKW - 610.0        1
HVE - 6144.0       1
Name: count, Length: 78, dtype: int64


In [44]:
# grouped_2 = data.groupby('zone_1')['pH (CaCl2)'].agg(['mean', 'min', 'max'])

# # min and max pH values
# min_zone = data.loc[data['pH (CaCl2)'].idxmin(), 'zone_1']
# max_zone = data.loc[data['pH (CaCl2)'].idxmax(), 'zone_1']

# print(grouped_2.head())
# print(f"Zone with minimum pH: {min_zone}")
# print(f"Zone with maximum pH: {max_zone}")


In [45]:
# One groupby over the combined zone/section label, reused for every statistic.
ph_by_label = data.groupby('zone_1')['pH (CaCl2)']

# Mean, lowest and highest pH per label, with the label as a regular column.
grouped_2 = ph_by_label.agg(['mean', 'min', 'max']).reset_index()

# idxmin/idxmax return the row label of the sample holding the extreme pH in
# each group; use them to look up that sample's 'Field #'. The looked-up field
# is stored under the column names 'sect_pH_min' / 'sect_pH_max'.
sect_pH_min = (
    data.loc[ph_by_label.idxmin(), ['zone_1', 'Field #']]
    .rename(columns={'Field #': 'sect_pH_min'})
)
sect_pH_max = (
    data.loc[ph_by_label.idxmax(), ['zone_1', 'Field #']]
    .rename(columns={'Field #': 'sect_pH_max'})
)

# Attach the two field columns to the statistics, keeping every label row.
for extreme_fields in (sect_pH_min, sect_pH_max):
    grouped_2 = grouped_2.merge(extreme_fields, on='zone_1', how='left')

print(grouped_2)


          zone_1      mean   min   max sect_pH_min sect_pH_max
0     HVE - 10.0  7.540548  6.74  8.83        1054        1016
1     HVE - 11.0  6.458000  5.15  7.88        1147       1118B
2     HVE - 12.0  6.825345  5.52  7.79        1213       H1257
3     HVE - 14.0  6.573418  5.51  7.83        1454       1408K
4    HVE - 140.0  6.453333  5.73  7.68       H1456        1401
..           ...       ...   ...   ...         ...         ...
73   ZSAES - 1.0  6.560000  6.23  7.38        5158        5158
74  ZSAES - 14.0  7.261667  6.58  7.64         NaN         NaN
75  ZSAES - 18.0  7.481774  6.39  8.43        1831        1831
76  ZSAES - 20.0  6.405000  5.83  7.30        2002        2006
77   ZSAES - nan  6.586305  0.30  8.72        L1 B         NaN

[78 rows x 6 columns]


In [46]:
# Write the per-label pH summary to an Excel workbook in the working directory.
# NOTE(review): the frame's index is written too (no index=False is passed);
# confirm whether that extra column is wanted in the export.
grouped_2.to_excel("Grouped 2 2023.xlsx")
