# **Dairy farm code**


# Data Understanding

#### Imports

In [1]:
import pandas as pd

In [2]:
# Import the CSV file into a DataFrame
farm_df = pd.read_csv("Extended_Groundwater_Dataset.csv")
farm_df.head()  # Display the first few rows of the DataFrame

Unnamed: 0,date,pH,electrical_conductivity_dS_m,nitrate_input_mg_L,fertilizer_kg_ha,manure_kg_ha,ammonium_mg_L,chloride_mg_L,crop_type,livestock_density_LU_ha,...,Sodium (NA),Ammonium (NH4-N),Nitrate (NO3-N),Total Phosphorus (PTOT-P),Phosphate (PO4-P),Lead (PB),Zinc (ZN),sunlight_hours,humidity_percent,temperature_celsius
0,1979-11-05,6.05,0.77,0.0,276.08,142.23,7.28,68.18,maize,3.18,...,,,,,,,,5.76,87.5,10.58
1,1979-11-05,6.34,1.12,47.82,209.28,112.23,4.02,56.88,grass,3.85,...,,,,,,,,5.72,78.79,11.72
2,1979-11-05,5.51,1.38,37.02,237.73,144.38,2.69,33.43,potatoes,1.7,...,70.141,40.766,0.029,4.693,,0.2,4.0,3.42,89.31,11.85
3,1979-11-05,4.92,1.04,63.87,107.53,72.44,1.41,47.53,grass,1.23,...,,,,,,,,5.68,83.99,12.7
4,1979-11-05,6.76,1.16,7.26,179.88,96.18,7.59,59.12,maize,0.97,...,,,,,,,,3.71,79.97,10.46


In [3]:
farm_df.isnull().sum()

date                                 0
pH                                   0
electrical_conductivity_dS_m         0
nitrate_input_mg_L                   0
fertilizer_kg_ha                     0
manure_kg_ha                         0
ammonium_mg_L                        0
chloride_mg_L                        0
crop_type                            0
livestock_density_LU_ha              0
urbanization                         0
soil_type                            0
aquifer_type                         0
permeability_m_day                   0
groundwater_level_cm                 0
precipitation_mm                     0
drought_days                         0
evaporation_mm                       0
nitrate_groundwater_mg_L             0
Aluminum (AL)                    74440
Arsenic (AS)                     74747
Barium (BA)                      74503
Calcium (CA)                     74440
Cadmium (CD)                     74440
Chloride (CL)                    74494
Copper (CU)              

In [4]:
unique_values_per_column = farm_df.nunique()

# Print results
print(unique_values_per_column)

date                            16018
pH                                311
electrical_conductivity_dS_m      201
nitrate_input_mg_L               5349
fertilizer_kg_ha                 7764
manure_kg_ha                     7193
ammonium_mg_L                    1070
chloride_mg_L                    5602
crop_type                           5
livestock_density_LU_ha           501
urbanization                        3
soil_type                           4
aquifer_type                        2
permeability_m_day                991
groundwater_level_cm             6644
precipitation_mm                 6203
drought_days                       60
evaporation_mm                   4799
nitrate_groundwater_mg_L         2519
Aluminum (AL)                     490
Arsenic (AS)                      902
Barium (BA)                      1426
Calcium (CA)                     1281
Cadmium (CD)                      222
Chloride (CL)                    1418
Copper (CU)                       263
Electrical C

In [5]:
# Here we check which rows have data missing in them. This helps decide which to remove
missing_counts = farm_df.isna().sum()
total_counts = farm_df.shape[0]  # Total number of rows

result = pd.DataFrame({
    'Missing Values': missing_counts,
    'Total Values': total_counts,
    'Missing Percentage': (missing_counts / total_counts * 100).round(2)
})

print(result)



                              Missing Values  Total Values  Missing Percentage
date                                       0        144021                0.00
pH                                         0        144021                0.00
electrical_conductivity_dS_m               0        144021                0.00
nitrate_input_mg_L                         0        144021                0.00
fertilizer_kg_ha                           0        144021                0.00
manure_kg_ha                               0        144021                0.00
ammonium_mg_L                              0        144021                0.00
chloride_mg_L                              0        144021                0.00
crop_type                                  0        144021                0.00
livestock_density_LU_ha                    0        144021                0.00
urbanization                               0        144021                0.00
soil_type                                  0        

In [6]:
# Add time-based features
farm_df['date'] = pd.to_datetime(farm_df['date'])
farm_df['month'] = farm_df['date'].dt.month
farm_df['year'] = farm_df['date'].dt.year
farm_df['week'] = farm_df['date'].dt.isocalendar().week

In [9]:
# Group by 'year' and calculate missing values, total values, and missing percentage
missing_per_year = farm_df.groupby('date',).apply(lambda group: pd.DataFrame({
    'Missing Values': group.isna().sum(),
    'Total Values': len(group),
    'Missing Percentage': (group.isna().sum() / len(group) * 100).round(2)
}))

print(missing_per_year)


                                         Missing Values  Total Values  \
date                                                                    
1979-11-05 date                                       0            12   
           pH                                         0            12   
           electrical_conductivity_dS_m               0            12   
           nitrate_input_mg_L                         0            12   
           fertilizer_kg_ha                           0            12   
...                                                 ...           ...   
2023-09-13 humidity_percent                           0            12   
           temperature_celsius                        0            12   
           month                                      0            12   
           year                                       0            12   
           week                                       0            12   

                                         Missing P

  missing_per_year = farm_df.groupby('date',).apply(lambda group: pd.DataFrame({


In [10]:
unique_dates_per_year = farm_df.groupby('year')['date'].nunique()

print(unique_dates_per_year)

year
1979     57
1980    366
1981    365
1982    365
1983    365
1984    366
1985    365
1986    365
1987    365
1988    366
1989    365
1990    365
1991    365
1992    366
1993    365
1994    365
1995    365
1996    366
1997    365
1998    365
1999    365
2000    366
2001    365
2002    365
2003    365
2004    366
2005    365
2006    364
2007    365
2008    366
2009    365
2010    365
2011    365
2012    366
2013    365
2014    365
2015    365
2016    366
2017    365
2018    365
2019    365
2020    366
2021    365
2022    365
2023    256
Name: date, dtype: int64
