In [1]:
# 📦 Importing core data handling and visualization libraries
import pandas as pd                          # For loading and manipulating data (DataFrames)
import matplotlib.pyplot as plt              # For creating basic static plots
import seaborn as sns                        # For more advanced and stylish plots

# 🤖 Importing ML tools from scikit-learn
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.ensemble import RandomForestClassifier   # Random Forest - the classification algorithm we'll use
from sklearn.multioutput import MultiOutputClassifier # To handle multiple output labels (e.g., multiple pumps)

from sklearn.metrics import classification_report     # For evaluating model performance with precision, recall, etc.

# 🛠️ Utility tools
from sklearn.preprocessing import MinMaxScaler        # To normalize/scale features so all values are in the same range
import joblib                                         # To save and load ML models (for later use or deployment)


In [2]:
# 📄 Read the irrigation dataset from CSV into a pandas DataFrame.
# The path is relative, so the file is looked up from the current working
# directory — adjust the file name/path here if yours differs.
df = pd.read_csv(filepath_or_buffer="irrigation_machine.csv")

In [3]:
# 👀 Preview the first 5 rows to get a feel for the columns and their values.
# (df.tail(n=5) would show the last 5 rows instead.)
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,parcel_0,parcel_1,parcel_2
0,0,1.0,2.0,1.0,7.0,0.0,1.0,1.0,4.0,0.0,...,8.0,1.0,0.0,2.0,1.0,9.0,2.0,0,1,0
1,1,5.0,1.0,3.0,5.0,2.0,2.0,1.0,2.0,3.0,...,4.0,5.0,5.0,2.0,2.0,2.0,7.0,0,0,0
2,2,3.0,1.0,4.0,3.0,4.0,0.0,1.0,6.0,0.0,...,3.0,3.0,1.0,0.0,3.0,1.0,0.0,1,1,0
3,3,2.0,2.0,4.0,3.0,5.0,0.0,3.0,2.0,2.0,...,4.0,1.0,1.0,4.0,1.0,3.0,2.0,0,0,0
4,4,4.0,3.0,3.0,2.0,5.0,1.0,3.0,1.0,1.0,...,1.0,3.0,2.0,2.0,1.0,1.0,0.0,1,1,0


In [4]:
# 📋 Print a concise summary of the DataFrame:
# index range, column names, non-null counts per column, and dtypes.
# Handy for spotting missing values or columns with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2000 non-null   int64  
 1   sensor_0    2000 non-null   float64
 2   sensor_1    2000 non-null   float64
 3   sensor_2    2000 non-null   float64
 4   sensor_3    2000 non-null   float64
 5   sensor_4    2000 non-null   float64
 6   sensor_5    2000 non-null   float64
 7   sensor_6    2000 non-null   float64
 8   sensor_7    2000 non-null   float64
 9   sensor_8    2000 non-null   float64
 10  sensor_9    2000 non-null   float64
 11  sensor_10   2000 non-null   float64
 12  sensor_11   2000 non-null   float64
 13  sensor_12   2000 non-null   float64
 14  sensor_13   2000 non-null   float64
 15  sensor_14   2000 non-null   float64
 16  sensor_15   2000 non-null   float64
 17  sensor_16   2000 non-null   float64
 18  sensor_17   2000 non-null   float64
 19  sensor_18   2000 non-null  

In [5]:
# 📑 Show the column labels of the dataset.
# Useful for checking the sensor_* / parcel_* names and spotting columns
# that should be removed (such as 'Unnamed: 0').
df.columns

Index(['Unnamed: 0', 'sensor_0', 'sensor_1', 'sensor_2', 'sensor_3',
       'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9',
       'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14',
       'sensor_15', 'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19',
       'parcel_0', 'parcel_1', 'parcel_2'],
      dtype='object')

In [6]:
# 🧹 Remove the 'Unnamed: 0' column.
# This column typically appears when a DataFrame's index was written into the
# CSV on export; it mirrors the row number and is not a sensor reading.
#
# `columns=` names the axis explicitly (equivalent to axis=1).
# `errors='ignore'` makes the cell safe to re-run: `df` is reassigned here, so
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# 🔁 Preview the first 5 rows again to confirm the column is no longer present
df.head()

Unnamed: 0,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,parcel_0,parcel_1,parcel_2
0,1.0,2.0,1.0,7.0,0.0,1.0,1.0,4.0,0.0,3.0,...,8.0,1.0,0.0,2.0,1.0,9.0,2.0,0,1,0
1,5.0,1.0,3.0,5.0,2.0,2.0,1.0,2.0,3.0,1.0,...,4.0,5.0,5.0,2.0,2.0,2.0,7.0,0,0,0
2,3.0,1.0,4.0,3.0,4.0,0.0,1.0,6.0,0.0,2.0,...,3.0,3.0,1.0,0.0,3.0,1.0,0.0,1,1,0
3,2.0,2.0,4.0,3.0,5.0,0.0,3.0,2.0,2.0,5.0,...,4.0,1.0,1.0,4.0,1.0,3.0,2.0,0,0,0
4,4.0,3.0,3.0,2.0,5.0,1.0,3.0,1.0,1.0,2.0,...,1.0,3.0,2.0,2.0,1.0,1.0,0.0,1,1,0


In [7]:
# 📊 Generate summary statistics for the numeric columns in the dataset.
# Reports: count, mean, std deviation, min, 25%, 50%, 75%, and max.
# Helps spot outliers, data ranges, and whether scaling/normalization might be needed.
df.describe()

Unnamed: 0,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,parcel_0,parcel_1,parcel_2
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1.437,1.659,2.6545,2.6745,2.8875,1.411,3.3155,4.2015,1.214,1.901,...,2.7315,3.416,1.2065,2.325,1.7295,2.2745,1.8135,0.6355,0.7305,0.212
std,1.321327,1.338512,1.699286,1.855875,1.816451,1.339394,2.206444,2.280241,1.386782,1.518668,...,1.774537,1.960578,1.258034,1.715181,1.561265,1.67169,1.469285,0.48141,0.443811,0.408827
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,1.0,2.0,0.0,2.0,3.0,0.0,1.0,...,1.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,1.0,1.0,2.0,2.0,3.0,1.0,3.0,4.0,1.0,2.0,...,2.0,3.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,0.0
75%,2.0,2.0,4.0,4.0,4.0,2.0,5.0,6.0,2.0,3.0,...,4.0,5.0,2.0,3.0,3.0,3.0,3.0,1.0,1.0,0.0
max,8.0,9.0,10.0,11.0,12.0,7.0,13.0,12.0,8.0,9.0,...,11.0,11.0,6.0,10.0,11.0,10.0,7.0,1.0,1.0,1.0


In [8]:
# -------------------------------
# STEP 2: DEFINE FEATURES AND LABELS
# -------------------------------

# Columns are selected by NAME rather than by position. A positional slice
# (iloc[:, 0:20] / iloc[:, 20:]) is only correct when 'Unnamed: 0' has already
# been dropped; if that cell is skipped, the slice would silently put
# 'Unnamed: 0' into the features and push sensor_19 into the labels.
# DataFrame.filter keeps the matching columns in their original order.

# 🎯 X (Features): every sensor_<n> column — the inputs the model predicts from
X = df.filter(regex=r"^sensor_\d+$")

# 🎯 y (Labels/Targets): every parcel_<n> column — the outputs the model learns to predict
y = df.filter(regex=r"^parcel_\d+$")

# Fail fast if the expected columns are missing (e.g. a different CSV was loaded)
assert X.shape[1] > 0, "no sensor_<n> columns found in df"
assert y.shape[1] > 0, "no parcel_<n> columns found in df"

In [9]:
# 🔍 Draw 10 random rows from the feature matrix (X) as a sanity check
# that the sensor readings look plausible and varied.
X.sample(n=10)

Unnamed: 0,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19
159,1.0,2.0,6.0,0.0,4.0,0.0,8.0,3.0,0.0,0.0,5.0,4.0,2.0,2.0,7.0,3.0,3.0,4.0,0.0,0.0
1474,1.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,0.0,2.0,7.0,3.0,7.0,5.0,0.0,1.0,0.0,3.0,2.0
494,3.0,3.0,2.0,2.0,4.0,3.0,2.0,5.0,2.0,1.0,2.0,5.0,1.0,4.0,4.0,3.0,3.0,1.0,0.0,3.0
405,0.0,2.0,4.0,5.0,1.0,1.0,5.0,6.0,0.0,2.0,3.0,2.0,6.0,5.0,6.0,0.0,1.0,0.0,3.0,2.0
775,0.0,1.0,1.0,6.0,4.0,2.0,0.0,4.0,3.0,1.0,4.0,5.0,3.0,2.0,4.0,0.0,0.0,0.0,5.0,2.0
1934,2.0,3.0,3.0,5.0,3.0,1.0,1.0,8.0,2.0,4.0,5.0,8.0,4.0,2.0,5.0,0.0,3.0,1.0,2.0,4.0
1242,4.0,2.0,2.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,5.0,5.0,1.0,4.0,3.0,3.0,5.0,1.0,1.0
573,0.0,3.0,1.0,5.0,5.0,0.0,1.0,3.0,0.0,3.0,6.0,1.0,4.0,6.0,5.0,1.0,0.0,1.0,2.0,1.0
1921,0.0,5.0,1.0,7.0,3.0,1.0,3.0,1.0,0.0,1.0,5.0,4.0,6.0,3.0,4.0,0.0,1.0,0.0,2.0,5.0
1158,0.0,1.0,3.0,4.0,5.0,2.0,6.0,2.0,4.0,2.0,4.0,4.0,6.0,2.0,1.0,1.0,1.0,2.0,5.0,4.0


In [10]:
# 🔍 Draw 10 random rows from the label frame (y) to eyeball the
# parcel outputs and confirm their format.
y.sample(n=10)

Unnamed: 0,parcel_0,parcel_1,parcel_2
404,0,1,0
1537,1,0,0
275,1,1,0
433,1,0,0
58,1,0,0
612,1,1,0
1954,1,1,0
423,1,1,1
570,0,1,0
1133,1,1,1


In [11]:
# 🧠 Inspect the structure of the features (X):
# number of entries, per-column non-null counts, and dtypes.
# Confirms that every sensor column is numeric and has no missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sensor_0   2000 non-null   float64
 1   sensor_1   2000 non-null   float64
 2   sensor_2   2000 non-null   float64
 3   sensor_3   2000 non-null   float64
 4   sensor_4   2000 non-null   float64
 5   sensor_5   2000 non-null   float64
 6   sensor_6   2000 non-null   float64
 7   sensor_7   2000 non-null   float64
 8   sensor_8   2000 non-null   float64
 9   sensor_9   2000 non-null   float64
 10  sensor_10  2000 non-null   float64
 11  sensor_11  2000 non-null   float64
 12  sensor_12  2000 non-null   float64
 13  sensor_13  2000 non-null   float64
 14  sensor_14  2000 non-null   float64
 15  sensor_15  2000 non-null   float64
 16  sensor_16  2000 non-null   float64
 17  sensor_17  2000 non-null   float64
 18  sensor_18  2000 non-null   float64
 19  sensor_19  2000 non-null   float64
dtypes: float

In [12]:
# 🏷️ Inspect the structure of the labels (y):
# confirms how many parcel columns there are and that each has the
# expected dtype (e.g., int64) with no missing values.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   parcel_0  2000 non-null   int64
 1   parcel_1  2000 non-null   int64
 2   parcel_2  2000 non-null   int64
dtypes: int64(3)
memory usage: 47.0 KB


In [13]:
# 👀 Display the features DataFrame (X).
# The rendered output is abbreviated: only the first and last rows are shown,
# separated by a "..." row — so this is an overview, not a full dump.
X

Unnamed: 0,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19
0,1.0,2.0,1.0,7.0,0.0,1.0,1.0,4.0,0.0,3.0,1.0,3.0,6.0,8.0,1.0,0.0,2.0,1.0,9.0,2.0
1,5.0,1.0,3.0,5.0,2.0,2.0,1.0,2.0,3.0,1.0,3.0,2.0,2.0,4.0,5.0,5.0,2.0,2.0,2.0,7.0
2,3.0,1.0,4.0,3.0,4.0,0.0,1.0,6.0,0.0,2.0,3.0,2.0,4.0,3.0,3.0,1.0,0.0,3.0,1.0,0.0
3,2.0,2.0,4.0,3.0,5.0,0.0,3.0,2.0,2.0,5.0,3.0,1.0,2.0,4.0,1.0,1.0,4.0,1.0,3.0,2.0
4,4.0,3.0,3.0,2.0,5.0,1.0,3.0,1.0,1.0,2.0,4.0,5.0,3.0,1.0,3.0,2.0,2.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,4.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,3.0,3.0,1.0,2.0,3.0,2.0,1.0,1.0,0.0
1996,1.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,1.0,5.0,2.0,2.0,4.0,3.0,3.0,0.0,1.0,0.0,6.0,2.0
1997,1.0,3.0,3.0,1.0,1.0,4.0,8.0,1.0,0.0,0.0,3.0,2.0,4.0,2.0,3.0,4.0,4.0,4.0,1.0,0.0
1998,2.0,1.0,0.0,2.0,2.0,0.0,1.0,3.0,0.0,0.0,0.0,5.0,2.0,2.0,4.0,0.0,2.0,0.0,3.0,0.0


In [14]:
# 📏 Report (rows, columns) for the features and the labels side by side.
# The row counts must match, otherwise X and y are misaligned.
(X.shape, y.shape)

((2000, 20), (2000, 3))