In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


df = pd.read_csv('All_13100_Entries_with_Nutrient_Ranges.csv')
df.head()

Unnamed: 0,N,P,K,temperature,humidity,pH,rainfall,label,Plant_Category,EC (mS/cm) Range,Ca (ppm) Range,Mg (ppm) Range,Na (ppm) Range (Max)
0,58,34,42,18.1006,60.5567,7.5063,33.1599,Origanum majorana,Herb_Spice,1.5-2.5 mS/cm,100-150 ppm,30-50 ppm,< 50 ppm
1,56,40,57,23.7085,71.4758,7.6218,27.2026,Coriandrum sativum,Unclassified,1.5-2.5 mS/cm,100-180 ppm,30-50 ppm,< 60 ppm
2,136,37,81,18.2818,68.8113,7.3104,19.4648,Cucurbita pepo,Fruiting_Veg,2.0-4.5 mS/cm,180-250 ppm,40-70 ppm,< 50 ppm (Sensitive)
3,90,28,48,27.7032,66.6544,7.3479,48.6461,Mentha piperita,Herb_Spice,1.5-2.5 mS/cm,100-150 ppm,30-50 ppm,< 50 ppm
4,39,40,53,18.7403,53.3352,7.5331,36.7384,Raphanus sativus,Root_Tuber,1.8-2.8 mS/cm,100-180 ppm,30-60 ppm,< 70 ppm


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13100 entries, 0 to 13099
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   N                     13100 non-null  int64  
 1   P                     13100 non-null  int64  
 2   K                     13100 non-null  int64  
 3   temperature           13100 non-null  float64
 4   humidity              13100 non-null  float64
 5   pH                    13100 non-null  float64
 6   rainfall              13100 non-null  float64
 7   label                 13100 non-null  object 
 8   Plant_Category        13100 non-null  object 
 9   EC (mS/cm) Range      13100 non-null  object 
 10  Ca (ppm) Range        13100 non-null  object 
 11  Mg (ppm) Range        13100 non-null  object 
 12  Na (ppm) Range (Max)  13100 non-null  object 
dtypes: float64(4), int64(3), object(6)
memory usage: 1.3+ MB


In [3]:
df.describe()

Unnamed: 0,N,P,K,temperature,humidity,pH,rainfall
count,13100.0,13100.0,13100.0,13100.0,13100.0,13100.0,13100.0
mean,72.115802,39.446489,61.686031,21.921862,64.019293,7.145258,60.418725
std,43.418087,19.023604,36.20485,5.933828,10.740221,0.729565,37.181436
min,0.0,0.0,0.0,-1.9486,3.9284,4.2994,0.0
25%,36.0,26.0,36.0,17.493525,57.983775,6.6811,34.1827
50%,68.0,41.0,57.0,21.61505,64.34645,7.1999,53.23715
75%,104.0,53.0,84.0,26.16785,70.8541,7.657575,79.78575
max,213.0,110.0,185.0,46.4959,100.0,9.8546,300.8825


In [4]:
type(df['EC (mS/cm) Range'][0])

str

In [7]:
columns_to_process = {
    'EC (mS/cm) Range': 'mS/cm',
    'Ca (ppm) Range': 'ppm',
    'Mg (ppm) Range': 'ppm',
    'Na (ppm) Range (Max)': 'ppm'
}

def process_range_column(df, col_name, unit_to_remove):

    cleaned_series = df[col_name].str.replace(r"\(.*\)", "", regex=True)
    cleaned_series = cleaned_series.str.replace(unit_to_remove, "", regex=False).str.strip()


    # إنشاء أسماء الأعمدة الجديدة
    base_name = col_name.split('(')[0].strip().replace(' Range', '').replace(' (Max)', '')
    min_col_name = base_name + '_Min'
    max_col_name = base_name + '_Max'

    df[min_col_name] = np.nan
    df[max_col_name] = np.nan

    for index, value in cleaned_series.items():
        if pd.isna(value) or value == '':
            continue

        value = value.strip()

        if '-' in value:
            try:
                min_val, max_val = map(float, value.split('-'))
                df.loc[index, min_col_name] = min_val
                df.loc[index, max_col_name] = max_val
            except ValueError:
                pass

        elif '<' in value:
            try:
                max_val = float(value.replace('<', '').strip())
                df.loc[index, min_col_name] = 0.0
                df.loc[index, max_col_name] = max_val
            except ValueError:
                pass

        else:
            try:
                single_val = float(value)
                df.loc[index, min_col_name] = 0.0
                df.loc[index, max_col_name] = single_val
            except ValueError:
                pass

    return df

In [8]:
for col, unit in columns_to_process.items():
    df = process_range_column(df, col, unit)


In [9]:
df = df.drop(columns=list(columns_to_process.keys()))

In [10]:
df.head(100)

Unnamed: 0,N,P,K,temperature,humidity,pH,rainfall,label,Plant_Category,EC_Min,EC_Max,Ca_Min,Ca_Max,Mg_Min,Mg_Max,Na_Min,Na_Max
0,58,34,42,18.1006,60.5567,7.5063,33.1599,Origanum majorana,Herb_Spice,1.5,2.5,100.0,150.0,30.0,50.0,0.0,50.0
1,56,40,57,23.7085,71.4758,7.6218,27.2026,Coriandrum sativum,Unclassified,1.5,2.5,100.0,180.0,30.0,50.0,0.0,60.0
2,136,37,81,18.2818,68.8113,7.3104,19.4648,Cucurbita pepo,Fruiting_Veg,2.0,4.5,180.0,250.0,40.0,70.0,0.0,50.0
3,90,28,48,27.7032,66.6544,7.3479,48.6461,Mentha piperita,Herb_Spice,1.5,2.5,100.0,150.0,30.0,50.0,0.0,50.0
4,39,40,53,18.7403,53.3352,7.5331,36.7384,Raphanus sativus,Root_Tuber,1.8,2.8,100.0,180.0,30.0,60.0,0.0,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,22,3,46,25.4400,61.6076,7.5534,47.9932,Ficus carica,Tree_Fruit,1.0,2.5,100.0,200.0,30.0,60.0,0.0,50.0
96,64,31,42,13.6758,60.8599,7.3906,58.5838,Matricaria chamomilla,Unclassified,1.5,2.5,100.0,180.0,30.0,50.0,0.0,60.0
97,34,44,78,30.8434,65.8275,6.6650,68.9403,Vigna unguiculata,Legume,1.0,1.8,80.0,150.0,20.0,40.0,0.0,40.0
98,90,35,35,20.8466,60.2341,6.8216,73.2226,Mentha spicata,Herb_Spice,1.5,2.5,100.0,150.0,30.0,50.0,0.0,50.0


In [11]:
df["Na_Min"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 13100 entries, 0 to 13099
Series name: Na_Min
Non-Null Count  Dtype  
--------------  -----  
12100 non-null  float64
dtypes: float64(1)
memory usage: 102.5 KB


In [13]:
df["Na_Min"].unique()

array([ 0., nan])

In [14]:
df.isna().sum()

Unnamed: 0,0
N,0
P,0
K,0
temperature,0
humidity,0
pH,0
rainfall,0
label,0
Plant_Category,0
EC_Min,0


In [21]:
mean_Na = df['Na_Max'].mean()

In [22]:
df['Na_Max'].fillna(mean_Na, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Na_Max'].fillna(mean_Na, inplace=True)


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13100 entries, 0 to 13099
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   N               13100 non-null  int64  
 1   P               13100 non-null  int64  
 2   K               13100 non-null  int64  
 3   temperature     13100 non-null  float64
 4   humidity        13100 non-null  float64
 5   pH              13100 non-null  float64
 6   rainfall        13100 non-null  float64
 7   label           13100 non-null  object 
 8   Plant_Category  13100 non-null  object 
 9   EC_Min          13100 non-null  float64
 10  EC_Max          13100 non-null  float64
 11  Ca_Min          13100 non-null  float64
 12  Ca_Max          13100 non-null  float64
 13  Mg_Min          13100 non-null  float64
 14  Mg_Max          13100 non-null  float64
 15  Na_Min          12100 non-null  float64
 16  Na_Max          13100 non-null  float64
dtypes: float64(12), int64(3), objec

In [25]:
df['Na_Max'].unique()

array([50., 60., 70., 40.])

In [24]:
df['Na_Max'].isna().sum()

np.int64(0)

In [26]:
df.columns

Index(['N', 'P', 'K', 'temperature', 'humidity', 'pH', 'rainfall', 'label',
       'Plant_Category', 'EC_Min', 'EC_Max', 'Ca_Min', 'Ca_Max', 'Mg_Min',
       'Mg_Max', 'Na_Min', 'Na_Max'],
      dtype='object')

In [27]:
df.head()

Unnamed: 0,N,P,K,temperature,humidity,pH,rainfall,label,Plant_Category,EC_Min,EC_Max,Ca_Min,Ca_Max,Mg_Min,Mg_Max,Na_Min,Na_Max
0,58,34,42,18.1006,60.5567,7.5063,33.1599,Origanum majorana,Herb_Spice,1.5,2.5,100.0,150.0,30.0,50.0,0.0,50.0
1,56,40,57,23.7085,71.4758,7.6218,27.2026,Coriandrum sativum,Unclassified,1.5,2.5,100.0,180.0,30.0,50.0,0.0,60.0
2,136,37,81,18.2818,68.8113,7.3104,19.4648,Cucurbita pepo,Fruiting_Veg,2.0,4.5,180.0,250.0,40.0,70.0,0.0,50.0
3,90,28,48,27.7032,66.6544,7.3479,48.6461,Mentha piperita,Herb_Spice,1.5,2.5,100.0,150.0,30.0,50.0,0.0,50.0
4,39,40,53,18.7403,53.3352,7.5331,36.7384,Raphanus sativus,Root_Tuber,1.8,2.8,100.0,180.0,30.0,60.0,0.0,70.0


In [28]:
df.drop('Na_Min', axis=1, inplace=True)

In [29]:
df.columns

Index(['N', 'P', 'K', 'temperature', 'humidity', 'pH', 'rainfall', 'label',
       'Plant_Category', 'EC_Min', 'EC_Max', 'Ca_Min', 'Ca_Max', 'Mg_Min',
       'Mg_Max', 'Na_Max'],
      dtype='object')

In [31]:
from sklearn.preprocessing import StandardScaler

numerical_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'pH', 'rainfall',
                  'EC_Min', 'EC_Max', 'Ca_Min', 'Ca_Max', 'Mg_Min', 'Mg_Max', 'Na_Max']

scaler = StandardScaler()

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

print(df[numerical_cols].describe().round(2).T)

               count  mean  std   min   25%   50%   75%   max
N            13100.0  -0.0  1.0 -1.66 -0.83 -0.09  0.73  3.24
P            13100.0  -0.0  1.0 -2.07 -0.71  0.08  0.71  3.71
K            13100.0  -0.0  1.0 -1.70 -0.71 -0.13  0.62  3.41
temperature  13100.0   0.0  1.0 -4.02 -0.75 -0.05  0.72  4.14
humidity     13100.0   0.0  1.0 -5.60 -0.56  0.03  0.64  3.35
pH           13100.0  -0.0  1.0 -3.90 -0.64  0.07  0.70  3.71
rainfall     13100.0   0.0  1.0 -1.63 -0.71 -0.19  0.52  6.47
EC_Min       13100.0   0.0  1.0 -1.15 -1.15  0.36  0.36  1.87
EC_Max       13100.0  -0.0  1.0 -1.10 -0.81 -0.08 -0.08  2.85
Ca_Min       13100.0   0.0  1.0 -1.52 -0.07 -0.07 -0.07  2.85
Ca_Max       13100.0  -0.0  1.0 -1.68 -0.76  0.16  0.16  2.32
Mg_Min       13100.0   0.0  1.0 -2.17 -0.53  0.29  0.29  1.94
Mg_Max       13100.0   0.0  1.0 -1.81 -0.20 -0.20  0.88  1.96
Na_Max       13100.0   0.0  1.0 -1.64 -0.40 -0.40  0.85  2.09
