In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score
from sklearn.neural_network import MLPClassifier


# Load the dataset
data = pd.read_csv("./Soil Nutrients.csv")
print(data.select_dtypes(include=["object"]).columns)

data = data.drop(columns=["pH", "N_Ratio", "P_Ratio", "K_Ratio"])
data.head()


Index(['Name', 'Fertility', 'Photoperiod', 'Category_pH', 'Soil_Type',
       'Season'],
      dtype='object')


Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season
0,Strawberry,Moderate,Day Neutral,20.887923,747.860765,13.091483,533.762876,91.197196,170.800381,118.670058,243.331211,20.369555,low_acidic,Loam,Summer
1,Strawberry,Moderate,Day Neutral,18.062721,711.104329,13.063016,505.789101,91.939623,179.290364,121.020244,246.910378,20.402751,low_acidic,Loam,Spring
2,Strawberry,Moderate,Short Day Period,16.776782,774.038247,12.945927,512.985617,91.387286,181.440732,116.936806,242.699601,19.158847,low_acidic,Loam,Summer
3,Strawberry,Moderate,Short Day Period,14.281,665.633506,13.318922,484.860067,91.254598,176.165282,122.233153,237.096892,20.265745,low_acidic,Loam,Summer
4,Strawberry,Moderate,Day Neutral,21.44449,806.531455,13.312915,512.747307,92.354829,182.935334,126.088234,243.880364,20.397336,low_acidic,Loam,Spring


In [48]:
categorical_cols = data.select_dtypes(include=["object"]).columns
numerical_cols = data.select_dtypes(include=["float64"]).columns

print("Categorical Columns:", list(categorical_cols))
print("Numerical Columns:", list(numerical_cols))

Categorical Columns: ['Name', 'Fertility', 'Photoperiod', 'Category_pH', 'Soil_Type', 'Season']
Numerical Columns: ['Temperature', 'Rainfall', 'Light_Hours', 'Light_Intensity', 'Rh', 'Nitrogen', 'Phosphorus', 'Potassium', 'Yield']


In [49]:
# Check the distribution of the target variable
print(data["Category_pH"].value_counts(normalize=True))

Category_pH
low_acidic      0.614675
neutral         0.316299
low_alkaline    0.069026
Name: proportion, dtype: float64


In [50]:
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
data.head()

Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season
0,Strawberry,Moderate,Day Neutral,0.019536,-0.589525,1.403239,0.712387,1.267058,0.478975,0.151289,0.60441,-0.153172,low_acidic,Loam,Summer
1,Strawberry,Moderate,Day Neutral,-0.620371,-0.697355,1.392241,0.565548,1.306123,0.624047,0.183582,0.638829,-0.151036,low_acidic,Loam,Spring
2,Strawberry,Moderate,Short Day Period,-0.911635,-0.51273,1.347004,0.603324,1.27706,0.660791,0.127472,0.598336,-0.231077,low_acidic,Loam,Summer
3,Strawberry,Moderate,Short Day Period,-1.476929,-0.83075,1.491108,0.455688,1.270078,0.570647,0.200248,0.544456,-0.159852,low_acidic,Loam,Summer
4,Strawberry,Moderate,Day Neutral,0.145598,-0.417406,1.488787,0.602073,1.327971,0.68633,0.25322,0.609691,-0.151385,low_acidic,Loam,Spring


In [51]:
# Define features and target variable
X = data.drop("Category_pH", axis=1)
y = data["Category_pH"]

In [52]:
X_categorical_cols = X.select_dtypes(include=["object"]).columns

In [53]:
encoder = OneHotEncoder(sparse_output=False)
# Apply one-hot encoding to categorical columns
encoded_X = encoder.fit_transform(X[X_categorical_cols])
encoded_X_df = pd.DataFrame(encoded_X, columns=encoder.get_feature_names_out(X_categorical_cols))
X = X.drop(columns=X_categorical_cols)
X_encoded = pd.concat([X, encoded_X_df], axis=1)

X_encoded.head()
# X_encoded = X

# label_encoder = LabelEncoder()
# for col in X_categorical_cols:
#     X_encoded[col] = label_encoder.fit_transform(X[col])

# X_encoded.head()

Unnamed: 0,Temperature,Rainfall,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Name_Arugula,...,Photoperiod_Day Neutral,Photoperiod_Long Day Period,Photoperiod_Short Day Period,Soil_Type_Loam,Soil_Type_Sandy,Soil_Type_Sandy Loam,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,0.019536,-0.589525,1.403239,0.712387,1.267058,0.478975,0.151289,0.60441,-0.153172,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.620371,-0.697355,1.392241,0.565548,1.306123,0.624047,0.183582,0.638829,-0.151036,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.911635,-0.51273,1.347004,0.603324,1.27706,0.660791,0.127472,0.598336,-0.231077,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.476929,-0.83075,1.491108,0.455688,1.270078,0.570647,0.200248,0.544456,-0.159852,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.145598,-0.417406,1.488787,0.602073,1.327971,0.68633,0.25322,0.609691,-0.151385,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [54]:
X.columns, X_encoded.columns

(Index(['Temperature', 'Rainfall', 'Light_Hours', 'Light_Intensity', 'Rh',
        'Nitrogen', 'Phosphorus', 'Potassium', 'Yield'],
       dtype='object'),
 Index(['Temperature', 'Rainfall', 'Light_Hours', 'Light_Intensity', 'Rh',
        'Nitrogen', 'Phosphorus', 'Potassium', 'Yield', 'Name_Arugula',
        'Name_Asparagus', 'Name_Beet', 'Name_Broccoli', 'Name_Cabbage',
        'Name_Cauliflowers', 'Name_Chard', 'Name_Chilli Peppers', 'Name_Cress',
        'Name_Cucumbers', 'Name_Eggplants', 'Name_Endive', 'Name_Grapes',
        'Name_Green Peas', 'Name_Kale', 'Name_Lettuce', 'Name_Potatoes',
        'Name_Radicchio', 'Name_Spinach', 'Name_Strawberry', 'Name_Tomatoes',
        'Name_Watermelon', 'Fertility_High', 'Fertility_Moderate',
        'Photoperiod_Day Neutral', 'Photoperiod_Long Day Period',
        'Photoperiod_Short Day Period', 'Soil_Type_Loam', 'Soil_Type_Sandy',
        'Soil_Type_Sandy Loam', 'Season_Fall', 'Season_Spring', 'Season_Summer',
        'Season_Winter'],
   

In [55]:
# y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [56]:
from imblearn.over_sampling import SMOTE, ADASYN

smote = SMOTE(random_state=42)
X_encoded, y_encoded = smote.fit_resample(X_encoded, y_encoded)

# Check new class distribution
print(pd.Series(y_encoded).value_counts())

0    9466
2    9466
1    9466
Name: count, dtype: int64


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [12]:
mlp_clf = MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=300, activation="relu", solver="adam")
mlp_clf.fit(X_train, y_train)

y_pred = mlp_clf.predict(X_test)
accuracy_score(y_test, y_pred)



0.8116197183098591

In [13]:
from sklearn.linear_model import LogisticRegression

# # Now train the model with the 1D array of labels
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)


0.7651408450704226

In [14]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7651408450704226
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67      1878
           1       0.77      1.00      0.87      1899
           2       0.66      0.79      0.72      1903

    accuracy                           0.77      5680
   macro avg       0.81      0.76      0.75      5680
weighted avg       0.81      0.77      0.75      5680

Confusion Matrix:
 [[ 939  166  773]
 [   0 1899    0]
 [   0  395 1508]]


In [15]:
# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8088028169014084
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.61      0.73      1878
           1       0.83      1.00      0.91      1899
           2       0.73      0.82      0.77      1903

    accuracy                           0.81      5680
   macro avg       0.82      0.81      0.80      5680
weighted avg       0.82      0.81      0.80      5680

Confusion Matrix:
 [[1143  154  581]
 [   0 1898    1]
 [ 119  231 1553]]


In [16]:
from xgboost import XGBClassifier

# Initialize the RandomForestClassifier
model = XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7882042253521127
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.60      0.72      1878
           1       0.81      0.99      0.89      1899
           2       0.70      0.77      0.73      1903

    accuracy                           0.79      5680
   macro avg       0.80      0.79      0.78      5680
weighted avg       0.80      0.79      0.78      5680

Confusion Matrix:
 [[1132  144  602]
 [   2 1884   13]
 [ 154  288 1461]]


In [32]:
import keras

from keras.api.models import Sequential
from keras.api.layers import Dense, Dropout, BatchNormalization, Activation
from keras.api.optimizers import Adam, SGD, RMSprop
from keras.api.callbacks import EarlyStopping, ReduceLROnPlateau

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [46]:
# model = Sequential()
# model.add(Dense(128, activation="relu", input_shape=(X_train.shape[1],)))
# model.add(Dense(64, activation="relu"))
# model.add(Dense(32, activation="relu"))
# model.add(Dense(16, activation="relu"))
# model.add(Dense(8, activation="relu"))
# model.add(Dense(len(np.unique(y)), activation="softmax"))
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

model = Sequential(
    [
        Dense(128, input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Activation("relu"),
        Dense(64),
        BatchNormalization(),
        Activation("relu"),
        Dense(len(np.unique(y)), activation="softmax"),
    ]
)
# Compile the model

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False)

# Reshape the labels and fit_transform
y_train_encoded = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))

# Use the encoded labels in your model
model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])

callbacks = [
    EarlyStopping(patience=10, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.1, patience=5, min_lr=1e-6),
]
# model.compile(optimizer=RMSprop(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# model.compile(optimizer=Adam(learning_rate=0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# Train the model
history = model.fit(X_train, y_train_encoded, epochs=50, batch_size=64, validation_split=0.2, callbacks=callbacks)
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
# Plotting the training history
plt.figure(figsize=(12, 6))
plt.plot(history.history["accuracy"], label="Train Accuracy")
plt.title("Model Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

AttributeError: 'Series' object has no attribute 'reshape'