In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from lazypredict.Supervised import LazyClassifier, LazyRegressor

In [2]:
# Read the soil-nutrient CSV (path is relative to the working directory) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="./Soil Nutrients.csv")
df.head(n=5)

Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season,N_Ratio,P_Ratio,K_Ratio
0,Strawberry,Moderate,Day Neutral,20.89,747.86,6.57,13.09,533.76,91.2,170.8,118.67,243.33,20.37,low_acidic,Loam,Summer,10.0,10.0,10.0
1,Strawberry,Moderate,Day Neutral,18.06,711.1,6.25,13.06,505.79,91.94,179.29,121.02,246.91,20.4,low_acidic,Loam,Spring,10.0,10.0,10.0
2,Strawberry,Moderate,Short Day Period,16.78,774.04,6.35,12.95,512.99,91.39,181.44,116.94,242.7,19.16,low_acidic,Loam,Summer,10.0,10.0,10.0
3,Strawberry,Moderate,Short Day Period,14.28,665.63,6.26,13.32,484.86,91.25,176.17,122.23,237.1,20.27,low_acidic,Loam,Summer,10.0,10.0,10.0
4,Strawberry,Moderate,Day Neutral,21.44,806.53,6.38,13.31,512.75,92.35,182.94,126.09,243.88,20.4,low_acidic,Loam,Spring,10.0,10.0,10.0


In [3]:
# Remove the raw pH measurement before modelling; presumably Category_pH (used
# later as the classification target) is derived from it — TODO confirm.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["pH"], errors="ignore")

In [4]:
# Integer-encode every object-dtype column of df in place and remember, per
# column, which original label was assigned which code.
label_mappings = {}
encoder = LabelEncoder()

for column in df.select_dtypes(include="object").columns:
    df[column] = encoder.fit_transform(df[column])
    # classes_ holds the labels from the fit just performed; transforming them
    # yields the code assigned to each label.
    label_mappings[column] = {
        label: code
        for label, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
    }

In [6]:
# Project every column except the target onto three principal components.
# random_state pins the result in case scikit-learn selects its randomized SVD
# solver, so a Restart & Run All reproduces the same components.
# NOTE(review): the inputs are not standardized, so columns with large
# numeric ranges may dominate the components — consider scaling first.
pca = PCA(n_components=3, random_state=42)
pca_soil_nutrients = pca.fit_transform(df.drop(columns=["Category_pH"]))
pca_soil_nutrients = pd.DataFrame(
    data=pca_soil_nutrients,
    columns=["PCA Component 1", "PCA Component 2", "PCA Component 3"],
    # Reuse df's index so columns copied over from df later align row-for-row.
    index=df.index,
)
pca_soil_nutrients

Unnamed: 0,PCA Component 1,PCA Component 2,PCA Component 3
0,-225.06,111.95,2.59
1,-257.68,82.00,18.38
2,-196.43,98.36,15.55
3,-298.20,51.11,15.54
4,-165.35,105.32,21.77
...,...,...,...
15395,149.93,-75.23,-19.28
15396,26.97,-71.11,-43.27
15397,20.91,-96.60,-34.71
15398,-58.44,-130.25,-33.12


In [7]:
# Add the encoded target next to the PCA components (pandas aligns on index).
pca_soil_nutrients.loc[:, "Category_pH"] = df["Category_pH"]

In [8]:
class CropClassifier:
    """Benchmark a suite of classifiers on one table using ``LazyClassifier``.

    The constructor separates ``data`` into features and targets, performs a
    stratified train/test split and prepares a ``LazyClassifier``. Calling
    :meth:`fit` runs the benchmark and stores both values it returns.

    Parameters
    ----------
    data : DataFrame-like
        Table containing both the predictor columns and the target column(s).
        It must support ``.drop(..., axis=1)`` and column selection.
    features_for_classification : list of str
        Despite the name, these are the *target* column label(s): they are
        dropped from ``data`` to build the features and selected to build the
        targets.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the train/test split; also forwarded to ``LazyClassifier`` so
        a single argument controls every seeded step.
    """

    def __init__(self, data, features_for_classification, test_size=0.2, random_state=42):
        self.data = data
        self.features_for_classification = features_for_classification
        self.test_size = test_size
        self.random_state = random_state

        # Everything except the target column(s) is treated as a predictor.
        self.features = self.data.drop(self.features_for_classification, axis=1)
        self.targets = self.data[self.features_for_classification]

        # Stratify on the targets so class proportions match in both splits.
        (
            self.features_train,
            self.features_test,
            self.targets_train,
            self.targets_test,
        ) = train_test_split(
            self.features,
            self.targets,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=self.targets,
        )

        # predictions=True makes LazyClassifier.fit return the per-model
        # predictions as its second value; without it get_predictions() would
        # only hand back the score table again.
        self.lazyClassifier = LazyClassifier(
            verbose=0,
            ignore_warnings=True,
            custom_metric=None,
            predictions=True,
            random_state=self.random_state,
        )
        self.models = None
        self.predictions = None

    def fit(self):
        """Run the LazyClassifier benchmark on the stored split.

        Stores the two values returned by ``LazyClassifier.fit`` in
        ``self.models`` and ``self.predictions``. Returns ``None``.
        """
        self.models, self.predictions = self.lazyClassifier.fit(
            self.features_train, self.features_test, self.targets_train, self.targets_test
        )

    def get_models(self):
        """Return the model score table, or ``None`` if :meth:`fit` has not run."""
        return self.models

    def get_predictions(self):
        """Return the per-model predictions, or ``None`` if :meth:`fit` has not run."""
        return self.predictions

In [9]:
# Treat "Category_pH" as the target and benchmark classifiers on the remaining
# columns of the PCA frame.
cropClassifier = CropClassifier(
    data=pca_soil_nutrients,
    features_for_classification=["Category_pH"],
)
cropClassifier.fit()

 97%|█████████▋| 30/31 [00:12<00:00,  2.66it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 12320, number of used features: 3
[LightGBM] [Info] Start training from score -0.486635
[LightGBM] [Info] Start training from score -2.673743
[LightGBM] [Info] Start training from score -1.151017


100%|██████████| 31/31 [00:13<00:00,  2.35it/s]


In [10]:
# Display the benchmark results, formatted to five decimal places.
models = (
    cropClassifier.get_models()
    .style
    .format(precision=5)
)
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreeClassifier,0.61364,0.51667,,0.6164,0.01417
LGBMClassifier,0.62273,0.51407,,0.62039,1.02482
PassiveAggressiveClassifier,0.54253,0.51123,,0.51819,0.19935
KNeighborsClassifier,0.62792,0.50988,,0.62419,0.05196
NearestCentroid,0.50519,0.50725,,0.54195,0.07533
XGBClassifier,0.6263,0.50695,,0.62231,0.42741
RandomForestClassifier,0.6263,0.50528,,0.62227,1.24164
BaggingClassifier,0.6289,0.50133,,0.61801,0.29029
DecisionTreeClassifier,0.61948,0.49052,,0.61744,0.06064
ExtraTreesClassifier,0.61688,0.48979,,0.6132,0.58019
