In [2]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

!pip install --quiet lazypredict
from lazypredict.Supervised import LazyClassifier, LazyRegressor

warnings.filterwarnings('ignore')

In [3]:
# Location of the raw soil / crop measurements CSV.
# The default is the original author's local path; set the SOIL_NUTRIENTS_CSV
# environment variable to run the notebook on another machine without editing code.
SOIL_NUTRIENTS_CSV = os.environ.get('SOIL_NUTRIENTS_CSV', r'C:\Users\dell\Desktop\MyDocs\Docs\MK\Soil Nutrients.csv')

soil_nutrients = pd.read_csv(SOIL_NUTRIENTS_CSV)
soil_nutrients

Unnamed: 0,Name,Fertility,Photoperiod,Temperature,Rainfall,pH,Light_Hours,Light_Intensity,Rh,Nitrogen,Phosphorus,Potassium,Yield,Category_pH,Soil_Type,Season,N_Ratio,P_Ratio,K_Ratio
0,Strawberry,Moderate,Day Neutral,20.89,747.86,6.57,13.09,533.76,91.20,170.80,118.67,243.33,20.37,low_acidic,Loam,Summer,10.00,10.00,10.00
1,Strawberry,Moderate,Day Neutral,18.06,711.10,6.25,13.06,505.79,91.94,179.29,121.02,246.91,20.40,low_acidic,Loam,Spring,10.00,10.00,10.00
2,Strawberry,Moderate,Short Day Period,16.78,774.04,6.35,12.95,512.99,91.39,181.44,116.94,242.70,19.16,low_acidic,Loam,Summer,10.00,10.00,10.00
3,Strawberry,Moderate,Short Day Period,14.28,665.63,6.26,13.32,484.86,91.25,176.17,122.23,237.10,20.27,low_acidic,Loam,Summer,10.00,10.00,10.00
4,Strawberry,Moderate,Day Neutral,21.44,806.53,6.38,13.31,512.75,92.35,182.94,126.09,243.88,20.40,low_acidic,Loam,Spring,10.00,10.00,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15395,Green Peas,Moderate,Short Day Period,18.24,1079.57,6.78,6.91,314.94,65.06,150.52,48.86,124.69,4.98,neutral,Sandy,Fall,5.00,10.00,10.00
15396,Green Peas,Moderate,Short Day Period,16.60,958.20,5.84,6.83,345.86,66.75,144.31,44.65,121.59,4.99,neutral,Sandy,Fall,5.00,10.00,10.00
15397,Green Peas,Moderate,Short Day Period,12.15,947.90,6.50,6.94,320.29,65.80,147.07,42.35,120.39,5.04,low_acidic,Sandy,Fall,5.00,10.00,10.00
15398,Green Peas,Moderate,Short Day Period,17.49,863.90,5.94,6.78,300.50,64.56,144.42,44.41,119.29,4.69,low_acidic,Sandy,Spring,5.00,10.00,10.00


In [4]:
# Replace every text (object-dtype) column of soil_nutrients with integer codes,
# in place, and remember the label -> code mapping per column so the codes can
# be decoded later.
# NOTE: this cell is not re-runnable on its own. After one run the columns are
# already integers, so a second run finds no object columns and resets
# label_mappings to an empty dict. Reload the CSV before re-running.
encoder = LabelEncoder()
label_mappings = {}

text_columns = soil_nutrients.select_dtypes('object').columns
for column in text_columns:
    soil_nutrients[column] = encoder.fit_transform(soil_nutrients[column])
    codes = encoder.transform(encoder.classes_)
    label_mappings[column] = dict(zip(encoder.classes_, codes))

# Print a human-readable legend: one "code -> label" line per class,
# grouped by column and separated by a blank line.
for column, mapping in label_mappings.items():
    legend = [f"Column: {column}"]
    legend.extend(f"{encoding} -> {label}" for label, encoding in mapping.items())
    print('\n'.join(legend))
    print()

Column: Name
0 -> Arugula
1 -> Asparagus
2 -> Beet
3 -> Broccoli
4 -> Cabbage
5 -> Cauliflowers
6 -> Chard
7 -> Chilli Peppers
8 -> Cress
9 -> Cucumbers
10 -> Eggplants
11 -> Endive
12 -> Grapes
13 -> Green Peas
14 -> Kale
15 -> Lettuce
16 -> Potatoes
17 -> Radicchio
18 -> Spinach
19 -> Strawberry
20 -> Tomatoes
21 -> Watermelon

Column: Fertility
0 -> High
1 -> Moderate

Column: Photoperiod
0 -> Day Neutral
1 -> Long Day Period
2 -> Short Day Period

Column: Category_pH
0 -> low_acidic
1 -> low_alkaline
2 -> neutral

Column: Soil_Type
0 -> Loam
1 -> Sandy
2 -> Sandy Loam

Column: Season
0 -> Fall
1 -> Spring
2 -> Summer
3 -> Winter



In [5]:
# Compress the encoded predictor columns (everything except the two targets,
# 'Name' and 'Yield') into three principal components.
# random_state pins the projection for the cases where scikit-learn selects a
# randomized SVD solver, so a fresh "Restart & Run All" reproduces the same numbers.
# NOTE(review): PCA is fitted on unscaled columns and on the full dataset,
# i.e. before the train/test split done by the model classes - confirm this is intended.
pca = PCA(n_components = 3, random_state = 42)
pca_soil_nutrients = pca.fit_transform(soil_nutrients.drop(columns = ['Name', 'Yield']))
# Carry over the source index so that columns copied from soil_nutrients later
# align row-for-row even if the frame does not have a default RangeIndex.
pca_soil_nutrients = pd.DataFrame(data = pca_soil_nutrients, index = soil_nutrients.index, columns = ['PCA Component 1', 'PCA Component 2', 'PCA Component 3'])
pca_soil_nutrients

Unnamed: 0,PCA Component 1,PCA Component 2,PCA Component 3
0,-225.17,111.97,2.66
1,-257.80,82.02,18.45
2,-196.57,98.39,15.62
3,-298.34,51.13,15.60
4,-165.45,105.34,21.85
...,...,...,...
15395,149.58,-75.22,-19.34
15396,26.59,-71.10,-43.34
15397,20.52,-96.58,-34.77
15398,-58.85,-130.23,-33.20


In [6]:
# Put the two prediction targets next to the principal components:
# the crop name (decoded from its integer code back to text) and the yield.
code_to_crop = {code: crop for crop, code in label_mappings['Name'].items()}
pca_soil_nutrients['Name'] = soil_nutrients['Name'].map(code_to_crop)
pca_soil_nutrients['Yield'] = soil_nutrients['Yield']
pca_soil_nutrients

Unnamed: 0,PCA Component 1,PCA Component 2,PCA Component 3,Name,Yield
0,-225.17,111.97,2.66,Strawberry,20.37
1,-257.80,82.02,18.45,Strawberry,20.40
2,-196.57,98.39,15.62,Strawberry,19.16
3,-298.34,51.13,15.60,Strawberry,20.27
4,-165.45,105.34,21.85,Strawberry,20.40
...,...,...,...,...,...
15395,149.58,-75.22,-19.34,Green Peas,4.98
15396,26.59,-71.10,-43.34,Green Peas,4.99
15397,20.52,-96.58,-34.77,Green Peas,5.04
15398,-58.85,-130.23,-33.20,Green Peas,4.69


In [7]:
class CropClassifier:
    """Benchmark lazypredict's classifiers on a single train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictor columns and the target column(s).
    features_for_classification : list of str
        Despite the name, these are the *target* column(s): they are dropped
        from ``data`` to form the predictors and are used as the class labels
        (and as the stratification key of the split).
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for the train/test split and forwarded to lazypredict.

    Attributes
    ----------
    models
        Score table returned by ``LazyClassifier.fit``; ``None`` until ``fit()`` has run.
    predictions
        Per-model test-set predictions returned by ``LazyClassifier.fit``;
        ``None`` until ``fit()`` has run.
    """

    def __init__(self, data, features_for_classification, test_size = 0.2, random_state = 42):
        self.data = data
        self.features_for_classification = features_for_classification
        self.test_size = test_size
        self.random_state = random_state

        # Predictors are every column that is not a target.
        self.features = self.data.drop(self.features_for_classification, axis = 1)
        self.targets = self.data[self.features_for_classification]
        # Stratify so each class keeps the same share in the train and test parts.
        self.features_train, self.features_test, self.targets_train, self.targets_test = train_test_split(self.features, self.targets, test_size = self.test_size, random_state = self.random_state, stratify = self.targets)

        # predictions=True: otherwise lazypredict does not collect per-model
        # predictions and the second value returned by fit() is just the score
        # table again, which made get_predictions() return the leaderboard.
        # random_state: keep the models seeded consistently with the split.
        self.lazyClassifier = LazyClassifier(verbose = 0, ignore_warnings = True, custom_metric = None, predictions = True, random_state = self.random_state)
        self.models = None
        self.predictions = None

    @staticmethod
    def _as_1d(targets):
        """Return a single-column target frame as a 1-D Series; leave anything else untouched."""
        if getattr(targets, 'ndim', 1) == 2 and targets.shape[1] == 1:
            return targets.iloc[:, 0]
        return targets

    def fit(self):
        """Train and score every lazypredict classifier; store the results on the instance.

        A single target column is handed over as a 1-D Series so that every
        model yields 1-D predictions that can be collected into one table.
        """
        self.models, self.predictions = self.lazyClassifier.fit(
            self.features_train,
            self.features_test,
            self._as_1d(self.targets_train),
            self._as_1d(self.targets_test),
        )

    def get_models(self):
        """Return the score table from the last ``fit()`` (``None`` before fitting)."""
        return self.models

    def get_predictions(self):
        """Return the per-model predictions from the last ``fit()`` (``None`` before fitting)."""
        return self.predictions

In [8]:
# Predict the crop ('Name'); every other column of pca_soil_nutrients is used as a predictor.
cropClassifier = CropClassifier(
    data = pca_soil_nutrients,
    features_for_classification = ['Name'],
)
cropClassifier.fit()

 84%|████████▍ | 27/32 [00:36<00:04,  1.07it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 12320, number of used features: 4
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.091042


100%|██████████| 32/32 [00:38<00:00,  1.19s/it]


In [9]:
# Classifier score table, rendered with five decimal places.
classifier_scores = cropClassifier.get_models()
models = classifier_scores.style.format(precision = 5)
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
QuadraticDiscriminantAnalysis,0.99968,0.99968,,0.99968,0.07812
ExtraTreesClassifier,0.99903,0.99903,,0.99902,0.46812
SVC,0.99838,0.99838,,0.99838,0.66935
KNeighborsClassifier,0.99838,0.99838,,0.99838,0.20307
RandomForestClassifier,0.99805,0.99805,,0.99805,3.46897
LinearDiscriminantAnalysis,0.99675,0.99675,,0.99675,0.07855
LGBMClassifier,0.99675,0.99675,,0.99675,1.84009
BaggingClassifier,0.99578,0.99578,,0.99578,0.64945
LabelPropagation,0.99578,0.99578,,0.99578,5.36289
LabelSpreading,0.99513,0.99513,,0.99513,9.44028


In [10]:
class YieldRegressor:
    """Benchmark lazypredict's regressors on a single train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictor columns and the target column(s).
    features_for_regression : list of str
        Despite the name, these are the *target* column(s): they are dropped
        from ``data`` to form the predictors and are used as the values to predict.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for the train/test split and forwarded to lazypredict.

    Attributes
    ----------
    models
        Score table returned by ``LazyRegressor.fit``; ``None`` until ``fit()`` has run.
    predictions
        Per-model test-set predictions returned by ``LazyRegressor.fit``;
        ``None`` until ``fit()`` has run.
    """

    def __init__(self, data, features_for_regression, test_size = 0.2, random_state = 42):
        self.data = data
        self.features_for_regression = features_for_regression
        self.test_size = test_size
        self.random_state = random_state

        # Predictors are every column that is not a target.
        self.features = self.data.drop(self.features_for_regression, axis = 1)
        self.targets = self.data[self.features_for_regression]
        self.features_train, self.features_test, self.targets_train, self.targets_test = train_test_split(self.features, self.targets, test_size = self.test_size, random_state = self.random_state)

        # predictions=True: otherwise lazypredict does not collect per-model
        # predictions and the second value returned by fit() is just the score
        # table again, which made get_predictions() return the leaderboard.
        # random_state: keep the models seeded consistently with the split.
        self.lazyRegressor = LazyRegressor(verbose = 0, ignore_warnings = True, custom_metric = None, predictions = True, random_state = self.random_state)
        self.models = None
        self.predictions = None

    @staticmethod
    def _as_1d(targets):
        """Return a single-column target frame as a 1-D Series; leave anything else untouched."""
        if getattr(targets, 'ndim', 1) == 2 and targets.shape[1] == 1:
            return targets.iloc[:, 0]
        return targets

    def fit(self):
        """Train and score every lazypredict regressor; store the results on the instance.

        A single target column is handed over as a 1-D Series: several
        regressors return 2-D predictions for a column-vector target, and those
        cannot be collected into one predictions table.
        """
        self.models, self.predictions = self.lazyRegressor.fit(
            self.features_train,
            self.features_test,
            self._as_1d(self.targets_train),
            self._as_1d(self.targets_test),
        )

    def get_models(self):
        """Return the score table from the last ``fit()`` (``None`` before fitting)."""
        return self.models

    def get_predictions(self):
        """Return the per-model predictions from the last ``fit()`` (``None`` before fitting)."""
        return self.predictions

In [11]:
# Predict 'Yield'; every other column of pca_soil_nutrients is used as a predictor.
yieldRegressor = YieldRegressor(
    data = pca_soil_nutrients,
    features_for_regression = ['Yield'],
)
yieldRegressor.fit()

100%|██████████| 42/42 [11:34<00:00, 16.54s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 787
[LightGBM] [Info] Number of data points in the train set: 12320, number of used features: 4
[LightGBM] [Info] Start training from score 22.684950





In [12]:
# Regressor score table, rendered with five decimal places.
regressor_scores = yieldRegressor.get_models()
models = regressor_scores.style.format(precision = 5)
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.99362,0.99362,1.26621,0.04304
LGBMRegressor,0.99347,0.99348,1.28036,0.14655
ExtraTreesRegressor,0.99345,0.99346,1.28269,2.553
GaussianProcessRegressor,0.9889,0.98891,1.66968,37.73679
BaggingRegressor,0.98886,0.98887,1.67298,0.75809
RandomForestRegressor,0.9888,0.98882,1.67686,6.98978
HistGradientBoostingRegressor,0.98824,0.98826,1.71832,0.36945
GradientBoostingRegressor,0.98811,0.98813,1.72768,2.10192
ExtraTreeRegressor,0.98802,0.98804,1.73441,0.0363
DecisionTreeRegressor,0.9784,0.97843,2.3291,0.13614
