In [2]:
# importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the fertilizer CSV into the working DataFrame used by every later cell.

df = pd.read_csv(filepath_or_buffer="/content/fert_dataset.csv")


In [None]:
# Exploration of the dataset
#
# A notebook cell only renders the value of its final expression, so the row
# preview and the summary statistics are handed to display() explicitly
# (display is injected into the namespace by IPython/Jupyter).
# df.info() prints its report itself and needs no wrapper.

display(df.head())
df.info()
display(df.describe())


In [6]:
# Number of distinct values in every column. The result is shown explicitly
# because it is not the last expression of the cell and would otherwise be
# silently discarded. dropna=False keeps a missing value counted as one
# distinct value, matching len(Series.unique()).
display(df.nunique(dropna=False))

# Collect the names of the categorical (object-dtype) columns; cat_col is
# reused by the value-count cell below.
cat_col = [name for name in df.dtypes.index if df.dtypes[name] == 'object']
cat_col

['Soil Type', 'Crop Type', 'Fertilizer Name']

In [None]:
# For each categorical column: its name, the frequency of every category,
# then a blank separator line.
for column in cat_col:
    print(column, df[column].value_counts(), "", sep="\n")

In [None]:
# Distributions of the three nutrient columns, side by side.
plt.figure(figsize = (15,5))

plt.subplot(1,3,1)
sns.histplot(df['Nitrogen'],color = 'blue', kde = True)

plt.subplot(1,3,2)
sns.histplot(df['Potassium'],color = 'red', kde = True)

plt.subplot(1,3,3)
sns.histplot(df['Phosphorous'],color = 'green', kde = True)

# Same finishing steps as the other histogram cell: keep the panels from
# overlapping and render the figure instead of echoing an Axes repr.
plt.tight_layout()
plt.show()

In [None]:
# Distributions of temperature, humidity and soil moisture on a 2x2 grid
# (the fourth slot stays empty).
plt.figure(figsize=(15, 8))

# (dataframe column, bar colour, x-axis label); 'Temparature' is the column
# name exactly as this notebook reads it from df.
panels = [
    ('Temparature', 'red', 'Temperature'),
    ('Humidity', 'purple', 'Humidity'),
    ('Soil Moisture', 'green', 'Soil Moisture'),
]
for slot, (column, shade, label) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    sns.histplot(df[column], color=shade, kde=True)
    plt.xlabel(label)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart of the number of rows per crop type.
plt.figure(figsize=(10, 8))
sns.countplot(data=df, x='Crop Type')

In [None]:
# Bar chart of the number of rows per fertilizer name (the prediction target).
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='Fertilizer Name')

In [None]:
# Bar chart of the number of rows per soil type.
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='Soil Type')

In [None]:
# Data Preprocessing

# Count the missing entries in each column of the dataset.
df.isna().sum()


In [None]:
# LabelEncoder maps each category string to an integer code.
from sklearn.preprocessing import LabelEncoder

# Replace the 'Soil Type' strings in df with their integer codes.
# Note: this overwrites the column in place.
encode_soil = LabelEncoder()
df['Soil Type'] = encode_soil.fit_transform(df['Soil Type'])

# Lookup table: original soil name (index) -> encoded value.
Soil_Type = pd.DataFrame(
    {'Encoded': encode_soil.transform(encode_soil.classes_)},
    index=pd.Index(encode_soil.classes_, name='Original'),
)
Soil_Type

In [None]:
# Replace the 'Crop Type' strings in df with their integer codes.
# Note: this overwrites the column in place.
encode_crop = LabelEncoder()
df['Crop Type'] = encode_crop.fit_transform(df['Crop Type'])

# Lookup table: original crop name (index) -> encoded value.
Crop_Type = pd.DataFrame(
    {'Encoded': encode_crop.transform(encode_crop.classes_)},
    index=pd.Index(encode_crop.classes_, name='Original'),
)
Crop_Type

In [None]:
# Replace the 'Fertilizer Name' strings (the target) in df with integer codes.
# Note: this overwrites the column in place.
encode_ferti = LabelEncoder()
df['Fertilizer Name'] = encode_ferti.fit_transform(df['Fertilizer Name'])

# Lookup table: original fertilizer name (index) -> encoded value. The final
# prediction cell uses it to turn a predicted code back into a name.
Fertilizer = pd.DataFrame(
    {'Encoded': encode_ferti.transform(encode_ferti.classes_)},
    index=pd.Index(encode_ferti.classes_, name='Original'),
)
Fertilizer


In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(10, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [27]:

# Target: the encoded fertilizer name. Features: every remaining column.
y = df["Fertilizer Name"]
X = df.drop(columns=["Fertilizer Name"])



In [28]:
# Classification Models

# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics


# Data splitting: 70% train / 30% test with a fixed seed. This split is
# reused by every model below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Hyperparameter grid: 2 criteria x 11 depths = 22 candidates.
param_dict = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}

# Base estimator handed to the search. It is deliberately NOT fitted here:
# GridSearchCV fits clones of it for every candidate/fold, so a separate
# fit beforehand would be wasted work whose result is never used.
test_dec_tree = DecisionTreeClassifier(random_state=2)

# 5-fold cross-validated grid search on the training split, using all cores.
grid = GridSearchCV(test_dec_tree, param_dict, cv=5, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

# Output the best parameters and the best mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)




Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'criterion': 'gini', 'max_depth': 6, 'random_state': 2}
0.9714285714285715


In [29]:
# Fit a decision tree with fixed hyperparameters (max_depth=6, gini) and
# report its accuracy on the training and test splits, followed by a
# per-class classification report for the test split.

Dec_tree = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=2).fit(X_train, y_train)

# Compute all metrics first ...
dt_predicted_values = Dec_tree.predict(X_test)
dt_train_score = Dec_tree.score(X_train, y_train)
dt_test_score = metrics.accuracy_score(y_test, dt_predicted_values)
dt_report = classification_report(y_test, dt_predicted_values, digits=4)

# ... then print them in the usual train / test / report order.
print(f'Decision Tree Train Accuracy is: {dt_train_score:.4f}')
print(f'Decision Tree Test Accuracy is: {dt_test_score:.4f}')
print(dt_report)


Decision Tree Train Accuracy is: 1.0000
Decision Tree Test Accuracy is: 0.9667
              precision    recall  f1-score   support

           0     1.0000    0.5000    0.6667         2
           1     1.0000    1.0000    1.0000         4
           2     0.6667    1.0000    0.8000         2
           3     1.0000    1.0000    1.0000         3
           4     1.0000    1.0000    1.0000         8
           5     1.0000    1.0000    1.0000         4
           6     1.0000    1.0000    1.0000         7

    accuracy                         0.9667        30
   macro avg     0.9524    0.9286    0.9238        30
weighted avg     0.9778    0.9667    0.9644        30



In [None]:
# Confusion-matrix heatmap for the decision tree's predictions on the test split.

from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted ones.
cm_dt = confusion_matrix(y_true=y_test, y_pred=dt_predicted_values)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm_dt, ax=ax, annot=True, fmt=".0f", linewidth=0.5, cmap='crest')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()


In [31]:
# Random Forest

# RandomForestClassifier with GridSearchCV for hyperparameter tuning

from sklearn.ensemble import RandomForestClassifier

# Base estimator handed to the search. It is deliberately NOT fitted here:
# GridSearchCV fits clones of it for every candidate/fold, so a separate
# fit beforehand would be wasted work whose result is never used.
test_rdf_clf = RandomForestClassifier(random_state=2)

# Hyperparameter grid: 4 forest sizes x 2 criteria x 11 depths = 88 candidates.
# Note: this reuses (and overwrites) the param_dict / grid names from the
# decision-tree search.
param_dict = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}

# 5-fold cross-validated grid search on the training split, using all cores.
grid = GridSearchCV(test_rdf_clf, param_dict, cv=5, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)


Fitting 5 folds for each of 88 candidates, totalling 440 fits
{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 50, 'random_state': 2}


In [None]:
# Fit a random forest with fixed hyperparameters (50 trees, entropy,
# max_depth=8) and report train/test accuracy plus a per-class report.

rdf_clf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=8,
    random_state=2,
).fit(X_train, y_train)

# Compute all metrics first ...
rdf_predicted_values = rdf_clf.predict(X_test)
rdf_train_score = rdf_clf.score(X_train, y_train)
rdf_test_score = metrics.accuracy_score(y_test, rdf_predicted_values)
rdf_report = classification_report(y_test, rdf_predicted_values, digits=4)

# ... then print them in the usual train / test / report order.
print(f'Random Forest Train Accuracy is: {rdf_train_score:.4f}')
print(f'Random Forest Test Accuracy is: {rdf_test_score:.4f}')
print(rdf_report)


In [None]:
# Confusion-matrix heatmap for the random forest's predictions on the test split.
# Rows are the actual classes, columns the predicted ones.
cm_rdf = confusion_matrix(y_true=y_test, y_pred=rdf_predicted_values)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm_rdf, ax=ax, annot=True, fmt=".0f", linewidth=0.5, cmap='crest')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()

In [34]:
# K-Nearest Neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features for KNN. The scaler learns its statistics from
# the training split only and is then applied to both splits, so nothing
# from the test split influences the scaling.

sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)


In [None]:
# Sweep n_neighbors from 4 to 19 and record train/test accuracy for each k.

score_list = []
for i in range(4, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_scaled, y_train)

    knn_train_score = knn.score(X_train_scaled, y_train)

    knn_predicted_values = knn.predict(X_test_scaled)
    knn_test_score = metrics.accuracy_score(y_test, knn_predicted_values)

    score_list.append((i, knn_train_score, knn_test_score))

# Build the summary table once, after the sweep, instead of re-creating it
# from the whole list on every iteration.
score_knn_df = pd.DataFrame(score_list, columns=['k', 'Train Score', 'Test Score'])
print(score_knn_df)

In [None]:
# Fit the KNN classifier with k=4 on the scaled training data and report
# train/test accuracy plus a per-class report.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_scaled, y_train)

# Compute all metrics first ...
knn_predicted_values = knn.predict(X_test_scaled)
knn_train_score = knn.score(X_train_scaled, y_train)
knn_test_score = metrics.accuracy_score(y_test, knn_predicted_values)
knn_report = classification_report(y_test, knn_predicted_values, digits=4)

# ... then print them in the usual train / test / report order.
print(f'K-Nearest Neighbors Train Accuracy is : {knn_train_score :.4f}')
print(f'K-Nearest Neighbors Test Accuracy is : {knn_test_score :.4f}')
print(knn_report)


In [None]:
# Confusion-matrix heatmap for the KNN classifier's predictions on the test split.
# Rows are the actual classes, columns the predicted ones.
cm_knn = confusion_matrix(y_true=y_test, y_pred=knn_predicted_values)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm_knn, ax=ax, annot=True, fmt=".0f", linewidth=0.5, cmap='crest')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()

In [39]:
# Export the random forest model

from joblib import Parallel, delayed
import joblib

# Refit the chosen configuration on the complete dataset (X, y) rather than
# only the training split, then persist the fitted model to disk.
final_rdf_clf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    max_depth=8,
    random_state=2,
).fit(X, y)

joblib.dump(final_rdf_clf, 'ferti_rdf_clf.pkl')

['ferti_rdf_clf.pkl']

In [40]:
# One sample to classify. The values are matched to features by position,
# i.e. in the column order of X; categorical features are given as their
# encoded integers.
predict_inputs = [[32,54,40,1,2,40,30,48]]

# The model was fitted on a DataFrame, so it remembers feature names. Wrap
# the raw list in a frame with the same columns; passing the bare list makes
# scikit-learn warn that the input has no valid feature names.
predict_frame = pd.DataFrame(predict_inputs, columns=X.columns)
encoded_predicted_fertilizer = final_rdf_clf.predict(predict_frame)[0]

# Retrieve the corresponding fertilizer name from the Fertilizer lookup table
# (its rows are ordered by encoded value, so the code works as a position).
predicted_fertilizer_name = Fertilizer.index[encoded_predicted_fertilizer]

# Persist the name <-> code lookup table alongside the exported model.
Fertilizer.to_csv('fertilizer_data.csv', index=True)

# Print the predicted fertilizer name
print("Predicted Fertilizer Name:", predicted_fertilizer_name)

Predicted Fertilizer Name: 14-35-14


