In [1]:
# importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the crop recommendation dataset.
# The path is relative, so the CSV is looked up in the current working directory.

df = pd.read_csv(filepath_or_buffer="Crop_recommendation.csv")


In [3]:
# Exploration of the dataset
#
# A notebook cell only renders its *last* bare expression, so a bare
# `df.head()` in the middle of the cell is computed and then thrown away.
# `display` (made available by the IPython kernel) renders each table explicitly.

display(df.head())      # first rows, to eyeball the raw values
df.info()               # column dtypes and non-null counts (prints directly)
display(df.describe())  # summary statistics of the numeric columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            3000 non-null   int64  
 1   P            3000 non-null   int64  
 2   K            3000 non-null   int64  
 3   temperature  3000 non-null   float64
 4   humidity     3000 non-null   float64
 5   ph           3000 non-null   float64
 6   rainfall     3000 non-null   float64
 7   label        3000 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 187.6+ KB


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,60.549333,51.358667,47.407333,24.968993,68.430428,6.428686,286.762771
std,41.3285,29.52374,45.206953,5.7381,20.902451,0.711778,406.430256
min,0.0,5.0,5.0,7.015319,14.25804,3.504752,20.211267
25%,27.0,30.0,22.0,21.654108,55.045989,6.019265,71.203275
50%,52.0,49.0,34.0,25.346869,71.584142,6.416284,110.964352
75%,95.0,65.0,50.0,28.563593,84.679019,6.821754,241.356065
max,200.0,145.0,205.0,43.675493,99.981876,9.935091,2498.90653


In [None]:
# Print the number of unique crop types
print('Number of Crop types:', df['label'].nunique())

# Extract the labels from the DataFrame
labels_df = df['label']

# Count the rows per crop and turn the result into a two-column table
# ('label', 'count') ordered alphabetically by crop name.
#
# The column names are set explicitly instead of renaming whatever
# `reset_index()` produces: pandas >= 2.0 names the value_counts() result
# 'count' and its index 'label', whereas older versions name the result
# 'label' and leave the index unnamed. A rename map of
# {'index': 'label', 'label': 'count'} therefore yields two 'count'
# columns on pandas >= 2.0.
crops_labels = (
    labels_df.value_counts()
    .sort_index()                # alphabetical by crop name
    .rename_axis('label')        # name of the crop column, on every pandas version
    .reset_index(name='count')   # name of the frequency column
)

# Set the name of the index column
crops_labels.index.name = 'index'

# Print the per-crop counts
print(crops_labels)

In [None]:
# Distribution of the N, P and K columns, one panel per column.
# Uses the explicit fig/axes interface instead of repeated plt.subplot calls,
# gives every panel a title, and ends with plt.show() so the cell does not
# echo the repr of the last Axes object.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (column, color) in zip(axes, [('N', 'blue'), ('P', 'red'), ('K', 'green')]):
    sns.histplot(df[column], color=color, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of ph, temperature, humidity and rainfall on a 2x2 grid.
# Uses the explicit fig/axes interface instead of repeated plt.subplot calls,
# gives every panel a title, and ends with plt.show() so the cell does not
# echo the repr of the last Axes object.
fig, axes = plt.subplots(2, 2, figsize=(15, 8))

panels = [
    ('ph', 'blue'),
    ('temperature', 'red'),
    ('humidity', 'purple'),
    ('rainfall', 'green'),
]

# axes.flat walks the grid row by row, matching subplot positions 1..4.
for ax, (column, color) in zip(axes.flat, panels):
    sns.histplot(df[column], color=color, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

fig.tight_layout()
plt.show()


In [None]:
# Per-crop mean of every numeric measurement.
#
# The value columns are listed explicitly. Without `values=`, pivot_table
# aggregates *every* non-index column, so re-running this cell after the
# string-valued 'season' column has been added to `df` further down would
# try to average text (an error on recent pandas, a silently dropped column
# on older versions).
feature_columns = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

crop_desc = (
    pd.pivot_table(df, index='label', values=feature_columns, aggfunc='mean')
    .reset_index()   # make 'label' a regular column again
)

# Print the pivot table
print(crop_desc)

In [None]:
# Grouped bar chart of the per-crop mean N, P and K values taken from crop_desc:
# three bars per crop, centred on that crop's tick.
labels = crop_desc['label'].unique()
x = np.arange(len(labels))   # one x position per crop
width = 0.2                  # width of a single bar

n_value, p_value, k_value = crop_desc['N'], crop_desc['P'], crop_desc['K']

fig, ax = plt.subplots(1, 1, figsize=(17, 7))

# N is shifted one bar-width left of the tick, K one bar-width right.
n_bar = ax.bar(x - width, n_value, width, label='N')
p_bar = ax.bar(x, p_value, width, label='P')
k_bar = ax.bar(x + width, k_value, width, label='K')

ax.set_ylabel('kg/ha (Mean)')
ax.set_title('NPK Means by Crop')
ax.set_xticks(x, labels, rotation=45)
ax.legend()

# Annotate every bar with its value, formatted without decimals.
for bars in (n_bar, p_bar, k_bar):
    ax.bar_label(bars, padding=3, label_type='edge', fmt='%.f')

fig.tight_layout()

plt.show()

In [None]:
# Boxplot of soil pH for every crop.
fig, ax = plt.subplots(1, 1, figsize=(18, 5))

# Draw on the Axes created above explicitly instead of relying on it being
# pyplot's "current" Axes.
ph_boxplot = sns.boxplot(data=df, x='label', y='ph', ax=ax)

ax.set_xlabel('Crop', fontsize=14)
ax.set_ylabel('pH', fontsize=14)
ax.set_title('Boxplot - pH by Crop', fontsize=14)

# Rotate the existing tick labels in place. Re-assigning them through
# set_xticklabels(get_xticklabels()) rewrites the label text and can warn
# when the axis does not use a fixed locator.
ax.tick_params(axis='x', labelrotation=45)

plt.show()

In [10]:
# Data Preprocessing

# Number of missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()


N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [21]:


# Inclusive (low, high) bounds on temperature, humidity and rainfall for each
# season. Units are whatever the dataset columns use.
spring_threshold = {'temperature': (10, 35), 'humidity': (5, 80), 'rainfall': (40, 200)}
summer_threshold = {'temperature': (30, 45), 'humidity': (40, 80), 'rainfall': (0, 120)}
monsoon_threshold = {'temperature': (20, 38), 'humidity': (55, 95), 'rainfall': (100, 5000)}
autumn_threshold = {'temperature': (20, 35), 'humidity': (30, 95), 'rainfall': (25, 900)}
winter_threshold = {'temperature': (5, 30), 'humidity': (30, 90), 'rainfall': (0, 700)}

# Seasons in the order they are tested. The ranges overlap, so this order
# decides which season wins when several of them match.
SEASON_RULES = (
    ('Spring', spring_threshold),
    ('Summer', summer_threshold),
    ('Monsoon', monsoon_threshold),
    ('Autumn', autumn_threshold),
    ('Winter', winter_threshold),
)


def classify_season(row, default='Monsoon'):
    """Return the first season whose thresholds contain the row's readings.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide 'temperature', 'humidity' and 'rainfall'.
    default : str, optional
        Label returned when no season matches (including when a reading is
        NaN, since every comparison with NaN is false). Defaults to
        'Monsoon'.

    Returns
    -------
    str
        'Spring', 'Summer', 'Monsoon', 'Autumn', 'Winter', or ``default``.

    Raises
    ------
    KeyError
        If one of the three readings is missing from ``row``.
    """
    # Read all three values up front so a missing key always fails loudly,
    # regardless of which comparison would have short-circuited.
    readings = {
        feature: row[feature]
        for feature in ('temperature', 'humidity', 'rainfall')
    }

    for season, bounds in SEASON_RULES:
        if all(low <= readings[feature] <= high
               for feature, (low, high) in bounds.items()):
            return season

    # No rule matched. (A debugging print used to sit after this return and
    # could never run; it has been removed.)
    return default

# Label every row with a season, one classify_season call per row.
season_by_row = df.apply(classify_season, axis='columns')
df['season'] = season_by_row

# Persist the season-annotated rows (without the index) for the encoding step.
df.to_csv(path_or_buf='Crop_recommendation_with_season.csv', index=False)


In [22]:
from sklearn.preprocessing import LabelEncoder

# Read back the season-annotated rows from the CSV written earlier.
classified_data = pd.read_csv('Crop_recommendation_with_season.csv')

# Replace the season names by integer codes.
label_encoder = LabelEncoder()
season_codes = label_encoder.fit_transform(classified_data['season'])
classified_data['season'] = season_codes

# Show which integer each season name was mapped to.
encoded_classes = label_encoder.transform(label_encoder.classes_)
label_mapping = {
    name: code
    for name, code in zip(label_encoder.classes_, encoded_classes)
}
print("Label Mapping:", label_mapping)

# Write the encoded rows to a *separate* CSV; the season-name file is left untouched.
classified_data.to_csv('Crop_recommendation_with_season_labels.csv', index=False)

# Output recorded from an earlier run:
# Label Mapping: {'Autumn': 0, 'Monsoon': 1, 'Spring': 2, 'Summer': 3, 'Winter': 4}

Label Mapping: {'Autumn': 0, 'Monsoon': 1, 'Spring': 2, 'Summer': 3, 'Winter': 4}


In [26]:
# Reload the season-encoded dataset and separate the target from the predictors.
df1 = pd.read_csv('Crop_recommendation_with_season_labels.csv')
y = df1['label']                # target column
X = df1.drop(columns='label')   # every remaining column is used as a feature

In [27]:
# Classification Models

# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# Hyperparameter grid: split criterion and maximum depth
# (random_state is pinned to a single value so every candidate is deterministic).
param_dict = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}

# Template estimator for the search. It is intentionally not fitted here:
# GridSearchCV clones the estimator for every candidate/fold, so a fit on the
# template would be wasted work and would not influence the search.
test_dec_tree = DecisionTreeClassifier(random_state=2)

# 5-fold cross-validated grid search on the training split, using all CPU cores.
grid = GridSearchCV(test_dec_tree, param_dict, cv=5, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)



Fitting 5 folds for each of 22 candidates, totalling 110 fits
{'criterion': 'entropy', 'max_depth': 8, 'random_state': 2}
0.9709523809523809


In [None]:
# Fit a Decision Tree with fixed hyperparameters (the values the recorded grid
# search reported as best) and report its accuracy on both splits.

Dec_tree = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=2)
Dec_tree.fit(X_train, y_train)

# Accuracy on the rows the tree was trained on.
dt_train_score = Dec_tree.score(X_train, y_train)

# Predictions and accuracy on the held-out rows.
dt_predicted_values = Dec_tree.predict(X_test)
dt_test_score = metrics.accuracy_score(y_test, dt_predicted_values)

print(f'Decision Tree Train Accuracy is: {dt_train_score:.4f}')
print(f'Decision Tree Test Accuracy is: {dt_test_score:.4f}')

# Per-class precision / recall / F1 on the held-out rows.
dt_report = classification_report(y_test, dt_predicted_values, digits=4)
print(dt_report)


In [None]:
# Heatmap of the Decision Tree's confusion matrix on the test split.

from sklearn.metrics import confusion_matrix

# Fix the row/column order to the classes the tree was trained on, and reuse
# those names as tick labels. Without them the heatmap axes only show integer
# positions, which cannot be mapped back to crops by eye.
class_names = Dec_tree.classes_
cm_dt = confusion_matrix(y_test, dt_predicted_values, labels=class_names)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm_dt, annot=True, linewidth=0.5, fmt=".0f", cmap='crest',
    xticklabels=class_names, yticklabels=class_names, ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()


In [31]:
# Random Forest

# RandomForestClassifier with GridSearchCV for hyperparameter tuning

from sklearn.ensemble import RandomForestClassifier

# Template estimator for the search. It is intentionally not fitted here:
# GridSearchCV clones the estimator for every candidate/fold, so training the
# template first would only cost time without affecting the result.
test_rdf_clf = RandomForestClassifier(random_state=2)

# Hyperparameter grid: forest size, split criterion and maximum tree depth
# (random_state is pinned to a single value so every candidate is deterministic).
param_dict = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}

# 5-fold cross-validated grid search on the training split, using all CPU cores.
grid = GridSearchCV(test_rdf_clf, param_dict, cv=5, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)


{'criterion': 'gini', 'max_depth': 18, 'n_estimators': 100, 'random_state': 2}


In [None]:
# Fit a Random Forest with fixed hyperparameters (the values the recorded grid
# search reported as best) and report its accuracy on both splits.

rdf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=18,
    random_state=2,
)
rdf_clf.fit(X_train, y_train)

# Accuracy on the rows the forest was trained on.
rdf_train_score = rdf_clf.score(X_train, y_train)

# Predictions and accuracy on the held-out rows.
rdf_predicted_values = rdf_clf.predict(X_test)
rdf_test_score = metrics.accuracy_score(y_test, rdf_predicted_values)

print(f'Random Forest Train Accuracy is: {rdf_train_score:.4f}')
print(f'Random Forest Test Accuracy is: {rdf_test_score:.4f}')

# Per-class precision / recall / F1 on the held-out rows.
rdf_report = classification_report(y_test, rdf_predicted_values, digits=4)
print(rdf_report)


In [None]:
# Heatmap of the Random Forest's confusion matrix on the test split.

# Fix the row/column order to the classes the forest was trained on, and reuse
# those names as tick labels. Without them the heatmap axes only show integer
# positions, which cannot be mapped back to crops by eye.
class_names = rdf_clf.classes_
cm_rdf = confusion_matrix(y_test, rdf_predicted_values, labels=class_names)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm_rdf, annot=True, linewidth=0.5, fmt=".0f", cmap='crest',
    xticklabels=class_names, yticklabels=class_names, ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()

In [35]:
# K-Nearest Neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features for the distance-based model.
# The scaler learns its statistics from the training split only and then
# applies that same transformation to both splits.

sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)


In [None]:
# Sweep n_neighbors from 4 to 19 and record train/test accuracy for each k.
# NOTE(review): comparing k values on the test split lets the test data
# influence model selection; a validation split or cross-validation would
# avoid that. Left as-is here.

score_list = []
for i in range(4, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_scaled, y_train)

    knn_train_score = knn.score(X_train_scaled, y_train)

    knn_predicted_values = knn.predict(X_test_scaled)
    knn_test_score = metrics.accuracy_score(y_test, knn_predicted_values)

    score_list.append((i, knn_train_score, knn_test_score))

# Build the summary table once, after the sweep, instead of rebuilding an
# ever-growing DataFrame on every iteration.
score_knn_df = pd.DataFrame(score_list, columns=['k', 'Train Score', 'Test Score'])
print(score_knn_df)

In [None]:
# Fit the final K-nearest-neighbours model (k = 5) on the scaled training
# features and report its accuracy on both splits.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Accuracy on the rows the model was fitted on.
knn_train_score = knn.score(X_train_scaled, y_train)

# Predictions and accuracy on the held-out rows.
knn_predicted_values = knn.predict(X_test_scaled)
knn_test_score = metrics.accuracy_score(y_test, knn_predicted_values)

print(f'K-Nearest Neighbors Train Accuracy is : {knn_train_score :.4f}')
print(f'K-Nearest Neighbors Test Accuracy is : {knn_test_score :.4f}')

# Per-class precision / recall / F1 on the held-out rows.
knn_report = classification_report(y_test, knn_predicted_values, digits=4)
print(knn_report)


In [None]:
# Heatmap of the K-nearest-neighbours confusion matrix on the test split.

# Fix the row/column order to the classes the model was fitted on, and reuse
# those names as tick labels. Without them the heatmap axes only show integer
# positions, which cannot be mapped back to crops by eye.
class_names = knn.classes_
cm_knn = confusion_matrix(y_test, knn_predicted_values, labels=class_names)

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm_knn, annot=True, linewidth=0.5, fmt=".0f", cmap='crest',
    xticklabels=class_names, yticklabels=class_names, ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title('Predicted vs actual')
plt.show()

In [44]:
# Exporting Random Forest Model

from joblib import Parallel, delayed
import joblib

# Retrain the Random Forest on the complete dataset (X, y) rather than only
# the training split, with the same hyperparameters as the evaluated model.
# fit() returns the estimator itself, so the call can be chained.
final_rdf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=18,
    random_state=2,
).fit(X, y)

# NOTE(review): X includes the integer-encoded 'season' column, but only the
# classifier is serialised here; whoever loads the model needs the same season
# rules and name-to-code mapping to build inputs. Confirm this is handled
# elsewhere.
joblib.dump(final_rdf_clf, 'crop_rdf_clf.pkl')

['crop_rdf_clf.pkl']