<h2 style="color:blue">Project Library Imports</h2>

In [1]:
# for manipulations
import numpy as np
import pandas as pd

# for data visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# for interactivity
from ipywidgets import interact

%matplotlib widget

import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

#for machine learning processes
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# For Model Performance Evaluation
from sklearn.metrics import confusion_matrix

# For creating the Predictive Model

from sklearn.linear_model import LogisticRegression

ModuleNotFoundError: No module named 'ipympl'

<h2 style="color:blue">Exploratory Data Analysis (EDA)</h2>

In [None]:
# Load the crop-recommendation dataset; the path is relative to the working directory.
df = pd.read_csv("Data/Crop_recommendation.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# List the crops present in the dataset:
# value_counts() returns every unique 'label' value together with the number of rows it occurs in.
df.label.value_counts()

In [None]:
# Dataset-wide mean of every feature, across all crops.
# Each entry pairs the output template with the column whose mean fills it.
overall_mean_lines = (
    ("   Average Ratio of Nitrogen in the Soil : {0:.2f}", 'N'),
    ("Average Ratio of Phosphorous in the Soil : {0:.2f}", 'P'),
    ("  Average Ratio of Potassium in the Soil : {0:.2f}", 'K'),
    ("          Average Temperature in Celsius : {0:.2f}", 'temperature'),
    ("               Average Relative Humidity : {0:.2f}", 'humidity'),
    ("            Average PH Value of the Soil : {0:.2f}", 'ph'),
    ("                  Average Rainfall in mm : {0:.2f}", 'rainfall'),
)
for template, column in overall_mean_lines:
    print(template.format(df[column].mean()))

In [None]:
# Summary Statistics for each of the Crops 

@interact
def summary(crops = list(df['label'].value_counts().index)):
    """Print the minimum, mean and maximum N, P and K values for one crop.

    crops: a value of the 'label' column; @interact offers the distinct
    labels as the available choices.
    """
    crop_rows = df[df['label'] == crops]
    divider = "-"*30

    for nutrient, column in (("Nitrogen", 'N'), ("Phosphorous", 'P'), ("Potassium", 'K')):
        values = crop_rows[column]
        print(divider)
        print("STatistics for " + nutrient)
        print("Minimum " + nutrient + " required : ", values.min())
        print("Average " + nutrient + " required : ", values.mean())
        print("Maxiumm " + nutrient + " required : ", values.max())

    print(divider)
    print('Similar for other features')

In [None]:
def summary(crops):
    """Print the minimum, mean and maximum N, P and K values for one crop.

    crops: a value of the 'label' column used to select the crop's rows.
    """
    selected = df[df['label'] == crops]

    nutrient_columns = {"Nitrogen": 'N', "Phosphorous": 'P', "Potassium": 'K'}
    for nutrient, column in nutrient_columns.items():
        print("-"*30)
        print("STatistics for {}".format(nutrient))
        print("Minimum {} required : ".format(nutrient), selected[column].min())
        print("Average {} required : ".format(nutrient), selected[column].mean())
        print("Maxiumm {} required : ".format(nutrient), selected[column].max())

    print("-"*30)
    print('Similar for other features')

# Explicit-call form of interact: the distinct crop labels become the choices.
# The trailing ';' keeps the cell from echoing interact's return value.
crop_choices = list(df['label'].value_counts().index)
interact(summary, crops = crop_choices);

In [None]:
# Compare the average requirement of selected crops with the dataset-wide average.

all_features = list(df.columns.values)

# Only numeric columns can be averaged. The string-valued 'label' column is
# excluded from the choices, because picking it would make .mean() and the
# '{:.2f}' formatting below raise.
numeric_features = list(df.select_dtypes(include = 'number').columns)

@interact
def compare(conditions = numeric_features):
    """Print the overall mean of ``conditions`` next to the rice and mango means.

    conditions: name of a numeric column of ``df``.
    """
    print("Average Value for ", conditions, "is {0:.2f}".format(df[conditions].mean()))
    print('-'*40)
    print("Rice  : {0:.2f}".format(df[(df['label'] == 'rice')][conditions].mean()))
    print("Mango : {0:.2f}".format(df[(df['label'] == 'mango')][conditions].mean()))

In [None]:
# Lets make this function more Intuitive

@interact
def compare(conditions = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']):
    """List the crops with rows above, and with rows at or below, the overall mean.

    conditions: name of a numeric feature column of ``df``. 'humidity' is
    offered as well, so every feature used elsewhere in the notebook can
    be inspected.

    A crop appears in both lists when some of its rows are above the mean
    and others are not.
    """
    # Compute the dataset-wide mean once instead of once per comparison.
    overall_mean = df[conditions].mean()

    print("Crops which require greater than average", conditions, '\n')
    print(df[df[conditions] > overall_mean]['label'].unique())
    print('*'*40)
    print("Crops which require less than avergae", conditions, '\n')
    print(df[df[conditions] <= overall_mean]['label'].unique())

<h2 style="color:blue">Distribution</h2>

In [None]:
# One panel per feature on a 2x4 grid: (column, colour, x-axis caption).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
distribution_panels = [
    ('N', 'darkblue', "Ratio of Nitrogen"),
    ('P', 'darkblue', "Ratio of Phosphorous"),
    ('K', 'darkblue', "Ratio of Potassium"),
    ('temperature', 'black', "Temperature"),
    ('rainfall', 'grey', "Rainfall"),
    ('humidity', 'lightgreen', "Humidity"),
    ('ph', 'darkgreen', "pH Level"),
]

for position, (column, shade, caption) in enumerate(distribution_panels, start = 1):
    plt.subplot(2, 4, position)
    sns.distplot(df[column], color = shade)
    plt.xlabel(caption, fontsize = 12)
    plt.grid()

plt.suptitle("Distribution for Agriculture Conditions", fontsize = 20)
plt.show()

In [None]:
# Crops that have at least one row beyond each hand-picked threshold.
# Each entry pairs the printed caption with the boolean row mask it reports on.
extreme_conditions = [
    ("   Crops which requires very high ratio of Nitrogen content in Soil : ", df['N'] > 120),
    ("Crops which requires very high ratio of Phosphorous content in Soil : ", df['P'] > 100),
    ("  Crops which requires very high ratio of Potassium content in Soil : ", df['K'] > 200),
    ("                            Crops which requires very high Rainfall : ", df['rainfall'] > 200),
    ("                          Crops which requires very low temperature : ", df['temperature'] < 10),
    ("                         Crops which requires very high temperature : ", df['temperature'] > 40),
    ("                             Crops which requires very low humidity : ", df['humidity'] < 20),
    ("                                   Crops which requires very low ph : ", df['ph'] < 4),
    ("                                  Crops which requires very high ph : ", df['ph'] > 9),
]

for caption, row_mask in extreme_conditions:
    print(caption, df[row_mask]['label'].unique())

In [None]:
# Group crops by season using simple temperature / rainfall / humidity rules.
# A crop is listed for a season when at least one of its rows matches the rule.
season_rules = [
    ("Summer Crops -->", (df['temperature'] > 30) & (df['humidity'] > 50)),
    ("Winter Crops -->", (df['temperature'] < 20) & (df['humidity'] > 30)),
    ("Rainy Crops -->", (df['rainfall'] > 200) & (df['humidity'] > 30)),
]

for heading, row_mask in season_rules:
    print(heading)
    print(df[row_mask]['label'].unique())
    print('*'*50)

<h2 style="color:blue">Data Pre-processing and Modelling</h2>

In [None]:
# Feature frame: every column of df except the 'label' column.
X = df.drop(['label'], axis = 1)

In [None]:
# Preview the feature frame (the 'label' column is gone).
X.head()

In [None]:
# Rebind X to the underlying NumPy ndarray of the feature frame.
# After this cell X is no longer a DataFrame.
X = X.values

In [None]:
# First two rows of the feature array.
X[:2]

In [None]:
# (number of rows, number of feature columns)
X.shape

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's
# within-cluster sum of squares (WCSS), which scikit-learn exposes as `inertia_`.
# The recorded values are plotted afterwards to choose the number of clusters.

plt.rcParams['figure.figsize'] = (10,4)

# Settings shared by every fit; only the number of clusters varies.
kmeans_options = dict(init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)

wcss = []

for i in range(1, 11):
    km = KMeans(n_clusters = i, **kmeans_options).fit(X)
    wcss.append(km.inertia_)
    print("Iteration ", i, " Done.")

In [None]:
# Inspect the recorded WCSS values, one per cluster count tried.
print(wcss)

In [None]:
plt.rc('figure', figsize = (10, 4))

# Plot WCSS against the number of clusters tried (1 through 10).
cluster_counts = range(1, 11)
plt.plot(cluster_counts, wcss)
plt.title("The Elbow Method", fontsize = 20)
plt.xlabel("No. of Clusters")
plt.ylabel("wcss")
plt.show()

The plot above shows two elbows, one at 3 clusters and another at 4. We take the last elbow, so we choose 4.

With this we can say that the data needs 4 clusters.

Let's implement K-Means clustering.

In [None]:
# Run K-Means with 4 clusters and pair every row's cluster id with its crop label.

km = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)

# One cluster id per row of X, held in a single-column frame (column name 0).
y_means = pd.DataFrame(km.fit_predict(X))

# Put the cluster ids and the crop labels side by side (aligned on the row index).
a = df['label']
z = pd.concat([y_means, a], axis = 1).rename(columns = {0: 'Cluster'})

z.head()

In [None]:
# List the distinct crops that fall into each of the four clusters.
print("Lets check the Results After Applying the K Means Clustering Analysis \n")

for cluster_id, ordinal in enumerate(("First", "Second", "Third", "Fourth")):
    members = z[z['Cluster'] == cluster_id]['label'].unique()
    print("Crops in " + ordinal + " Cluster : ", members)
    print('-'*50)

In [None]:
# Separate the dataset into the target labels (y) and the feature frame (X).
y = df['label']
X = df.drop(columns = ['label'])

X.shape, y.shape

In [None]:
# Hold out 20% of the rows for validation; random_state = 0 makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = .2, random_state = 0)

In [None]:
# Shapes of the training and validation features and targets.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Alternative solver configurations (commented out, not used):
# model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
# model = LogisticRegression(solver='lbfgs', multi_class='multinomial')

# Fit a logistic-regression classifier (liblinear solver, all other settings
# at their defaults) on the training split.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [None]:
# Predict a crop label for every row of the validation split.
y_pred = model.predict(X_valid)

In [None]:
# Fix the row/column order to the model's class list, so the tick labels are
# guaranteed to line up with the matrix cells even if a class is absent from
# the validation split.
class_names = model.classes_
cm = confusion_matrix(y_valid, y_pred, labels = class_names)

# Rows are the actual crops, columns the predicted crops.
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', xticklabels = class_names, yticklabels = class_names)
plt.title("Confusion Matrix for Logistic Regression", fontsize = 15)
plt.xlabel("Predicted crop")
plt.ylabel("Actual crop")
plt.show()

In [None]:
# Print the classification report for the validation predictions.
cr = classification_report(y_valid, y_pred)
print(cr)

<h2 style="color:blue">Predictions</h2>

In [None]:
# Describe the sample by feature name instead of by position. The model was
# fitted on a DataFrame, so a bare ndarray makes scikit-learn warn about
# missing feature names and silently depends on the column order being right.
sample = {
    'N': 90,            # Value for N
    'P': 40,            # Value for P
    'K': 40,            # Value for K
    'temperature': 20,  # Value for Temp
    'humidity': 80,     # Value for Humidity
    'ph': 7,            # Value for pH
    'rainfall': 200,    # Value for Rainfall
}

# columns = X_train.columns puts the values in the order used during training.
pred = model.predict(pd.DataFrame([sample], columns = X_train.columns))

print("The Suggested Crop for Given Climatic Condition is: ", pred)

In [None]:
# Describe the sample by feature name instead of by position. The model was
# fitted on a DataFrame, so a bare ndarray makes scikit-learn warn about
# missing feature names and silently depends on the column order being right.
sample = {
    'N': 89,            # Value for N
    'P': 60,            # Value for P
    'K': 17,            # Value for K
    'temperature': 25,  # Value for Temp
    'humidity': 57,     # Value for Humidity
    'ph': 6,            # Value for pH
    'rainfall': 100,    # Value for Rainfall
}

# columns = X_train.columns puts the values in the order used during training.
pred = model.predict(pd.DataFrame([sample], columns = X_train.columns))

print("The Suggested Crop for Given Climatic Condition is: ", pred)