In [1]:
    import pandas as pd
    import pandas_profiling as pp
    import numpy as np
    import seaborn as sns
    import seaborn as sns
    import matplotlib.pyplot as plt
    import warnings
    import os
    import plotly.graph_objects as go
    import plotly.io as pio
    import pickle
    from sklearn.utils import resample
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve
    from sklearn.model_selection import train_test_split, cross_val_score, KFold
    from sklearn.pipeline import Pipeline, make_pipeline
    # Tuning
    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import RFE
    from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, LabelEncoder

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier

    # Silence every Python warning for the rest of the session, including any
    # raised by the libraries imported above.
    warnings.filterwarnings('ignore')
    # Seaborn "whitegrid" style with the axes grid switched off.
    sns.set_style("whitegrid", {'axes.grid' : False})
    # Use "plotly_white" as the default template for plotly figures.
    pio.templates.default = "plotly_white"

    def explore_data(df):
        """Print a quick structural overview of a DataFrame.

        Shows the shape, the column index and the ``DataFrame.info()`` summary
        (dtypes, non-null counts, memory usage).

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to describe. It is not modified.

        Returns
        -------
        None
        """
        print("Number of Instances and Attributes:", df.shape)
        print('\n')
        print('Dataset columns:',df.columns)
        print('\n')
        # DataFrame.info() writes its report itself and returns None, so wrapping
        # it in print() emitted the report first and a stray "None" after the
        # label. Print the label, then let info() write its own output.
        print('Data types of each columns: ')
        df.info()

    def checking_removing_duplicates(df):
        """Report the number of duplicated rows in ``df`` and drop them in place.

        The frame passed in is mutated (``drop_duplicates(inplace=True)``) when at
        least one duplicated row exists; nothing is returned.
        """
        n_duplicates = df.duplicated().sum()
        print("Number of Duplicates: ", n_duplicates)

        # Guard clause: nothing to drop.
        if n_duplicates < 1:
            print('No Duplicate values')
            return

        df.drop_duplicates(inplace=True)
        print('Duplicate values removed!')

    # Split the data into training and validation sets
    def read_in_and_split_data(data, target, test_size=0.2, random_state=0):
        """Separate features from the target column and make a train/validation split.

        Parameters
        ----------
        data : pandas.DataFrame
            Full dataset, including the target column.
        target : str
            Name of the column to predict; it is removed from the feature matrix.
        test_size : float, optional
            Passed to ``train_test_split``. Defaults to 0.2, the value that was
            previously hard-coded.
        random_state : int, optional
            Seed passed to ``train_test_split``. Defaults to 0, the value that was
            previously hard-coded.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        X = data.drop(columns=target)
        y = data[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    # Spot-check algorithms: baseline classifiers with default settings
    def GetModel():
        """Return the baseline classifiers as a list of ``(name, estimator)`` pairs.

        Every estimator is built with its default arguments, except the SVC,
        which is created with ``probability=True``.
        """
        return [
            ('LR'   , LogisticRegression()),
            ('LDA'  , LinearDiscriminantAnalysis()),
            ('KNN'  , KNeighborsClassifier()),
            ('CART' , DecisionTreeClassifier()),
            ('NB'   , GaussianNB()),
            ('SVM'  , SVC(probability=True)),
        ]

    def ensemblemodels():
        """Return the ensemble classifiers as a list of ``(name, estimator)`` pairs.

        Every estimator is built with its default arguments.
        """
        return [
            ('AB'   , AdaBoostClassifier()),
            ('GBM'  , GradientBoostingClassifier()),
            ('RF'   , RandomForestClassifier()),
            ( 'Bagging' , BaggingClassifier()),
            ('ET', ExtraTreesClassifier()),
        ]

    def NormalizedModel(nameOfScaler):
        """Build one ``scaler -> classifier`` pipeline per candidate model.

        Parameters
        ----------
        nameOfScaler : str
            Which preprocessing step to put in front of every classifier:
            ``'standard'`` (StandardScaler), ``'minmax'`` (MinMaxScaler),
            ``'normalizer'`` (Normalizer) or ``'binarizer'`` (Binarizer).

        Returns
        -------
        list of (str, Pipeline)
            Pairs whose name is ``nameOfScaler`` followed by the model
            abbreviation (for example ``'minmaxLR'``). Each pipeline has a
            ``'Scaler'`` step followed by the classifier step.

        Raises
        ------
        ValueError
            If ``nameOfScaler`` is not one of the supported names. Previously an
            unknown name fell through the if/elif chain and failed later with an
            UnboundLocalError on ``scaler``.
        """
        # Factories rather than instances: evaluation is deferred so that every
        # pipeline receives its own scaler object instead of sharing one.
        scaler_factories = {
            'standard': lambda: StandardScaler(),
            'minmax': lambda: MinMaxScaler(),
            'normalizer': lambda: Normalizer(),
            'binarizer': lambda: Binarizer(),
        }
        if nameOfScaler not in scaler_factories:
            supported = ', '.join(repr(key) for key in scaler_factories)
            raise ValueError(
                f"Unknown scaler name {nameOfScaler!r}; expected one of: {supported}"
            )
        make_scaler = scaler_factories[nameOfScaler]

        # (pipeline-name suffix, pipeline step name, estimator class)
        # NOTE(review): the gradient-boosting step is named 'GMB' (sic). It is kept
        # unchanged because step names are part of the returned pipelines
        # (e.g. 'GMB__n_estimators' in a parameter grid); rename deliberately.
        classifiers = [
            ('LR'  , 'LR'  , LogisticRegression),
            ('LDA' , 'LDA' , LinearDiscriminantAnalysis),
            ('KNN' , 'KNN' , KNeighborsClassifier),
            ('CART', 'CART', DecisionTreeClassifier),
            ('NB'  , 'NB'  , GaussianNB),
            ('SVM' , 'SVM' , SVC),
            ('AB'  , 'AB'  , AdaBoostClassifier),
            ('GBM' , 'GMB' , GradientBoostingClassifier),
            ('RF'  , 'RF'  , RandomForestClassifier),
            ('ET'  , 'ET'  , ExtraTreesClassifier),
        ]

        return [
            (nameOfScaler + suffix,
             Pipeline([('Scaler', make_scaler()), (step_name, estimator_cls())]))
            for suffix, step_name, estimator_cls in classifiers
        ]

    # Train model
    def fit_model(X_train, y_train,models):
        """Cross-validate each candidate model and print its mean/std accuracy.

        Every ``(name, estimator)`` pair in ``models`` is scored with
        ``cross_val_score`` using a 10-fold shuffled ``KFold`` (``random_state=0``)
        and the ``'accuracy'`` metric. One summary line per model is printed.

        Returns
        -------
        tuple
            ``(names, results)``: the model names in iteration order and, aligned
            with them, the per-fold score arrays.
        """
        n_splits = 10
        metric = 'accuracy'

        names, results = [], []
        for label, estimator in models:
            splitter = KFold(n_splits=n_splits, shuffle=True, random_state=0)
            fold_scores = cross_val_score(
                estimator, X_train, y_train, cv=splitter, scoring=metric
            )
            results.append(fold_scores)
            names.append(label)
            print("%s: %f (%f)" % (label, fold_scores.mean(), fold_scores.std()))

        return names, results

    # Save trained model
    def save_model(model,filename):
        """Serialize ``model`` to ``filename`` with pickle.

        Parameters
        ----------
        model : object
            Any picklable object (here, a fitted estimator or pipeline).
        filename : str or path-like
            Destination file; it is created or overwritten.

        Returns
        -------
        None
        """
        # The context manager guarantees the handle is flushed and closed even if
        # pickling fails; the previous bare open() left that to garbage collection.
        with open(filename, 'wb') as model_file:
            pickle.dump(model, model_file)

    # Performance Measure
    def classification_metrics(model, conf_matrix, X_train=None, y_train=None,
                               X_test=None, y_test=None, y_pred=None):
        """Print accuracy scores, plot the confusion matrix, print the class report.

        Parameters
        ----------
        model : fitted estimator
            Must expose ``score(X, y)``.
        conf_matrix : array-like
            Confusion matrix to draw as a heatmap.
        X_train, y_train, X_test, y_test, y_pred : optional
            Data used for the scores and for ``classification_report``. Each one
            left as ``None`` falls back to the notebook-level variable of the same
            name, which is what this function always read implicitly before; pass
            them explicitly to avoid depending on hidden notebook state.

        Raises
        ------
        NameError
            If an argument is omitted and no notebook-level variable with that
            name exists.
        """
        def _resolve(value, name):
            # Backward compatibility: use the module/notebook global when the
            # caller did not supply the value.
            if value is not None:
                return value
            try:
                return globals()[name]
            except KeyError:
                raise NameError(
                    f"name '{name}' is not defined; pass it to classification_metrics"
                ) from None

        X_train = _resolve(X_train, 'X_train')
        y_train = _resolve(y_train, 'y_train')
        X_test = _resolve(X_test, 'X_test')
        y_test = _resolve(y_test, 'y_test')
        y_pred = _resolve(y_pred, 'y_pred')

        print(f"Training Accuracy Score: {model.score(X_train, y_train) * 100:.1f}%")
        print(f"Validation Accuracy Score: {model.score(X_test, y_test) * 100:.1f}%")

        fig,ax = plt.subplots(figsize=(8,6))
        # Draw on the axes created above explicitly instead of relying on the
        # pyplot "current axes".
        sns.heatmap(pd.DataFrame(conf_matrix), annot = True, cmap = 'YlGnBu',fmt = 'g', ax = ax)
        ax.xaxis.set_label_position('top')
        plt.tight_layout()
        plt.title('Confusion Matrix', fontsize=20, y=1.1)
        plt.ylabel('Actual label', fontsize=15)
        plt.xlabel('Predicted label', fontsize=15)
        plt.show()

        print(classification_report(y_test, y_pred))
        
    # ROC_AUC
    def roc_auc(y_test, y_pred):
        """Plot the ROC curve from ``roc_curve(y_test, y_pred)`` and print its AUC.

        The curve is drawn in orange against a dashed dark-blue diagonal from
        (0, 0) to (1, 1); the area under the curve is printed as a percentage.
        """
        false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_pred)

        plt.figure(figsize=(8,6))
        area = auc(false_pos_rate, true_pos_rate)
        print(f"roc_auc score: {area*100:.1f}%")

        plt.plot(false_pos_rate, true_pos_rate, color='orange', label='ROC')
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

        plt.xlabel('False Positive Rate',fontsize=12)
        plt.ylabel('True Positive Rate', fontsize=12)
        plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=20)
        plt.legend()
        plt.show()

In [2]:
# Load the crop-recommendation CSV into `df`.
# NOTE: the path is absolute and machine-specific (/content/...); adjust it when
# running this notebook elsewhere.
df = pd.read_csv('/content/Crop_recommendation (1).csv')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee


In [3]:
# Print the shape, the column names and the DataFrame.info() summary of `df`.
explore_data(df)

Number of Instances and Attributes: (2200, 8)


Dataset columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
Data types of each columns:  None


In [4]:
# Report duplicated rows; the helper drops them from `df` in place if any exist.
checking_removing_duplicates(df)


Number of Duplicates:  0
No Duplicate values


In [5]:
# Count missing values per column.
df.isna().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [6]:
# Remove outliers with the 1.5 * IQR rule, applied to the numeric columns only.
# NOTE(review): the original note here said every column except the label
# contains outliers and suggested a boxplot to check; that is not verified here.
# The string `label` column is excluded explicitly: a quantile is undefined for
# it, and recent pandas raises on non-numeric columns instead of skipping them.
numeric_features = df.select_dtypes(include='number')
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when any numeric value lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for its column.
is_outlier = (numeric_features < (Q1 - 1.5 * IQR)) | (numeric_features > (Q3 + 1.5 * IQR))
df_out = df[~is_outlier.any(axis=1)]

In [7]:
# Train model
# Split features/target and hold out a validation set.
# NOTE: the split is made from the raw `df`, not from the outlier-filtered
# `df_out` built in the previous cell.
target ='label'
X_train, X_test, y_train, y_test = read_in_and_split_data(df, target)

# Cross-validate the unscaled baseline classifiers on the training split.
models = GetModel()
names,results = fit_model(X_train, y_train,models)

LR: 0.955114 (0.016749)
LDA: 0.965909 (0.013203)
KNN: 0.979545 (0.005207)
CART: 0.983523 (0.010914)
NB: 0.993750 (0.004720)
SVM: 0.976705 (0.008214)


In [8]:
# Cross-validate every candidate behind a MinMaxScaler step.
# Note: the names are bound to `name` (singular) here, so `names` from the
# baseline run is left untouched while `results` is overwritten.
ScaledModel = NormalizedModel('minmax')
name,results = fit_model(X_train, y_train, ScaledModel)

minmaxLR: 0.932955 (0.025763)
minmaxLDA: 0.965909 (0.013203)
minmaxKNN: 0.978409 (0.008351)
minmaxCART: 0.984659 (0.011378)
minmaxNB: 0.993750 (0.004720)
minmaxSVM: 0.982955 (0.006723)
minmaxAB: 0.135795 (0.036862)
minmaxGBM: 0.988636 (0.005682)
minmaxRF: 0.994886 (0.003977)
minmaxET: 0.992045 (0.006327)


In [9]:
# Same comparison with a StandardScaler step; `ScaledModel`, `name` and
# `results` from the previous run are overwritten.
ScaledModel = NormalizedModel('standard')
name,results = fit_model(X_train, y_train, ScaledModel)

standardLR: 0.969886 (0.014827)
standardLDA: 0.965909 (0.013203)
standardKNN: 0.968750 (0.011149)
standardCART: 0.984091 (0.011307)
standardNB: 0.993750 (0.004720)
standardSVM: 0.982386 (0.008966)
standardAB: 0.140341 (0.036827)
standardGBM: 0.988636 (0.005682)
standardRF: 0.994318 (0.005682)
standardET: 0.991477 (0.005238)


In [10]:
# Same comparison with a Normalizer step; `ScaledModel`, `name` and `results`
# from the previous run are overwritten.
ScaledModel = NormalizedModel('normalizer')
name,results = fit_model(X_train, y_train, ScaledModel)

normalizerLR: 0.853409 (0.024871)
normalizerLDA: 0.934659 (0.012768)
normalizerKNN: 0.944318 (0.015622)
normalizerCART: 0.935795 (0.014827)
normalizerNB: 0.956250 (0.011090)
normalizerSVM: 0.951705 (0.012768)
normalizerAB: 0.152841 (0.033465)
normalizerGBM: 0.950000 (0.012396)
normalizerRF: 0.975568 (0.008821)
normalizerET: 0.978409 (0.005567)


In [19]:
# Final model: min-max scaling followed by Gaussian naive Bayes.
pipeline = make_pipeline(MinMaxScaler(),  GaussianNB())
# `model` is bound to whatever pipeline.fit() returns and is used for prediction
# and persistence below.
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test,y_pred)
classification_metrics(pipeline, conf_matrix)

# save model
# The context manager closes the file handle deterministically, so the pickle is
# fully written before a later cell reads it back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

Training Accuracy Score: 99.5%
Validation Accuracy Score: 99.3%
              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        18
      banana       1.00      1.00      1.00        18
   blackgram       1.00      1.00      1.00        22
    chickpea       1.00      1.00      1.00        23
     coconut       1.00      1.00      1.00        15
      coffee       1.00      1.00      1.00        17
      cotton       1.00      1.00      1.00        16
      grapes       1.00      1.00      1.00        18
        jute       0.88      1.00      0.93        21
 kidneybeans       1.00      1.00      1.00        20
      lentil       1.00      1.00      1.00        17
       maize       1.00      1.00      1.00        18
       mango       1.00      1.00      1.00        21
   mothbeans       1.00      1.00      1.00        25
    mungbean       1.00      1.00      1.00        17
   muskmelon       1.00      1.00      1.00        23
      orange     

In [15]:
# One hand-written observation. The values are listed in the same order as the
# feature columns of `df`: N, P, K, temperature, humidity, ph, rainfall.
N = 92
P = 46
K = 9
temperature = 20.82312
humidity = 80.00284
ph = 6.50232
rainfall = 2.93536


sample = [N, P, K, temperature, humidity, ph, rainfall]
# Reshape the flat list into a single-row 2-D array (1 sample x 7 features).
single_sample = np.array(sample).reshape(1,-1)
# NOTE(review): `model` is created in the cell above, yet this cell's execution
# count (15) is lower than that cell's (19); re-run top to bottom to confirm the
# displayed result.
pred = model.predict(single_sample)
# Show the predicted label in title case.
pred.item().title()

'Maize'

In [20]:
# Reload the pickled pipeline from disk; the context manager closes the file
# handle that the bare open() call used to leave dangling.
# NOTE: only unpickle files you trust, since pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    pic_model = pickle.load(model_file)

In [22]:
# Predict the same sample with the reloaded model, for comparison with the
# in-memory prediction above.
pre=pic_model.predict(single_sample)

In [23]:
# Display the prediction array returned by the reloaded model.
pre

array(['maize'], dtype='<U11')

In [24]:
# Take the first (and only) predicted label out of the array.
pre[0]

'maize'