# Crop Recommendation System
Author:'Najaf-Ali'

## Importing all required libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score



## Loading the dataset


In [None]:
# Column legend: N = nitrogen, P = phosphorous, K = potassium;
# they are given in kg/hectare.
data = pd.read_csv("Crop_recommendation.csv")
data.head(n=5)

## Data Analysis
- There are 2200 entries/rows and 8 columns in total.
- The data does not contain any null values.
- All input features are numerical, so they need no encoding; only the target column is non-numerical.
- This is a classification problem.



In [None]:
# Print the column index, then display how many rows carry each distinct
# value of the `label` column.
print(data.columns)
label_counts = data['label'].value_counts()
label_counts

## Machine Learning

### Data Splitting

In [17]:
from sklearn.preprocessing import LabelEncoder

# Features are every column except the target. The derived `labeled_y` column
# (added to `data` at the bottom of this cell) is dropped as well, so that
# re-running the cell can never leak the encoded target into X.
# errors='ignore' keeps the very first run working, when `labeled_y` does not exist yet.
X=data.drop(columns=['label','labeled_y'],errors='ignore')
y=data['label']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train,X_test,y_train,y_test=train_test_split(X,y, train_size=0.8,random_state=42)

# Integer-encode the target; the cross-validation calls in later cells read
# the encoded labels from data['labeled_y'].
le=LabelEncoder()
data['labeled_y']=le.fit_transform(data['label'])



### Selecting the best model based on cross validation score

In [None]:
# Baseline: fit CatBoost on the training split and score it on the hold-out split.
# verbose=0 silences CatBoost's per-iteration training log, which would otherwise
# be printed for this fit and again for every cross-validation fit below.
predictor=CatBoostClassifier(verbose=0)
predictor.fit(X_train,y_train)
y_pred=predictor.predict(X_test)

print("accuracy score is ",accuracy_score(y_test,y_pred))
# cross_val_score works on the full feature matrix X with the integer-encoded
# labels stored in data['labeled_y']; the mean over the folds is reported.
print('Generalization score is ',cross_val_score(predictor,X,data['labeled_y']).mean())
print("Classification report is \n",classification_report(y_test,y_pred))
print("confusion_matrix\n",confusion_matrix(y_test,y_pred))




In [None]:

from sklearn.pipeline import Pipeline

# Candidate classifiers as an ordered list of (display name, estimator) pairs.
# A list (rather than a set) keeps the evaluation order, and therefore the
# tie-breaking between equally scoring models, identical on every run.
models=[
    ('Logistic Regression',LogisticRegression(random_state=42)),
    ('Decision Tree',DecisionTreeClassifier(random_state=42)),
    ('Random Forest',RandomForestClassifier(random_state=42)),
    ('Support Vector Machine',SVC(random_state=42)),
    ('Xg boost',XGBClassifier(random_state=42)),
    ('Naive Bayes',GaussianNB())
]

# Integer-encoded versions of the hold-out labels, produced with the encoder that
# was fitted in the data-splitting cell. Using the same encoding for the hold-out
# fit and for cross-validation keeps the two scores comparable for every candidate.
y_train_encoded=le.transform(y_train)
y_test_encoded=le.transform(y_test)

best_model_name=None
best_model=None
best_accuracy=0
best_val_score=0
for name,model in models:
    pipeline=Pipeline(
        steps=[
            ("models",model)
        ]
    )
    # Hold-out accuracy of *this* candidate: fit on the training split and
    # predict the test split (instead of reusing predictions from another model).
    pipeline.fit(X_train,y_train_encoded)
    accuracy=accuracy_score(y_test_encoded,pipeline.predict(X_test))

    # Cross-validated accuracy on the full dataset; this is the selection criterion.
    cross_validation_score=cross_val_score(pipeline,X,data['labeled_y'])
    mean_accuracy=cross_validation_score.mean()

    print("The name of the model is ",name)
    print("The accuracy score of model is ",accuracy)
    print("The generalization score of model is ",mean_accuracy)
    print(" ")

    if  mean_accuracy>best_val_score:
        best_model_name=name
        best_accuracy=accuracy
        best_model=model
        best_val_score=mean_accuracy

print("The name of best model is ",best_model_name)
print("The accuracy score of best model is ",best_accuracy)
print("The cross validation score of best model is ",best_val_score)
print("The best model is ",best_model)

# Refit the winner on the original (un-encoded) labels so that predict() returns
# values of the `label` column rather than integer codes.
# NOTE(review): if the XGBoost candidate wins, recent xgboost releases may reject
# non-integer class labels here; confirm against the installed version.
best_model.fit(X_train,y_train)

# Wrap the sample in a DataFrame with the training column names so the input
# matches the feature layout the model was fitted on.
sample=pd.DataFrame(
    [[104,18,30,23.60301571,60.39647474,6.779832611000002,140.9370415]],
    columns=X_train.columns,
)
prediction=best_model.predict(sample)[0]
prediction

### Saving the best model

In [34]:
import pickle

# Persist the selected model to disk. The context manager closes the file handle
# (flushing the pickle to disk) even if serialisation raises.
with open("Crop_Recommendation_model.pkl",'wb') as model_file:
    pickle.dump(best_model,model_file)