<a href="https://colab.research.google.com/github/Garima918/Suitable_Crop_Prediction_via_Multi-Class_Classification/blob/main/Suitable_Crop_Prediction_via_Multi_Class_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dataset Loading**

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset is read from a path under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import sklearn as sk

# The CSV is read from the mounted Google Drive. A dataset can equally be loaded from a
# local repository/drive or fetched through a dataset library such as Kaggle.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/Datasets/Crop_recommendation.csv'
df = pd.read_csv(DATASET_PATH)

**Describing Dataset**

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" to the output.
df.info()
print(df.describe())
# If the non-null count reported for every column equals the total number of entries,
# we can conclude that there are no missing values.

**Visualization**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Overlay the distributions of three columns on a single histogram.
# More columns can be added to the list (with a matching palette entry), and the
# remaining parameters can be changed as required.
columns_to_plot = [df['temperature'], df['humidity'], df['rainfall']]
sns.histplot(
    data=columns_to_plot,
    bins=50,
    kde=True,
    alpha=0.8,
    palette=['orange', 'blue', 'green'],
)

In [None]:
# Count the rows that carry each value of the 'label' column, print the counts,
# then draw one bar per label value.
df_g = df.groupby(by='label')
df_g_c = df_g['label'].count()
print(df_g_c)

label_names, label_counts = df_g_c.index, df_g_c.values
sns.barplot(y=label_counts, hue=label_names)

In [None]:
# Correlation is computed on numeric columns only: nominal categorical data must be
# excluded (not encoded) for correlation analysis.
numeric_columns = df.select_dtypes(include='number')
k = numeric_columns.corr()

# 'bwr' is used here; other matplotlib colormaps such as 'Reds' or 'Oranges' work too
# (colormap names are case-sensitive).
sns.heatmap(data=k, annot=True, cmap='bwr')

In [None]:
# Pairwise relationships between the columns of df, coloured by the 'label' column.
sns.pairplot(data=df, hue='label')

**Dataset Splitting**

In [None]:
# Separate the target from the predictors: 'label' is the target column and every
# remaining column is kept as a feature.
y = df['label']
x = df.drop(columns=['label'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=9,
)

# pmf = "performance metrics file".
# This extra DataFrame starts empty; each model cell stores its performance scores in it
# as one column, so the trained models can be compared side by side.
pmf = pd.DataFrame()

**Data Preprocessing**

**Handling Class Imbalance**

In [None]:
# Class-imbalance handling is currently disabled: both resampling variants below are
# commented out and kept only for reference.
# NOTE(review): the SMOTE variant resamples the full x / y rather than the training split.
# If it is re-enabled, consider resampling x_train / y_train only (as the RandomOverSampler
# variant does) so that the test set contains no synthetic samples.
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state=9)
#X, Y = sm.fit_resample(x, y)
#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=9)
#x_train_ros, y_train_ros = ros.fit_resample(x_train, y_train)

**Data Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split, so no test-set statistics influence the fit.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)  # fit only on train
x_test_scaled = scaler.transform(x_test)        # transform test with the train statistics

# Wrap the scaled arrays back into DataFrames. Re-using the original row index keeps the
# scaled frames aligned with y_train / y_test; without it they would get a fresh
# 0..n-1 index that no longer matches the target Series.
X_train_df = pd.DataFrame(x_train_scaled, columns=x.columns, index=x_train.index)
X_test_df = pd.DataFrame(x_test_scaled, columns=x.columns, index=x_test.index)
print(x_train_scaled)

**Performance Measures**


In [None]:
#Importing modules for Performance Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
#from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Row labels for the metrics table. Each model cell lists its scores in this same order.
metrics_indicator = [
    'Precision',
    'Accuracy',
    'Recall',
    'F1 Score',
]

In [None]:
# A Series has no columns to group on, so it is grouped by its own values; count()
# then tells us how many instances of each category are in the testing dataset.
y1 = y_test.groupby(by=y_test).count()
print(y1)

**Fitting Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression is sensitive to feature scale, so it is trained and evaluated on
# the standardised frames built in the "Data Scaling" cell (X_train_df / X_test_df)
# instead of the raw features. max_iter is raised above the default of 100 to give the
# solver room to converge.
log_r = LogisticRegression(max_iter=1000)
log_r.fit(X_train_df, y_train)
log_r_pred = log_r.predict(X_test_df)

# Precision, recall and F1 use the "weighted" average across classes.
log_r_precision_score = precision_score(y_test, log_r_pred, average="weighted")
log_r_accuracy_score = accuracy_score(y_test, log_r_pred)
log_r_recall_score = recall_score(y_test, log_r_pred, average="weighted")
log_r_f1_score = f1_score(y_test, log_r_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["Log_R"] = pd.Series(
    [log_r_precision_score, log_r_accuracy_score, log_r_recall_score, log_r_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm = confusion_matrix(y_test, log_r_pred, labels=log_r.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=log_r.classes_)
disp.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Logistic Regression')
plt.show()

**Fitting Decision Tree Model**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed (same seed as the train/test split) so the tree, and therefore
# the reported scores, are reproducible across runs.
dt = DecisionTreeClassifier(random_state=9)
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)

# Precision, recall and F1 use the "weighted" average across classes.
dt_precision_score = precision_score(y_test, dt_pred, average="weighted")
dt_accuracy_score = accuracy_score(y_test, dt_pred)
dt_recall_score = recall_score(y_test, dt_pred, average="weighted")
dt_f1_score = f1_score(y_test, dt_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["DT"] = pd.Series(
    [dt_precision_score, dt_accuracy_score, dt_recall_score, dt_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm1 = confusion_matrix(y_test, dt_pred, labels=dt.classes_)
disp1 = ConfusionMatrixDisplay(confusion_matrix=cm1, display_labels=dt.classes_)
disp1.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Decision Tree')
plt.show()

**Fitting Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the train/test split) so the forest, and therefore
# the reported scores, are reproducible across runs.
rfc = RandomForestClassifier(random_state=9)
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)

# Precision, recall and F1 use the "weighted" average across classes.
rfc_precision_score = precision_score(y_test, rfc_pred, average="weighted")
rfc_accuracy_score = accuracy_score(y_test, rfc_pred)
rfc_recall_score = recall_score(y_test, rfc_pred, average="weighted")
rfc_f1_score = f1_score(y_test, rfc_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["RFC"] = pd.Series(
    [rfc_precision_score, rfc_accuracy_score, rfc_recall_score, rfc_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm2 = confusion_matrix(y_test, rfc_pred, labels=rfc.classes_)
disp2 = ConfusionMatrixDisplay(cm2, display_labels=rfc.classes_)
disp2.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Random Forest')
plt.show()


**Fitting KNN Model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN classifies by distance between samples, so features on larger numeric ranges would
# dominate the neighbour search. It is therefore trained and evaluated on the
# standardised frames built in the "Data Scaling" cell (X_train_df / X_test_df).
knn = KNeighborsClassifier()
knn.fit(X_train_df, y_train)
knn_pred = knn.predict(X_test_df)

# Precision, recall and F1 use the "weighted" average across classes.
knn_precision_score = precision_score(y_test, knn_pred, average="weighted")
knn_accuracy_score = accuracy_score(y_test, knn_pred)
knn_recall_score = recall_score(y_test, knn_pred, average="weighted")
knn_f1_score = f1_score(y_test, knn_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["KNN"] = pd.Series(
    [knn_precision_score, knn_accuracy_score, knn_recall_score, knn_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm3 = confusion_matrix(y_test, knn_pred, labels=knn.classes_)
disp = ConfusionMatrixDisplay(cm3, display_labels=knn.classes_)
disp.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('KNN')
plt.show()

**Fitting Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import GaussianNB

# Re-bind each split to its own copy before fitting.
x_train = x_train.copy()
y_train = y_train.copy()
x_test = x_test.copy()
y_test = y_test.copy()

# Fit Gaussian Naive Bayes on the training split and predict the test split.
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_pred = nb.predict(x_test)

# Accuracy, plus precision / recall / F1 with the "weighted" average across classes.
nb_accuracy_score = accuracy_score(y_test, nb_pred)
nb_precision_score = precision_score(y_test, nb_pred, average="weighted")
nb_recall_score = recall_score(y_test, nb_pred, average="weighted")
nb_f1_score = f1_score(y_test, nb_pred, average="weighted")

# Add the scores to pmf as one column; values follow the order of metrics_indicator.
nb_scores = [nb_precision_score, nb_accuracy_score, nb_recall_score, nb_f1_score]
pmf["NB"] = pd.DataFrame({"NB": nb_scores}, index=metrics_indicator)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm4 = confusion_matrix(y_test, nb_pred, labels=nb.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm4, display_labels=nb.classes_)
disp.plot()
plt.title("Naive Bayes")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()


**Fitting Gradient Boosting Model**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# random_state is fixed (same seed as the train/test split) so the boosted ensemble, and
# therefore the reported scores, are reproducible across runs.
gb = GradientBoostingClassifier(random_state=9)
gb.fit(x_train, y_train)
gb_pred = gb.predict(x_test)

# Precision, recall and F1 use the "weighted" average across classes.
gb_precision_score = precision_score(y_test, gb_pred, average="weighted")
gb_accuracy_score = accuracy_score(y_test, gb_pred)
gb_recall_score = recall_score(y_test, gb_pred, average="weighted")
gb_f1_score = f1_score(y_test, gb_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["GB"] = pd.Series(
    [gb_precision_score, gb_accuracy_score, gb_recall_score, gb_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm5 = confusion_matrix(y_test, gb_pred, labels=gb.classes_)
disp = ConfusionMatrixDisplay(cm5, display_labels=gb.classes_)
disp.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title("Gradient Boosting")
plt.show()


**Fitting Ada Boost Model**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# random_state is fixed (same seed as the train/test split) so the boosted ensemble, and
# therefore the reported scores, are reproducible across runs.
ab = AdaBoostClassifier(random_state=9)
ab.fit(x_train, y_train)
ab_pred = ab.predict(x_test)

# Precision, recall and F1 use the "weighted" average across classes.
ab_precision_score = precision_score(y_test, ab_pred, average="weighted")
ab_accuracy_score = accuracy_score(y_test, ab_pred)
ab_recall_score = recall_score(y_test, ab_pred, average="weighted")
ab_f1_score = f1_score(y_test, ab_pred, average="weighted")

# Store the scores as one column of pmf; values are listed in the order of metrics_indicator.
pmf["AB"] = pd.Series(
    [ab_precision_score, ab_accuracy_score, ab_recall_score, ab_f1_score],
    index=metrics_indicator,
)

# Confusion matrix with rows/columns ordered by the classes the model learned.
cm6 = confusion_matrix(y_test, ab_pred, labels=ab.classes_)
disp = ConfusionMatrixDisplay(cm6, display_labels=ab.classes_)
disp.plot()
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title("Ada Boost")
plt.show()


**Writing Performance Metrics to File**

In [None]:
#pmf_ad = pd.DataFrame.from_dict(pmf, orient='index', columns=['Score'])
# Write the metrics table (one column per model) to a CSV in the current working directory.
# The file name previously started with a stray space ("./ Suitable_..."), which created a
# file whose name begins with a blank; the space is removed here.
pmf.to_csv("./Suitable_Crop_Performance_Prediction_Of_Models.csv")