In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import xgboost as xgb

In [2]:
# Load the raw crop table; the path is relative to the notebook's working directory.
csv_path = 'Crop Classification 2.csv'
df_orig = pd.read_csv(csv_path, encoding='utf-8')

In [3]:
# Remove Crop categories with less than 5 % frequency.
# The rare categories are listed by name; a single membership mask drops
# them all in one pass while keeping the original row index.
rare_crops = ['Potato', 'Tobacco', 'Sugarcane', 'Fennel']
is_rare = df_orig['Crop Name'].isin(rare_crops)
dataframe_5per = df_orig[~is_rare]

# Relative frequency of each remaining crop.
print('Crop record frequency')
print(dataframe_5per['Crop Name'].value_counts(normalize=True))

Crop record frequency
Wheat       0.313736
Cumin       0.255218
Jowar       0.129166
Castor      0.085018
Maize       0.082013
Rapeseed    0.076822
Gram        0.058026
Name: Crop Name, dtype: float64


In [4]:
# Remove Crop categories with less than 7 % frequency.
# Only 'Gram' is dropped here; the filter is applied to dataframe_5per, so the
# categories already removed from that frame stay removed.
is_gram = dataframe_5per['Crop Name'] == 'Gram'

# Frequencies printed below are those of the unfiltered table (df_orig).
print('Crop record frequency :')
print(df_orig['Crop Name'].value_counts(normalize=True))

dataframe_7per = dataframe_5per[~is_gram]

Crop record frequency :
Wheat        0.304874
Cumin        0.248009
Jowar        0.125518
Castor       0.082617
Maize        0.079696
Rapeseed     0.074652
Gram         0.056387
Fennel       0.019911
Sugarcane    0.005044
Tobacco      0.002336
Potato       0.000956
Name: Crop Name, dtype: float64


In [5]:
# Positional column slices: columns 2..34 become the feature matrix and
# column 1 becomes the target vector used by the classifiers further down.
feature_frame = dataframe_7per.iloc[:, 2:35]
target_series = dataframe_7per.iloc[:, 1]
X = feature_frame.values
y = target_series.values

# Report (rows, columns) for the features and (rows,) for the target.
for array in (X, y):
    print(array.shape)

(17240, 33)
(17240,)


In [6]:
#SMOTE
# Oversample the least frequent class with synthetic samples.
#
# Changes from the earlier version of this cell:
#   * the strategy is passed by keyword (`sampling_strategy`), because current
#     imbalanced-learn releases reject positional constructor arguments;
#   * `fit_resample` replaces `fit_sample`, which has been removed;
#   * `random_state` is fixed so the synthetic samples are reproducible.
#
# NOTE(review): resampling is done on the full data set, before the
# train/test split.  Synthetic points interpolated from rows that later land
# in the test set can therefore end up in the training set (and vice versa),
# which makes the reported scores optimistic.  Consider splitting first and
# fitting SMOTE on the training portion only.
# NOTE(review): X and y are overwritten, so re-running this cell resamples
# the already-resampled data.
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority', random_state=0)
X, y = smote.fit_resample(X, y)
print(X.shape)
print(y.shape)

(21576, 33)
(21576,)


In [7]:
# Splitting the dataset into the Training set and Test set:
# 20 % of the rows are held out, class proportions are preserved via
# stratification on y, and the seed is fixed for repeatability.
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.20, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Feature Scaling
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to both the training and the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [9]:
# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# From the scikit-learn documentation:
#   n_components : int, optional (default=None)
#   Number of components (<= min(n_classes - 1, n_features)) for
#   dimensionality reduction.  If None, it is set to
#   min(n_classes - 1, n_features).
lda = LDA(n_components=5)

# Fit on the training rows (supervised: needs y_train), then project both sets.
lda.fit(X_train, y_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

# Share of variance explained by each retained discriminant component.
explained_variance = lda.explained_variance_ratio_

In [10]:
# Materialise the ratios once; only the last expression of the cell is shown,
# so the length is evaluated but not displayed.
variance_ratios = list(explained_variance)
len(variance_ratios)
variance_ratios

[0.6550290699132733,
 0.15033581311701677,
 0.11846804872174069,
 0.0670523768731492,
 0.009114691374820119]

In [23]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth is held at 7 while the number of boosting rounds
# and the learning rate are varied.
grid_param = dict(
    n_estimators=[800, 1000, 1200],
    learning_rate=[0.03, 0.04, 0.05, 0.06, 0.07, 0.08],
    max_depth=[7],
)

print('GridSearch started...')

#XGB Classifier
xg_cl = xgb.XGBClassifier(objective="multi:softprob")

# Exhaustive search scored on accuracy, run in a single process.  No `cv`
# argument is given, so the installed scikit-learn's default fold count applies.
gd_sr = GridSearchCV(
    estimator=xg_cl,
    param_grid=grid_param,
    scoring='accuracy',
    n_jobs=1,
)
gd_sr.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_parameters, best_result = gd_sr.best_params_, gd_sr.best_score_
print(best_parameters)
print(best_result)

GridSearch started...
{'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 800}
0.7086326767091541


In [24]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth is held at 7 while the number of boosting rounds
# and the learning rate are varied.
grid_param = dict(
    n_estimators=[500, 650, 800],
    learning_rate=[0.01, 0.02, 0.03],
    max_depth=[7],
)

print('GridSearch started...')

#XGB Classifier
xg_cl = xgb.XGBClassifier(objective="multi:softprob")

# Exhaustive search scored on accuracy, run in a single process.  No `cv`
# argument is given, so the installed scikit-learn's default fold count applies.
gd_sr = GridSearchCV(
    estimator=xg_cl,
    param_grid=grid_param,
    scoring='accuracy',
    n_jobs=1,
)
gd_sr.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_parameters, best_result = gd_sr.best_params_, gd_sr.best_score_
print(best_parameters)
print(best_result)

GridSearch started...
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
0.7146581691772885


In [12]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth is held at 7 while the number of boosting rounds
# and the learning rate are varied.
grid_param = dict(
    n_estimators=[300, 400, 500],
    learning_rate=[0.008, 0.009, 0.01],
    max_depth=[7],
)

print('GridSearch started...')

#XGB Classifier
xg_cl = xgb.XGBClassifier(objective="multi:softprob")

# Exhaustive search scored on accuracy, run in a single process.  No `cv`
# argument is given, so the installed scikit-learn's default fold count applies.
gd_sr = GridSearchCV(
    estimator=xg_cl,
    param_grid=grid_param,
    scoring='accuracy',
    n_jobs=1,
)
gd_sr.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_parameters, best_result = gd_sr.best_params_, gd_sr.best_score_
print(best_parameters)
print(best_result)

GridSearch started...
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
0.7130938586326767
