In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/crop_dataset2.csv'
crop_data = pd.read_csv(file_path)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Separate features and target variable
ipt = crop_data.drop(columns=['Crop'])
opt = crop_data['Crop']

In [3]:
# Accessing the shape of the crop_data DataFrame
# This returns a tuple representing the number of rows and columns in the DataFrame
crop_data.shape

(3377934, 7)

In [4]:
# Split the data into training and testing sets
ipt_train, ipt_test, opt_train, opt_test = train_test_split(ipt, opt, test_size=0.2)



In [5]:
model = DecisionTreeClassifier(max_depth=10, min_samples_split=500, min_samples_leaf=100, max_features='sqrt')
# Fit the model on the training data
model.fit(ipt_train, opt_train)


In [6]:
# # Make predictions on the training data
predictions = model.predict(ipt_test)

In [7]:
# Calculate the accuracy score on the testing data
score = accuracy_score(opt_test, predictions)
print("Accuracy score:", score)


Accuracy score: 0.7032861792781685


In [8]:
# Importing the necessary modules for cross-validation and stratified k-fold
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Creating an instance of StratifiedKFold with 5 splits for cross-validation
folds = StratifiedKFold(n_splits = 5)


In [9]:
# Performing cross-validation using a Decision Tree classifier
cross_val_score(DecisionTreeClassifier(), ipt, opt)

array([0.79420267, 0.89811823, 0.84362488, 0.90868534, 0.67512056])

In [18]:
# Importing the necessary module for performing grid search cross-validation
from sklearn.model_selection import GridSearchCV


# Creating an instance of GridSearchCV with a Decision Tree classifier
# and defining the hyperparameter grid to search over
# 'max_depth', 'min_samples_split', 'min_samples_leaf', and 'max_features' are the hyperparameters being tuned
# The values within the square brackets are the different options to explore for each hyperparameter
# cv=5 specifies 5-fold cross-validation
# return_train_score=False means that the training scores won't be returned in the results
clf = GridSearchCV(DecisionTreeClassifier(),{
    'max_depth':[1,10,20],
    'min_samples_split':[50,100,500],
    'min_samples_leaf':[50,100] ,
    'max_features':['sqrt','log2','int','float']}, cv = 5, return_train_score = False)

In [19]:
# Fitting the GridSearchCV object to the input features (ipt) and target variable (opt)
# This performs the grid search cross-validation to find the best combination of hyperparameters
clf.fit(ipt, opt)

# Accessing the results of the cross-validation performed by the GridSearchCV object
# This provides information such as mean scores, standard deviations, and other metrics
clf.cv_results_

180 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/

{'mean_fit_time': array([4.97357097, 5.02930579, 5.08049183, 5.75502667, 4.94173307,
        4.88981543, 4.93555422, 5.58315296, 5.20459576, 5.25702562,
        5.07434626, 6.02022204, 0.31337929, 0.25847111, 0.20434275,
        0.2690001 , 0.24413147, 0.23870378, 0.26103916, 0.20436382,
        0.20977988, 0.2287087 , 0.31449156, 0.29974465, 7.27666554,
        8.13074985, 7.31128192, 7.2575902 , 7.45503535, 7.62967038,
        7.18094196, 6.86386132, 7.52019372, 7.42451334, 7.08145714,
        7.25779467, 0.19459581, 0.19461179, 0.19458413, 0.19482379,
        0.21587529, 0.29123611, 0.2887692 , 0.24842019, 0.19322515,
        0.19527006, 0.19276996, 0.19245572, 7.37935119, 7.15607185,
        6.80612864, 7.7371614 , 7.21475101, 7.18835917, 7.71402664,
        7.08921103, 7.07695842, 6.98282304, 7.56381998, 7.18724718,
        0.19222898, 0.19369512, 0.24983993, 0.28473682, 0.29153147,
        0.2128273 , 0.19118996, 0.19424872, 0.19182138, 0.19342661,
        0.19086566, 0.19086127]

In [20]:
# Creating a pandas DataFrame from the cross-validation results stored in clf.cv_results_
# This allows for easier analysis and visualization of the results
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.973571,0.889218,1.084962,0.317083,1,sqrt,50,50,"{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.302404,0.220389,0.211934,0.302402,0.220389,0.251504,0.041674,35
1,5.029306,0.662919,1.140770,0.328910,1,sqrt,50,100,"{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.211937,0.220389,0.302402,0.220389,0.302403,0.251504,0.041673,33
2,5.080492,0.924286,1.060652,0.261287,1,sqrt,50,500,"{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.227577,0.302402,0.302402,0.302402,0.302403,0.287437,0.029930,25
3,5.755027,0.759030,0.986951,0.227996,1,sqrt,100,50,"{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.302404,0.220389,0.211934,0.220389,0.302403,0.251504,0.041674,34
4,4.941733,0.817320,1.021519,0.329682,1,sqrt,100,100,"{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.227577,0.302402,0.302402,0.302402,0.302403,0.287437,0.029930,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.194249,0.005281,0.000000,0.000000,20,float,50,100,"{'max_depth': 20, 'max_features': 'float', 'mi...",,,,,,,,37
68,0.191821,0.002945,0.000000,0.000000,20,float,50,500,"{'max_depth': 20, 'max_features': 'float', 'mi...",,,,,,,,37
69,0.193427,0.006332,0.000000,0.000000,20,float,100,50,"{'max_depth': 20, 'max_features': 'float', 'mi...",,,,,,,,37
70,0.190866,0.002625,0.000000,0.000000,20,float,100,100,"{'max_depth': 20, 'max_features': 'float', 'mi...",,,,,,,,37


In [21]:
# Accessing the best mean cross-validated score from the grid search
# This represents the highest average performance achieved by the model with the best hyperparameter combination
clf.best_score_

0.7777437358487849

In [22]:
# Accessing the best hyperparameters found during the grid search
# These are the optimal values for the hyperparameters that yielded the best mean cross-validated score
clf.best_params_

{'max_depth': 20,
 'max_features': 'log2',
 'min_samples_leaf': 50,
 'min_samples_split': 500}