In [2]:
# importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# loading the dataset

df = pd.read_csv("/content/crop_production.csv")


In [None]:
#Exploration of Dataset

df.head()
df.info()
df.describe()


In [None]:
#check the unique values in dataset
df.apply(lambda x: len(x.unique()))
# check for categorical attributes
cat_col = []
for x in df.dtypes.index:
    if df.dtypes[x] == 'object':
        cat_col.append(x)
cat_col

In [None]:
# print the categorical columns
for col in cat_col:
    print(col)
    print(df[col].value_counts())
    print()

In [9]:
# Set the figure size
plt.figure(figsize=(15, 5))

# Plot histogram for Area
plt.subplot(1, 2, 1)
sns.histplot(df['Area'], color='blue', kde=True)
plt.title('Area Distribution')
plt.xlabel('Area')
plt.ylabel('Frequency')

# Plot histogram for Production
plt.subplot(1, 2, 2)
sns.histplot(df['Production'], color='red', kde=True)
plt.title('Production Distribution')
plt.xlabel('Production')
plt.ylabel('Frequency')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Creating a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(df['Crop_Year'], df['Production'], color='red', alpha=0.5)
plt.title('Production Over Years')
plt.xlabel('Year')
plt.ylabel('Production')
plt.grid(True)
plt.show()

In [None]:
#it show the count of each crop type
plt.figure(figsize=(10,8))
sns.countplot(x='Crop', data = df)

In [None]:
# Data Preprocessing

# Checking missing values of the dataset in each column
df.isnull().sum()


In [None]:
# Dropping missing values
df = df.dropna()
# Checking missing values of the dataset in each column
df.isnull().sum()

In [None]:
# Adding a new column Yield which indicates Production per unit Area.

df['Yield'] = (df['Production'] / df['Area'])
df.head(10)

In [None]:
# importing label encoder for converting categorical
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders for 'District_Name', 'Crop', and 'Season'
district_encoder = LabelEncoder()
crop_encoder = LabelEncoder()
season_encoder = LabelEncoder()

# Encode 'District_Name', 'Crop', and 'Season' columns
df['State'] = district_encoder.fit_transform(df['State_Name'])
df['District'] = district_encoder.fit_transform(df['District_Name'])
df['crop'] = crop_encoder.fit_transform(df['Crop'])
df['season'] = season_encoder.fit_transform(df['Season'])

# Drop the original categorical columns
df.drop(['State_Name','District_Name', 'Crop', 'Season'], axis=1, inplace=True)
data = df

# Display the encoded dataset
print(df.head())

In [None]:
#correlation heatmap
plt.figure(figsize=[10,8])
sns.heatmap(df.corr(),annot=True)
plt.show()

In [14]:

X = data.drop(["Production","Yield"], axis=1)
y = data["Production"]



In [None]:
# Classification Models

#Decision Tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import metrics


# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=3)

# Define a dictionary containing the parameters to be tuned and their respective values
param_dict = {
    'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}

# Create a Base Decision Tree Model
test_dec_tree = DecisionTreeRegressor(random_state=2)
test_dec_tree.fit(X_train, y_train)

# Performing hyperparameter tuning for the DecisionTreeRegressor using GridSearchCV
grid = GridSearchCV(test_dec_tree, param_dict, cv=2, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

# Output the best parameters and best score




In [16]:
from sklearn.metrics import mean_squared_error, r2_score

# Training a DecisionTreeRegressor with specified hyperparameters
# and evaluating its performance on the training and testing datasets

# Creating DecisionTreeRegressor
Dec_tree = DecisionTreeRegressor(max_depth=8, criterion='poisson', random_state=2)

# Fitting the Model:
Dec_tree.fit(X_train, y_train)

# Evaluating on training data
dt_train_predicted_values = Dec_tree.predict(X_train)
dt_train_mse = mean_squared_error(y_train, dt_train_predicted_values)
dt_train_r2 = r2_score(y_train, dt_train_predicted_values)
print(f'Decision Tree Train MSE: {dt_train_mse:.4f}')
print(f'Decision Tree Train R2 Score: {dt_train_r2:.4f}')

# Evaluating on testing data
dt_test_predicted_values = Dec_tree.predict(X_test)
dt_test_mse = mean_squared_error(y_test, dt_test_predicted_values)
dt_test_r2 = r2_score(y_test, dt_test_predicted_values)
print(f'Decision Tree Test MSE: {dt_test_mse:.4f}')
print(f'Decision Tree Test R2 Score: {dt_test_r2:.4f}')


Decision Tree Train MSE: 18610902052310.1602
Decision Tree Train R2 Score: 0.9366
Decision Tree Test MSE: 25020956658256.1797
Decision Tree Test R2 Score: 0.6722


In [None]:
import matplotlib.pyplot as plt

# Plotting actual vs predicted values
plt.figure(figsize=(10, 7))
plt.scatter(y_test, dt_test_predicted_values, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values (DecisionTreeRegressor)')
plt.show()


In [None]:
# Random Forest

# RandomForestRegressor with GridSearchCV for hyperparameter tuning

from sklearn.ensemble import RandomForestRegressor
#Initializing and Fitting the RandomForestRegressor:
test_rdf_clf = RandomForestRegressor(random_state=2)
test_rdf_clf.fit(X_train, y_train)
#Defining Hyperparameters for GridSearchCV:
param_dict = {
    'n_estimators': [50, 100, 150, 200],
    'criterion':['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
    'random_state': [2]
}
#Performing GridSearchCV:
grid = GridSearchCV(test_rdf_clf, param_dict, cv=2, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)


In [20]:
#Initializing and Fitting the RandomForestRegressor:

rdf_clf = RandomForestRegressor(n_estimators=50, criterion='poisson', max_depth=8, random_state=2)
rdf_clf.fit(X_train, y_train)
rdf_train_score = rdf_clf.score(X_train, y_train)
print(f'Random Forest Train Accuracy is: {rdf_train_score:.4f}')
rdf_predicted_values = rdf_clf.predict(X_test)
rdf_test_score = metrics.accuracy_score(y_test, rdf_predicted_values)
print(f'Random Forest Test Accuracy is: {rdf_test_score:.4f}')
rdf_report = mean_squared_error(y_test, rdf_predicted_values)
print(rdf_report)


Random Forest Train Accuracy is: 0.9573


ValueError: continuous is not supported

In [None]:
import matplotlib.pyplot as plt

# Plotting actual vs predicted values
plt.figure(figsize=(10, 7))
plt.scatter(y_test, dt_test_predicted_values, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values (RandomForestRegressor)')
plt.show()


In [None]:
#Exporting Random Forest Model

from joblib import Parallel, delayed
import joblib

final_rdf_clf = RandomForestRegressor(n_estimators=50, criterion='poisson', max_depth=8, random_state=2)
final_rdf_clf.fit(X,y)

joblib.dump(final_rdf_clf, 'yield_rdf_clf.pkl')