In [1]:
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv('crop_yield.csv')

In [3]:
df.head(5)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [4]:
df['Season'].unique()

array(['Whole Year ', 'Kharif     ', 'Rabi       ', 'Autumn     ',
       'Summer     ', 'Winter     '], dtype=object)

In [5]:
df.shape

(19689, 10)

In [6]:
# removing spaces prresent in season column
df['Season'] = df['Season'].str.strip()

In [7]:
# Convert Fertilizer and pesticide columns from kgs into tons

df['Fertilizer'] = df['Fertilizer'].apply(lambda x: x/1000)
df['Fertilizer'] = df['Fertilizer'].apply(lambda x: np.round(x,3))

df['Pesticide'] = df['Pesticide'].apply(lambda x: x/1000)
df['Pesticide'] = df['Pesticide'].apply(lambda x: np.round(x,3))

In [8]:
# dropping this row because it's a extreme outlier
df.drop(119, inplace=True)

In [9]:
# Categorize crop_year
bins = [1990, 2000, 2010, 2020]  # Example bins, adjust based on your data range
labels = ['90s', '2000s', '2010s']
df['Year_Interval'] = pd.cut(df['Crop_Year'], bins=bins, labels=labels)

In [10]:
df.drop(columns=['Crop_Year'],inplace=True)

In [11]:
df.shape

(19688, 10)

In [12]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Fertilizer', 'Pesticide', 'Yield', 'Year_Interval'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19688 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Crop             19688 non-null  object  
 1   Season           19688 non-null  object  
 2   State            19688 non-null  object  
 3   Area             19688 non-null  float64 
 4   Production       19688 non-null  int64   
 5   Annual_Rainfall  19688 non-null  float64 
 6   Fertilizer       19688 non-null  float64 
 7   Pesticide        19688 non-null  float64 
 8   Yield            19688 non-null  float64 
 9   Year_Interval    19688 non-null  category
dtypes: category(1), float64(5), int64(1), object(3)
memory usage: 1.5+ MB


In [14]:
df.describe()

Unnamed: 0,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19688.0,19688.0,19688.0,19688.0,19688.0,19688.0
mean,179935.2,16436780.0,1437.73409,24104.48,48.850667,79.958055
std,732846.3,263063500.0,816.924978,94948.27,213.292524,878.328316
min,0.5,0.0,301.3,0.054,0.0,0.0
25%,1390.0,1392.5,940.7,188.0085,0.357,0.6
50%,9316.5,13804.5,1247.3,1235.283,2.421,1.030111
75%,75119.0,122729.5,1643.7,10005.14,20.04475,2.388949
max,50808100.0,6326000000.0,6552.7,4835407.0,15750.511,21105.0


In [15]:
df['Year_Interval'].unique()

['90s', '2000s', '2010s']
Categories (3, object): ['90s' < '2000s' < '2010s']

In [16]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Fertilizer', 'Pesticide', 'Yield', 'Year_Interval'],
      dtype='object')

In [17]:
df.head(5)

Unnamed: 0,Crop,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield,Year_Interval
0,Arecanut,Whole Year,Assam,73814.0,56708,2051.4,7024.878,22.882,0.796087,90s
1,Arhar/Tur,Kharif,Assam,6637.0,4685,2051.4,631.643,2.057,0.710435,90s
2,Castor seed,Kharif,Assam,796.0,22,2051.4,75.755,0.247,0.238333,90s
3,Coconut,Whole Year,Assam,19656.0,126905000,2051.4,1870.662,6.093,5238.051739,90s
4,Cotton(lint),Kharif,Assam,1739.0,794,2051.4,165.501,0.539,0.420909,90s


In [18]:
num_col = ['Area','Production','Annual_Rainfall','Fertilizer','Pesticide']
ordinal_col = ['Season','Year_Interval']
nominal_col = ['Crop','State']
season_order = ['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']
year_order = ['90s','2000s','2010s']

In [19]:
df['Season'].unique()

array(['Whole Year', 'Kharif', 'Rabi', 'Autumn', 'Summer', 'Winter'],
      dtype=object)

In [20]:
from sklearn.preprocessing import RobustScaler, PowerTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [21]:
X = df.drop(columns=['Yield'])
y = df[['Yield']]

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
preprocessor = ColumnTransformer(transformers=[
    ('yeo_johnson_transform', PowerTransformer(method='yeo-johnson'), num_col),
    ('robust_scaler', RobustScaler(),num_col),
    ('Season_order', OrdinalEncoder(categories=[['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']]), ['Season']),
    ('Year_order',OrdinalEncoder(categories=[['90s','2000s','2010s']]), ['Year_Interval']),
    ('OHE', OneHotEncoder(drop='first',handle_unknown='ignore'), nominal_col)
], remainder='passthrough')

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif

In [25]:
# Feature selection using KBest method to reduce curse of dimensionality
kbest = SelectKBest(score_func=f_classif, k=30)

In [26]:
# After hyperparameter tuning best parameters are selected to reduce runtime and increse performance of model
rf = RandomForestRegressor(max_features=0.75, max_samples=0.75, n_estimators=400, n_jobs=-1)

In [27]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kbest', kbest),
    ('RF_regressor', rf)
])

In [28]:
pipe.fit(X_train,y_train.values.ravel())

In [29]:
y_pred = pipe.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score, r2_score
r2_score(y_test,y_pred)

0.9741363158896972