### Let's try CatBoost without any preprocessing other than removing instances with "yield_per_acre" outliers

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, precision_score, recall_score, mean_squared_error, r2_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import tree
from sklearn.model_selection import cross_val_score, cross_validate
import imblearn
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_text, plot_tree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import VarianceThreshold
from lime.lime_tabular import LimeTabularExplainer
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv("data/Train.csv", index_col=0)
df.head()

Unnamed: 0_level_0,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,SeedingSowingTransplanting,...,Harv_method,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_GTFAC7PEVWQ9,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,2022-07-21,...,machine,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600
ID_TK40ARLSPOKS,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTrac...,2022-07-18,5,Manual_PuddledRandom,2022-06-20,2022-07-20,...,hand,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600
ID_1FJY2CRIMLZZ,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,2022-08-13,...,hand,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225
ID_I3IPXS4DB7NE,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,2022-07-17,...,hand,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468
ID_4T8YQWXWHB4A,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,2022-07-20,...,machine,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550


In [3]:
# Calculate the yield per acre 
df['yield_per_acre'] = df['Yield'] / df['Acre'] 

In [4]:
# Drop the index column 
df = df.reset_index(drop=True)

In [5]:
# Calculate the 99th percentile threshold for 'yield_per_acre'
yield_per_acre_threshold = df['yield_per_acre'].quantile(0.99)

# Filter out rows where 'yield_per_acre' is above the 99th percentile
df_under99 = df[df['yield_per_acre'] <= yield_per_acre_threshold]
df_under99.head()

Unnamed: 0,District,Block,CultLand,CropCultLand,LandPreparationMethod,CropTillageDate,CropTillageDepth,CropEstMethod,RcNursEstDate,SeedingSowingTransplanting,...,Harv_date,Harv_hand_rent,Threshing_date,Threshing_method,Residue_length,Residue_perc,Stubble_use,Acre,Yield,yield_per_acre
0,Nalanda,Noorsarai,45,40,TractorPlough FourWheelTracRotavator,2022-07-20,5,Manual_PuddledRandom,2022-06-27,2022-07-21,...,2022-11-16,,2022-11-16,machine,30,40,plowed_in_soil,0.3125,600,1920.0
1,Nalanda,Rajgir,26,26,WetTillagePuddling TractorPlough FourWheelTrac...,2022-07-18,5,Manual_PuddledRandom,2022-06-20,2022-07-20,...,2022-11-25,3.0,2022-12-24,machine,24,10,plowed_in_soil,0.3125,600,1920.0
2,Gaya,Gurua,10,10,TractorPlough FourWheelTracRotavator,2022-06-30,6,Manual_PuddledRandom,2022-06-20,2022-08-13,...,2022-12-12,480.0,2023-01-11,machine,30,10,plowed_in_soil,0.148148,225,1518.75
3,Gaya,Gurua,15,15,TractorPlough FourWheelTracRotavator,2022-06-16,6,Manual_PuddledRandom,2022-06-17,2022-07-17,...,2022-12-02,240.0,2022-12-29,hand,26,10,plowed_in_soil,0.222222,468,2106.0
4,Nalanda,Noorsarai,60,60,TractorPlough WetTillagePuddling,2022-07-19,4,Manual_PuddledRandom,2022-06-21,2022-07-20,...,2022-11-30,,2022-12-02,machine,24,40,plowed_in_soil,0.46875,550,1173.333333


In [6]:
# Step 1: Identify categorical columns
categorical_features = df_under99.select_dtypes(include=['object', 'category']).columns.tolist()

numeric_features = df_under99.select_dtypes(exclude=['object', 'category']).columns.tolist()

In [7]:
# Step 2: Handle missing values
# For simplicity, let's use mean imputation for numeric columns and mode for categorical columns
numeric_imputer = SimpleImputer(strategy='mean')
df_under99[numeric_features] = numeric_imputer.fit_transform(df_under99[numeric_features])

categorical_imputer = SimpleImputer(strategy='most_frequent')
df_under99[categorical_features] = categorical_imputer.fit_transform(df_under99[categorical_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_under99[numeric_features] = numeric_imputer.fit_transform(df_under99[numeric_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_under99[categorical_features] = categorical_imputer.fit_transform(df_under99[categorical_features])


In [8]:
# Split the data after removing the outliers
X = df_under99.drop(columns=['Yield', 'yield_per_acre'])  # Drop original Yield and yield_per_acre
y = df_under99['yield_per_acre']  # Use yield_per_acre as the target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Step 5: Define the CatBoost Pools for training and testing, specifying categorical features
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)


In [10]:
# Step 6: Initialize and train the CatBoost regressor
model = CatBoostRegressor()
model.fit(train_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.060723
0:	learn: 506.5840216	test: 505.8246903	best: 505.8246903 (0)	total: 64.2ms	remaining: 1m 4s
100:	learn: 285.5346489	test: 303.5039057	best: 303.5037584 (99)	total: 437ms	remaining: 3.88s
200:	learn: 265.8111288	test: 296.3283436	best: 296.3149444 (198)	total: 798ms	remaining: 3.17s
300:	learn: 252.5834502	test: 293.8443741	best: 293.8443741 (300)	total: 1.17s	remaining: 2.72s
400:	learn: 241.7013804	test: 293.1824662	best: 292.9999828 (394)	total: 1.53s	remaining: 2.29s
500:	learn: 229.5165742	test: 291.6044233	best: 291.6044233 (500)	total: 1.92s	remaining: 1.91s
600:	learn: 220.7671509	test: 291.6430607	best: 291.3806317 (527)	total: 2.31s	remaining: 1.54s
700:	learn: 211.2183392	test: 291.2044314	best: 291.1649003 (695)	total: 2.71s	remaining: 1.15s
800:	learn: 202.2576557	test: 291.2637425	best: 290.9663164 (740)	total: 3.06s	remaining: 761ms
900:	learn: 194.2491757	test: 291.6712293	best: 290.9663164 (740)	total: 3.42s	remaining: 376ms
999:	learn: 18

<catboost.core.CatBoostRegressor at 0x134f496a0>

In [11]:
# Make predictions on the test set
y_pred = model.predict(test_pool)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")

Mean Absolute Error (MAE): 192.0871568336216
Mean Squared Error (MSE): 84661.39755186938
Root Mean Squared Error (RMSE): 290.9663168682406
R-squared (R2): 0.6846844773674758
