In [1]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation / unused-parameter notices.
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read through ordinary filesystem paths.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds all three CSV files.
DATA_DIR = '/content/drive/MyDrive/Data_Science/Data_Frames/Fertilizer'

# Raw reads: data1..data3 are kept untouched so the working copies below can
# be rebuilt without re-reading from Drive.
data1 = pd.read_csv(f'{DATA_DIR}/train.csv')
data2 = pd.read_csv(f'{DATA_DIR}/test.csv')
data3 = pd.read_csv(f'{DATA_DIR}/Fertilizer Prediction.csv')

# Working copies that the rest of the notebook modifies.
train_df = data1.copy()
test_df = data2.copy()
additional_df = data3.copy()

print("First 3 rows of train_df: \n", train_df.head(3))
print("First 3 rows of test_df: \n", test_df.head(3))
# The label previously said "test_df" for this frame as well.
print("First 3 rows of additional_df: \n", additional_df.head(3))

First 3 rows of train_df: 
    id  Temparature  Humidity  Moisture Soil Type  Crop Type  Nitrogen  \
0   0           37        70        36    Clayey  Sugarcane        36   
1   1           27        69        65     Sandy    Millets        30   
2   2           29        63        32     Sandy    Millets        24   

   Potassium  Phosphorous Fertilizer Name  
0          4            5           28-28  
1          6           18           28-28  
2         12           16        17-17-17  
First 3 rows of test_df: 
        id  Temparature  Humidity  Moisture Soil Type    Crop Type  Nitrogen  \
0  750000           31        70        52     Sandy        Wheat        34   
1  750001           27        62        45       Red    Sugarcane        30   
2  750002           28        72        28    Clayey  Ground Nuts        14   

   Potassium  Phosphorous  
0         11           24  
1         14           15  
2         15            4  
First 3 rows of test_df: 
    Temparature  Humi

In [4]:
#function to make the columns snake_case
import re

def to_snake_case(df):
    """Rename the columns of ``df`` to snake_case, in place.

    Each label is converted as follows:

    1. runs of whitespace and hyphens collapse into a single underscore,
    2. an underscore is inserted at every lower-case/digit -> upper-case
       boundary (``cropType`` -> ``crop_Type``),
    3. the result is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are overwritten with the converted labels.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).
    """
    def convert(name):
        # Labels are not guaranteed to be strings (e.g. integer labels).
        name = str(name)
        # Replace spaces and hyphens with underscores.
        name = re.sub(r'[\s\-]+', '_', name)
        # Split camelCase while the capitals still exist: lower-casing first
        # would leave this pattern with nothing to match. The lookbehind
        # demands a lower-case letter or digit so that "Soil_Type" does not
        # become "Soil__Type".
        name = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name)
        return name.lower()

    df.columns = [convert(col) for col in df.columns]
    return df

In [5]:
# Normalise the column labels of all three frames; to_snake_case rewrites
# each frame's columns in place.
for frame in (train_df, test_df):
    to_snake_case(frame)
# Kept as the cell's last expression so the returned frame is displayed.
to_snake_case(additional_df)

Unnamed: 0,temparature,humidity,moisture,soil_type,crop_type,nitrogen,potassium,phosphorous,fertilizer_name
0,32,51,41,Red,Ground Nuts,7,3,19,14-35-14
1,35,58,35,Black,Cotton,4,14,16,Urea
2,27,55,43,Sandy,Sugarcane,28,0,17,20-20
3,33,56,56,Loamy,Ground Nuts,37,5,24,28-28
4,32,70,60,Red,Ground Nuts,4,6,9,14-35-14
...,...,...,...,...,...,...,...,...,...
99995,32,71,61,Black,Tobacco,23,1,25,20-20
99996,35,72,47,Loamy,Millets,38,1,32,17-17-17
99997,28,50,61,Sandy,Maize,10,11,14,14-35-14
99998,29,57,63,Loamy,Ground Nuts,7,10,4,DAP


In [6]:
# Set the 'id' column aside, then remove it from the training frame.
target_id = train_df.loc[:, 'id']
train_df = train_df.drop('id', axis=1)

In [7]:
# Stack the training rows and the additional dataset into one frame with a
# fresh 0..n-1 index.
frames_to_stack = [train_df, additional_df]
full_df = pd.concat(frames_to_stack, ignore_index=True)

print("Dfs successfully concatenated", full_df.head())

Dfs successfully concatenated    temparature  humidity  moisture soil_type  crop_type  nitrogen  potassium  \
0           37        70        36    Clayey  Sugarcane        36          4   
1           27        69        65     Sandy    Millets        30          6   
2           29        63        32     Sandy    Millets        24         12   
3           35        62        54     Sandy     Barley        39         12   
4           35        58        43       Red      Paddy        37          2   

   phosphorous fertilizer_name  
0            5           28-28  
1           18           28-28  
2           16        17-17-17  
3            4        10-26-26  
4           16             DAP  


In [8]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, each fitted on the combined frame.
le_soil = LabelEncoder()
le_soil.fit(full_df['soil_type'])

le_crop = LabelEncoder()
le_crop.fit(full_df['crop_type'])

le_fertilizer = LabelEncoder()  # target encoder
le_fertilizer.fit(full_df['fertilizer_name'])

# Replace each string column with its integer codes.
for column, encoder in (
    ('soil_type', le_soil),
    ('crop_type', le_crop),
    ('fertilizer_name', le_fertilizer),
):
    full_df[column] = encoder.transform(full_df[column])

In [9]:
# Reuse the encoders fitted on full_df so the test rows receive the same
# integer codes as the training rows.
for column, encoder in (('soil_type', le_soil), ('crop_type', le_crop)):
    test_df[column] = encoder.transform(test_df[column])

In [10]:
# Correct the misspelled 'temparature' label on the training frame.
full_df = full_df.rename(columns={'temparature': 'temperature'})

In [11]:
# Apply the same spelling correction to the test frame so both frames share
# identical feature names.
test_df = test_df.rename(columns={'temparature': 'temperature'})

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [13]:
from sklearn.model_selection import train_test_split

# Target is the encoded fertilizer label; features are every other column.
y = full_df['fertilizer_name']
X = full_df.drop('fertilizer_name', axis=1)

# Hold out 30% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
!pip install optuna



In [15]:
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [16]:
def objective(trial):
    """Optuna objective: score one XGBoost configuration by cross-validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters listed below.

    Returns
    -------
    float
        Median of the 3-fold ``neg_log_loss`` scores on ``X_train`` /
        ``y_train``. Scores are negated losses, so larger is better and the
        study must be created with ``direction='maximize'``.
    """

    params = {
        'objective': 'multi:softprob',
        'num_class': len(np.unique(y_train)),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # Sampled on a log scale so small learning rates are explored as
        # densely as large ones.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 2),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
        'random_state': 42,
        # 'use_label_encoder' is intentionally not passed: XGBoost releases
        # that accept the 'device' argument no longer use it and only report
        # it as an unused parameter.
        'eval_metric': 'mlogloss',
        'tree_method': 'hist',
        'device': "cuda"
    }

    model = XGBClassifier(**params)

    cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_log_loss')

    # With three folds the median is simply the middle fold score.
    return np.median(cv_scores)


# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every run. TPESampler is the sampler create_study uses by default, so
# only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print("Best parameters:", study.best_params)
print("Best CV score:", study.best_value)

[I 2025-06-17 10:11:49,912] A new study created in memory with name: no-name-b5c2ab38-60ba-4815-a296-e998b92e2974
[I 2025-06-17 10:12:10,233] Trial 0 finished with value: -1.9318032538568306 and parameters: {'n_estimators': 341, 'max_depth': 3, 'learning_rate': 0.07657048719293037, 'subsample': 0.919021411659183, 'colsample_bytree': 0.666009501146865, 'min_child_weight': 6, 'gamma': 0.6561756498778906, 'reg_alpha': 0.3004402645270796, 'reg_lambda': 0.587427289042765}. Best is trial 0 with value: -1.9318032538568306.
[I 2025-06-17 10:12:56,768] Trial 1 finished with value: -1.9222009652536312 and parameters: {'n_estimators': 454, 'max_depth': 8, 'learning_rate': 0.02460947823535783, 'subsample': 0.6289838129817297, 'colsample_bytree': 0.688117734448529, 'min_child_weight': 1, 'gamma': 1.3213504041111233, 'reg_alpha': 0.12079809928352714, 'reg_lambda': 0.5528370614307594}. Best is trial 1 with value: -1.9222009652536312.
[I 2025-06-17 10:13:14,620] Trial 2 finished with value: -1.9298584

Best parameters: {'n_estimators': 404, 'max_depth': 9, 'learning_rate': 0.03787713846906852, 'subsample': 0.8719097921188239, 'colsample_bytree': 0.8661047313967585, 'min_child_weight': 8, 'gamma': 0.8284400340679914, 'reg_alpha': 0.6271077771819105, 'reg_lambda': 0.12476575339911822}
Best CV score: -1.9189465925177112


In [20]:
# Tuned hyper-parameters plus the fixed seed / tree method / device settings.
params = dict(
    n_estimators=404,
    max_depth=9,
    learning_rate=0.03787713846906852,
    subsample=0.8719097921188239,
    colsample_bytree=0.8661047313967585,
    min_child_weight=8,
    gamma=0.8284400340679914,
    reg_alpha=0.6271077771819105,
    reg_lambda=0.12476575339911822,
    random_state=42,
    tree_method='hist',
    device="cuda",
)
best_model = XGBClassifier(**params)

# Re-score this configuration with 3-fold CV before the final fit.
cv_res = cross_val_score(
    best_model,
    X_train,
    y_train,
    scoring='neg_log_loss',
    cv=3,
)

print(f"Final CV scores: {cv_res}")
print(f"Mean CV score: {cv_res.mean():.4f} ± {cv_res.std():.4f}")

# Final fit on the training split; left as the last expression so the fitted
# estimator is shown as the cell output.
best_model.fit(X_train, y_train)

Final CV scores: [-1.91869527 -1.91932409 -1.91894659]
Mean CV score: -1.9190 ± 0.0003


In [24]:
# Features for the test rows: every column except the identifier.
X_submission = test_df.drop('id', axis=1)

# Predict encoded classes, then map the codes back to fertilizer names.
y_pred_enc = best_model.predict(X_submission)
y_pred = le_fertilizer.inverse_transform(y_pred_enc)

# Two-column table: row id and predicted fertilizer name.
submission_df = pd.DataFrame({'id': test_df['id'], 'Fertilizer Name': y_pred})
submission_df.to_csv("submission5.csv", index=False)

print("Success!")

Success!


In [25]:
# Display the submission table as the cell output.
submission_df

Unnamed: 0,id,Fertilizer Name
0,750000,DAP
1,750001,17-17-17
2,750002,28-28
3,750003,Urea
4,750004,20-20
...,...,...
249995,999995,17-17-17
249996,999996,14-35-14
249997,999997,DAP
249998,999998,10-26-26
