Importing the packages

In [843]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score



Importing the data

In [804]:
train_data = pd.read_csv('Datasets/train.csv')
test_data = pd.read_csv('Datasets/test.csv')


In [805]:
train_data

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [806]:
test_data

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14
...,...,...,...,...,...,...,...,...,...
249995,999995,26,66,30,Red,Sugarcane,14,7,18
249996,999996,33,62,55,Red,Pulses,28,14,7
249997,999997,36,53,64,Black,Paddy,28,11,27
249998,999998,36,67,26,Clayey,Paddy,33,0,10


In [807]:
train_data['Fertilizer Name'].value_counts()

Fertilizer Name
14-35-14    114436
10-26-26    113887
17-17-17    112453
28-28       111158
20-20       110889
DAP          94860
Urea         92317
Name: count, dtype: int64

In [808]:
train_data.isnull().sum()

id                 0
Temparature        0
Humidity           0
Moisture           0
Soil Type          0
Crop Type          0
Nitrogen           0
Potassium          0
Phosphorous        0
Fertilizer Name    0
dtype: int64

In [809]:
test_data.isnull().sum()

id             0
Temparature    0
Humidity       0
Moisture       0
Soil Type      0
Crop Type      0
Nitrogen       0
Potassium      0
Phosphorous    0
dtype: int64

In [810]:
train_data.dtypes

id                  int64
Temparature         int64
Humidity            int64
Moisture            int64
Soil Type          object
Crop Type          object
Nitrogen            int64
Potassium           int64
Phosphorous         int64
Fertilizer Name    object
dtype: object

In [811]:
scaler = StandardScaler()

In [812]:
test_data.value_counts()

id      Temparature  Humidity  Moisture  Soil Type  Crop Type    Nitrogen  Potassium  Phosphorous
750000  31           70        52        Sandy      Wheat        34        11         24             1
750001  27           62        45        Red        Sugarcane    30        14         15             1
750002  28           72        28        Clayey     Ground Nuts  14        15         4              1
750003  37           53        57        Black      Ground Nuts  18        17         36             1
750004  31           55        32        Red        Pulses       13        19         14             1
                                                                                                    ..
999995  26           66        30        Red        Sugarcane    14        7          18             1
999996  33           62        55        Red        Pulses       28        14         7              1
999997  36           53        64        Black      Paddy        28        11 

In [813]:
#Scaling the values
for i in train_data.columns:
    if train_data[i].dtype == 'int64' and test_data[i].dtype == 'int64':
  
        train_data[i] = scaler.fit_transform(train_data[[i]])
        test_data[i] = scaler.fit_transform(test_data[[i]])

In [814]:
test_data

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,-1.732044,-0.122176,1.349366,0.577417,Sandy,Wheat,0.968300,0.262229,0.232451
1,-1.732030,-1.116191,0.143886,-0.016149,Red,Sugarcane,0.611664,0.782444,-0.494478
2,-1.732016,-0.867687,1.650737,-1.457664,Clayey,Ground Nuts,-0.814877,0.955849,-1.382948
3,-1.732002,1.368846,-1.212279,1.001392,Black,Ground Nuts,-0.458242,1.302659,1.201690
4,-1.731988,-0.122176,-0.910909,-1.118484,Red,Pulses,-0.904036,1.649469,-0.575248
...,...,...,...,...,...,...,...,...,...
249995,1.731988,-1.364695,0.746626,-1.288074,Red,Sugarcane,-0.814877,-0.431391,-0.252169
249996,1.732002,0.374831,0.143886,0.831802,Red,Pulses,0.433347,0.782444,-1.140638
249997,1.732016,1.120342,-1.212279,1.594957,Black,Paddy,0.433347,0.262229,0.474761
249998,1.732030,1.120342,0.897311,-1.627254,Clayey,Paddy,0.879141,-1.645225,-0.898328


In [815]:
train_data.head()


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,-1.732048,1.36538,1.348,-0.778675,Clayey,Sugarcane,1.150683,-0.950166,-1.301811,28-28
1,-1.732044,-1.11874,1.197572,1.68008,Sandy,Millets,0.615738,-0.603282,-0.248908,28-28
2,-1.732039,-0.621916,0.295003,-1.117813,Sandy,Millets,0.080794,0.437369,-0.410893,17-17-17
3,-1.732035,0.868556,0.144575,0.747449,Sandy,Barley,1.418155,0.437369,-1.382803,10-26-26
4,-1.73203,0.868556,-0.457138,-0.185182,Red,Paddy,1.23984,-1.29705,-0.410893,DAP


In [816]:
le = LabelEncoder()


In [817]:
fertilizer_encoder = LabelEncoder()
train_data['Fertilizer Name'] = fertilizer_encoder.fit_transform(train_data['Fertilizer Name'])

In [818]:
feature_encoder = LabelEncoder()


In [819]:
#Encoding the categorical values
for col in ['Soil Type', 'Crop Type']:
    train_data[col] = feature_encoder.fit_transform(train_data[col])
    test_data[col] = feature_encoder.transform(test_data[col])  

In [820]:
train_data

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,-1.732048,1.365380,1.348000,-0.778675,1,8,1.150683,-0.950166,-1.301811,4
1,-1.732044,-1.118740,1.197572,1.680080,4,4,0.615738,-0.603282,-0.248908,4
2,-1.732039,-0.621916,0.295003,-1.117813,4,4,0.080794,0.437369,-0.410893,2
3,-1.732035,0.868556,0.144575,0.747449,4,0,1.418155,0.437369,-1.382803,0
4,-1.732030,0.868556,-0.457138,-0.185182,3,6,1.239840,-1.297050,-0.410893,5
...,...,...,...,...,...,...,...,...,...,...
749995,1.732030,-1.615563,1.197572,-1.287383,1,3,-1.345725,1.131137,-1.220818,4
749996,1.732035,1.365380,0.445431,1.086588,2,8,1.328997,-0.256399,-0.086923,2
749997,1.732039,0.868556,1.047144,1.171372,4,2,-1.524040,0.263927,0.642009,0
749998,1.732044,-0.125092,1.047144,-1.372167,3,1,-1.256567,0.263927,-0.734863,3


In [821]:
train_data

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,-1.732048,1.365380,1.348000,-0.778675,1,8,1.150683,-0.950166,-1.301811,4
1,-1.732044,-1.118740,1.197572,1.680080,4,4,0.615738,-0.603282,-0.248908,4
2,-1.732039,-0.621916,0.295003,-1.117813,4,4,0.080794,0.437369,-0.410893,2
3,-1.732035,0.868556,0.144575,0.747449,4,0,1.418155,0.437369,-1.382803,0
4,-1.732030,0.868556,-0.457138,-0.185182,3,6,1.239840,-1.297050,-0.410893,5
...,...,...,...,...,...,...,...,...,...,...
749995,1.732030,-1.615563,1.197572,-1.287383,1,3,-1.345725,1.131137,-1.220818,4
749996,1.732035,1.365380,0.445431,1.086588,2,8,1.328997,-0.256399,-0.086923,2
749997,1.732039,0.868556,1.047144,1.171372,4,2,-1.524040,0.263927,0.642009,0
749998,1.732044,-0.125092,1.047144,-1.372167,3,1,-1.256567,0.263927,-0.734863,3


In [822]:
test_data.drop(columns=['id'],inplace=True,axis=1)

In [823]:
test_data

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,-0.122176,1.349366,0.577417,4,10,0.968300,0.262229,0.232451
1,-1.116191,0.143886,-0.016149,3,8,0.611664,0.782444,-0.494478
2,-0.867687,1.650737,-1.457664,1,2,-0.814877,0.955849,-1.382948
3,1.368846,-1.212279,1.001392,0,2,-0.458242,1.302659,1.201690
4,-0.122176,-0.910909,-1.118484,3,7,-0.904036,1.649469,-0.575248
...,...,...,...,...,...,...,...,...
249995,-1.364695,0.746626,-1.288074,3,8,-0.814877,-0.431391,-0.252169
249996,0.374831,0.143886,0.831802,3,7,0.433347,0.782444,-1.140638
249997,1.120342,-1.212279,1.594957,0,6,0.433347,0.262229,0.474761
249998,1.120342,0.897311,-1.627254,1,6,0.879141,-1.645225,-0.898328


Splitting the data

In [824]:
X = train_data.drop(['id','Fertilizer Name'],axis=1)
Y = train_data['Fertilizer Name']

In [825]:
X

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,1.365380,1.348000,-0.778675,1,8,1.150683,-0.950166,-1.301811
1,-1.118740,1.197572,1.680080,4,4,0.615738,-0.603282,-0.248908
2,-0.621916,0.295003,-1.117813,4,4,0.080794,0.437369,-0.410893
3,0.868556,0.144575,0.747449,4,0,1.418155,0.437369,-1.382803
4,0.868556,-0.457138,-0.185182,3,6,1.239840,-1.297050,-0.410893
...,...,...,...,...,...,...,...,...
749995,-1.615563,1.197572,-1.287383,1,3,-1.345725,1.131137,-1.220818
749996,1.365380,0.445431,1.086588,2,8,1.328997,-0.256399,-0.086923
749997,0.868556,1.047144,1.171372,4,2,-1.524040,0.263927,0.642009
749998,-0.125092,1.047144,-1.372167,3,1,-1.256567,0.263927,-0.734863


train_test_split

In [826]:
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size=0.2 , random_state=42)

In [827]:
print(X.shape, X_train.shape , X_test.shape)

(750000, 8) (600000, 8) (150000, 8)


Model Implementation

In [828]:
model = LGBMClassifier()

In [829]:
model.fit(X_train,Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 8
[LightGBM] [Info] Start training from score -1.885565
[LightGBM] [Info] Start training from score -1.877350
[LightGBM] [Info] Start training from score -1.900582
[LightGBM] [Info] Start training from score -1.909654
[LightGBM] [Info] Start training from score -1.910836
[LightGBM] [Info] Start training from score -2.069993
[LightGBM] [Info] Start training from score -2.091474


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [831]:
probs = model.predict_proba(X_test)
top3 = np.argsort(probs, axis=1)[:, -3:][:, ::-1]
top3_labels = fertilizer_encoder.inverse_transform(top3.flatten()).reshape(top3.shape)
top3_preds = [' '.join(row) for row in top3_labels]

In [832]:
top3_preds

['28-28 20-20 17-17-17',
 '10-26-26 20-20 28-28',
 '20-20 14-35-14 28-28',
 '20-20 14-35-14 17-17-17',
 '17-17-17 14-35-14 10-26-26',
 '10-26-26 20-20 14-35-14',
 '17-17-17 28-28 20-20',
 'Urea 17-17-17 20-20',
 '20-20 Urea 28-28',
 '28-28 17-17-17 Urea',
 '14-35-14 17-17-17 10-26-26',
 '28-28 20-20 14-35-14',
 '17-17-17 28-28 10-26-26',
 '17-17-17 14-35-14 20-20',
 '28-28 17-17-17 14-35-14',
 '10-26-26 14-35-14 28-28',
 '10-26-26 14-35-14 28-28',
 '28-28 14-35-14 17-17-17',
 '14-35-14 17-17-17 DAP',
 '17-17-17 28-28 10-26-26',
 '14-35-14 Urea 28-28',
 '14-35-14 20-20 Urea',
 'DAP 20-20 28-28',
 '20-20 14-35-14 10-26-26',
 '28-28 20-20 14-35-14',
 'DAP 10-26-26 28-28',
 '28-28 14-35-14 17-17-17',
 '10-26-26 17-17-17 14-35-14',
 '28-28 17-17-17 14-35-14',
 '10-26-26 28-28 17-17-17',
 '10-26-26 14-35-14 17-17-17',
 '17-17-17 10-26-26 28-28',
 '17-17-17 14-35-14 20-20',
 '20-20 14-35-14 28-28',
 '17-17-17 14-35-14 28-28',
 '17-17-17 10-26-26 14-35-14',
 '28-28 10-26-26 17-17-17',
 '17-17-

Kaggle test

In [None]:
test_data

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,-0.122176,1.349366,0.577417,Sandy,Wheat,0.968300,0.262229,0.232451
1,-1.116191,0.143886,-0.016149,Red,Sugarcane,0.611664,0.782444,-0.494478
2,-0.867687,1.650737,-1.457664,Clayey,Ground Nuts,-0.814877,0.955849,-1.382948
3,1.368846,-1.212279,1.001392,Black,Ground Nuts,-0.458242,1.302659,1.201690
4,-0.122176,-0.910909,-1.118484,Red,Pulses,-0.904036,1.649469,-0.575248
...,...,...,...,...,...,...,...,...
249995,-1.364695,0.746626,-1.288074,Red,Sugarcane,-0.814877,-0.431391,-0.252169
249996,0.374831,0.143886,0.831802,Red,Pulses,0.433347,0.782444,-1.140638
249997,1.120342,-1.212279,1.594957,Black,Paddy,0.433347,0.262229,0.474761
249998,1.120342,0.897311,-1.627254,Clayey,Paddy,0.879141,-1.645225,-0.898328


In [833]:
test_probability = model.predict_proba(test_data)
test_top3 = np.argsort(test_probability, axis=1)[:, -3:][:, ::-1]
test_top3_labels = fertilizer_encoder.inverse_transform(test_top3.flatten()).reshape(test_top3.shape)
test_top3_preds = [' '.join(row) for row in test_top3_labels]

In [834]:
test_top3_preds

['DAP 28-28 14-35-14',
 '17-17-17 20-20 10-26-26',
 '10-26-26 20-20 14-35-14',
 '14-35-14 10-26-26 17-17-17',
 '20-20 10-26-26 17-17-17',
 '28-28 20-20 14-35-14',
 '28-28 10-26-26 14-35-14',
 '17-17-17 28-28 14-35-14',
 '20-20 14-35-14 17-17-17',
 '17-17-17 14-35-14 20-20',
 '17-17-17 14-35-14 10-26-26',
 '20-20 14-35-14 28-28',
 '14-35-14 20-20 10-26-26',
 '28-28 20-20 17-17-17',
 '28-28 Urea 20-20',
 '17-17-17 20-20 14-35-14',
 '10-26-26 14-35-14 28-28',
 'Urea 14-35-14 20-20',
 '17-17-17 10-26-26 14-35-14',
 '14-35-14 10-26-26 17-17-17',
 '10-26-26 14-35-14 17-17-17',
 '28-28 10-26-26 14-35-14',
 '17-17-17 14-35-14 28-28',
 '20-20 10-26-26 14-35-14',
 '10-26-26 28-28 17-17-17',
 '17-17-17 14-35-14 28-28',
 '14-35-14 17-17-17 28-28',
 '10-26-26 DAP 20-20',
 '17-17-17 10-26-26 14-35-14',
 '17-17-17 20-20 28-28',
 '10-26-26 20-20 17-17-17',
 '14-35-14 20-20 28-28',
 '10-26-26 28-28 20-20',
 '14-35-14 Urea 10-26-26',
 'DAP 17-17-17 14-35-14',
 '28-28 17-17-17 Urea',
 'DAP 10-26-26 14-35

In [835]:
sample = pd.read_csv('Datasets/test.csv')


In [840]:
Final_prediction = pd.DataFrame({
    'id': sample['id'],  
    'Fertilizer Name': test_top3_preds
})

In [841]:
Final_prediction

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 14-35-14
1,750001,17-17-17 20-20 10-26-26
2,750002,10-26-26 20-20 14-35-14
3,750003,14-35-14 10-26-26 17-17-17
4,750004,20-20 10-26-26 17-17-17
...,...,...
249995,999995,17-17-17 20-20 14-35-14
249996,999996,14-35-14 20-20 10-26-26
249997,999997,14-35-14 28-28 DAP
249998,999998,10-26-26 28-28 DAP


In [842]:
Final_prediction.to_csv('Datasets/submission.csv', index=False)