# Fertilizer Recommender - PyTorch Annual Hackathon

![](https://gardenerpick.com/wp-content/uploads/2021/06/sweet-corn-fertilizer-recommendations-1.jpg?ezimgfmt=rs:372x248/rscb2/ng:webp/ngcb2)

In [1]:
# Importing libraries

from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Reading dataset

In [2]:
# Load the fertilizer dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv")

In [3]:
# Preview the first rows of the dataset.
df.head()

In [4]:
# Summary statistics as reported by pandas `describe()`.
df.describe()

In [5]:
# Distinct values present in the 'Soil Type' column.
df['Soil Type'].unique()

In [6]:
# Collect, for each soil type, the observed MIN/MAX of the Nitrogen, Potassium
# and Phosphorous columns. The result is a list with one single-key dict per
# soil type: {soil: {nutrient: {'MAX': ..., 'MIN': ...}}}.
soil_type = ['Sandy', 'Loamy', 'Black', 'Red', 'Clayey']
chemicalProp = []
for i in soil_type:
    # Filter and summarise the rows of this soil type once, instead of
    # re-filtering and re-running describe() for every MIN/MAX lookup.
    soil_stats = df[df['Soil Type'] == i].describe()
    nutrient_ranges = {
        nutrient: {
            'MAX': soil_stats[nutrient].loc['max'],
            'MIN': soil_stats[nutrient].loc['min'],
        }
        for nutrient in ('Nitrogen', 'Potassium', 'Phosphorous')
    }
    chemicalProp.append({i: nutrient_ranges})

    


In [7]:
# Display the per-soil-type MIN/MAX nutrient ranges stored in `chemicalProp`.
chemicalProp

# Visualizing Data

In [8]:
import seaborn as sns
# Bar chart of the number of rows per 'Soil Type' value.
sns.countplot(x='Soil Type', data = df)

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
# Wide 16x8 figure for the 'Crop Type' categories on the x axis.
plt.figure(figsize=(16,8))
# Bar chart of the number of rows per 'Crop Type' value.
sns.countplot(x='Crop Type', data = df)

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
# Wide 16x8 figure for the 'Fertilizer Name' categories on the x axis.
plt.figure(figsize=(16,8))
# Bar chart of the number of rows per 'Fertilizer Name' value (the prediction target).
sns.countplot(x='Fertilizer Name', data = df)

In [11]:
# Defining plotting helpers for continuous and categorical variables
def plot_conti(x):
    """Draw a three-panel EDA figure for one continuous column.

    Panels, left to right: a histogram of ``x``, a box plot of ``x`` for
    spotting outliers, and box plots of ``x`` grouped by the
    'Fertilizer Name' column of the module-level ``df``.

    Args:
        x: The column to plot (e.g. ``df['Potassium']``).
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), tight_layout=True)
    axes[0].set_title('Histogram')
    sns.histplot(x, ax=axes[0])
    axes[1].set_title('Checking Outliers')
    # Pass the column by keyword so it is bound to `x` rather than to whatever
    # the first positional parameter is in the installed seaborn version.
    sns.boxplot(x=x, ax=axes[1])
    axes[2].set_title('Relation with output variable')
    # Target the third panel explicitly instead of relying on it happening to
    # be matplotlib's current axes.
    sns.boxplot(y=x, x=df['Fertilizer Name'], ax=axes[2])
    
def plot_cato(x):
    """Draw a two-panel EDA figure for one categorical column.

    Panels, left to right: a count plot of ``x``, and a count plot of ``x``
    split by the 'Fertilizer Name' column of the module-level ``df``.

    Args:
        x: The column to plot (e.g. ``df['Crop Type']``).
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)
    axes[0].set_title('Count Plot')
    # Pass the column by keyword so it is bound to `x` rather than to whatever
    # the first positional parameter is in the installed seaborn version; this
    # also matches the keyword form used for the second panel.
    sns.countplot(x=x, ax=axes[0])
    axes[1].set_title('Relation with output variable')
    sns.countplot(x=x, hue=df['Fertilizer Name'], ax=axes[1])

In [12]:
# EDA - 'Temparature' column (name spelled as the code reads it): histogram, outlier check, relation to fertilizer.
plot_conti(df['Temparature'])

In [13]:
# EDA - 'Humidity ' column (note the trailing space in the column name): histogram, outlier check, relation to fertilizer.
plot_conti(df['Humidity '])

In [14]:
# EDA - 'Potassium' column: histogram, outlier check, relation to fertilizer.
plot_conti(df['Potassium'])

In [15]:
# EDA - 'Phosphorous' column: histogram, outlier check, relation to fertilizer.
plot_conti(df['Phosphorous'])

# Preprocessing

In [16]:
from sklearn import preprocessing

# Encoder that maps each distinct fertilizer name to an integer id.
label_encoder = preprocessing.LabelEncoder()

# Store the integer-encoded target in a new column next to the original names.
df['FertilizerNameE']= label_encoder.fit_transform(df['Fertilizer Name'].values)

# Separate copy of the frame, used only for the lookup table below.
df2=df.copy()

# Keep one row per fertilizer name, so each name appears once alongside its encoded id.
df2=df2.drop_duplicates('Fertilizer Name')

# Display the de-duplicated frame.
df2

In [17]:
# Distinct encoded fertilizer ids present in the data.
df['FertilizerNameE'].unique()

### Encoding Soil Type for training

In [18]:
from sklearn.preprocessing import LabelEncoder

# Overwrite the 'Soil Type' column of df with its integer encoding.
# NOTE(review): after this cell the column no longer holds the soil names, so
# code that compares it to strings such as 'Sandy' must run before this point.
encode_soil = LabelEncoder()
df['Soil Type'] = encode_soil.fit_transform(df['Soil Type'])

# Lookup table from each original soil name to its encoded value, indexed by name.
Soil_Type = pd.DataFrame(zip(encode_soil.classes_,encode_soil.transform(encode_soil.classes_)),columns=['Original','Encoded'])
Soil_Type = Soil_Type.set_index('Original')
Soil_Type

In [19]:
# Preview the frame again after the encoded columns were added/overwritten.
df.head()

In [20]:
# Split the frame into the model input columns and the encoded target column.
# The column names 'Temparature' and 'Humidity ' (trailing space) are used exactly as spelled here.
features = df[['Temparature', 'Humidity ','Moisture','Soil Type', 'Nitrogen', 'Phosphorous', 'Potassium']]
target = df['FertilizerNameE']
# Same column as `target`; `labels` is not referenced again in this notebook.
labels = df['FertilizerNameE']

In [21]:
# Empty lists, presumably meant to collect model names and their accuracies.
# NOTE(review): `acc` is never appended to, and `model` is rebound to the
# PyTorch network further down in this notebook.
acc = []
model = []

# PyTorch Model Setup and Training

In [22]:
# function to get the count of classes
def get_class_distribution(obj):
    """Count how often each encoded fertilizer class occurs in ``obj``.

    Args:
        obj: Iterable of encoded class ids. Ids 0..6 correspond, in order, to
            '10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP', 'Urea'.

    Returns:
        dict mapping each fertilizer name to its count. The keys are inserted
        in encoded-id order, so ``list(result.values())[k]`` is the count of
        class ``k`` and can be used directly as a per-class vector.

    Any value that matches none of the seven ids is printed together with a
    "Check classes." notice and is otherwise ignored.
    """
    # Names listed in encoded-id order; position == class id.
    class_names = ('10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP', 'Urea')
    count_dict = {name: 0 for name in class_names}

    for i in obj:
        # Compare with `==` (as opposed to a dict lookup) so any element type
        # that compares equal to an int id is accepted.
        for class_id, name in enumerate(class_names):
            if i == class_id:
                count_dict[name] += 1
                break
        else:
            print(i)
            print("\nCheck classes.")

    return count_dict

In [23]:
# Importing torch libraries
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [24]:
# Hold out 20% of the data for testing, then take 10% of the remainder for validation.

from sklearn.model_selection import train_test_split

# First split: (train + validation) vs test, stratified on the target.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=69,
)

# Second split: train vs validation, stratified on the remaining targets.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.1,
    stratify=y_trainval,
    random_state=21,
)

# Convert every split to a NumPy array.
X_train, y_train, X_val, y_val, X_test, y_test = (
    np.array(part) for part in (X_train, y_train, X_val, y_val, X_test, y_test)
)

In [25]:
# Minimal map-style Dataset that pairs feature rows with their labels.
class ClassifierDataset(Dataset):
    """Index-aligned pair of a feature container and a label container."""

    def __init__(self, X_data, y_data):
        # Keep references to both containers; nothing is copied or validated.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        sample = self.X_data[index]
        label = self.y_data[index]
        return sample, label


# Wrap each split as float features / long labels in a ClassifierDataset.
train_dataset, val_dataset, test_dataset = (
    ClassifierDataset(torch.from_numpy(inputs).float(), torch.from_numpy(targets).long())
    for inputs, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [26]:
# Labels of all training samples, in dataset order.
# The order is deliberately NOT shuffled: these labels are turned into
# per-sample weights for WeightedRandomSampler, where weight i must belong to
# sample i. Shuffling here would assign the weights to the wrong samples.
target_list = torch.tensor([int(t) for _, t in train_dataset], dtype=torch.long)

In [27]:
# Per-class sample counts and class weights (reciprocal of the counts).
# The counts are read out by class name in `label_encoder.classes_` order, so
# position k holds the count of encoded class k regardless of the key order of
# the dict returned by get_class_distribution. This matters because
# class_weights is indexed by encoded class id.
class_distribution = get_class_distribution(y_train)
class_count = [class_distribution[name] for name in label_encoder.classes_]
class_weights = 1./torch.tensor(class_count, dtype=torch.float)
print(class_weights)

In [28]:
# One weight per entry of target_list, looked up by using each class id as an index into class_weights.
class_weights_all = class_weights[target_list]


In [29]:
# Sampler that draws len(class_weights_all) indices with replacement, using
# class_weights_all as the draw weights. Since the class weights are reciprocal
# class counts, entries of rarer classes carry larger weights.
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [30]:
# Training hyper-parameters and model dimensions.
EPOCHS = 300  # number of passes of the training loop
BATCH_SIZE = 16  # mini-batch size of the training DataLoader
LEARNING_RATE = 0.0007  # learning rate passed to the Adam optimizer
NUM_FEATURES = len(features.columns)  # one model input per feature column
NUM_CLASSES = 7  # size of the model's output layer (hard-coded)

In [31]:
# Wrap the datasets in DataLoaders.
# Training batches of BATCH_SIZE are drawn through the weighted sampler.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          sampler=weighted_sampler
)
# Validation samples are served one at a time.
val_loader = DataLoader(dataset=val_dataset, batch_size=1)

# Test samples are served one at a time.
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [32]:
# Defining classifier torch class
class FertilizerClassification(nn.Module):
    """Fully connected classifier with three hidden blocks and a linear output layer.

    Hidden widths are 512, 128 and 64. Every hidden block is
    Linear -> BatchNorm1d -> ReLU; the second and third blocks are followed by
    dropout with p=0.2. The output layer returns raw scores, one per class
    (no softmax is applied inside the model).

    Args:
        num_feature: Number of input features per sample.
        num_class: Number of output classes.
    """

    def __init__(self, num_feature, num_class):
        super().__init__()

        # Linear layers.
        self.layer_1 = nn.Linear(in_features=num_feature, out_features=512)
        self.layer_2 = nn.Linear(in_features=512, out_features=128)
        self.layer_3 = nn.Linear(in_features=128, out_features=64)
        self.layer_out = nn.Linear(in_features=64, out_features=num_class)

        # Shared activation / regularisation modules.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

        # One batch-norm layer per hidden width.
        self.batchnorm1 = nn.BatchNorm1d(num_features=512)
        self.batchnorm2 = nn.BatchNorm1d(num_features=128)
        self.batchnorm3 = nn.BatchNorm1d(num_features=64)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        # Block 1: no dropout after the first hidden layer.
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))
        # Blocks 2 and 3: dropout applied after the activation.
        hidden = self.dropout(self.relu(self.batchnorm2(self.layer_2(hidden))))
        hidden = self.dropout(self.relu(self.batchnorm3(self.layer_3(hidden))))
        return self.layer_out(hidden)

In [33]:
# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [34]:
# Instantiate the network and move it to the selected device.
model = FertilizerClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

# Cross-entropy loss weighted per class by class_weights, and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Print the model's layer structure.
print(model)

In [35]:
# defining the model accuracy function during train
def multi_acc(y_pred, y_test):
    """Return the accuracy of a batch as a percentage rounded to a whole number.

    Args:
        y_pred: Tensor of raw class scores; the class dimension is dim 1.
        y_test: Tensor of true class ids, one per row of ``y_pred``.

    Returns:
        A 0-dimensional tensor holding ``round(100 * correct / total)``.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    # torch.max over dim 1 returns (values, indices); the indices are the predicted classes.
    predicted_classes = torch.max(log_probs, dim=1)[1]

    hits = (predicted_classes == y_test).float()
    return torch.round(hits.sum() / len(hits) * 100)

In [36]:
# Per-epoch history of accuracy and loss, kept separately for the 'train' and 'val' splits.
accuracy_stats = {split: [] for split in ('train', 'val')}
loss_stats = {split: [] for split in ('train', 'val')}

In [37]:
# Train for EPOCHS epochs. Each epoch runs one pass over train_loader with
# gradient updates, then one pass over val_loader without gradients, and
# appends the per-batch averages of loss and accuracy to loss_stats /
# accuracy_stats before printing them.
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):

    # TRAINING
    # Running sums over the batches of this epoch.
    train_epoch_loss = 0
    train_epoch_acc = 0
    # Put the model in training mode (dropout and batch-norm behave accordingly).
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        y_train_pred = model(X_train_batch)

        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)

        # Back-propagate and update the parameters.
        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()


    # VALIDATION
    # No gradients are tracked for the validation pass.
    with torch.no_grad():

        val_epoch_loss = 0
        val_epoch_acc = 0

        # Put the model in evaluation mode for the validation batches.
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()
    # Record the epoch averages: the sums divided by the number of batches.
    # The accuracies are averages of per-batch percentages that multi_acc has
    # already rounded to whole numbers.
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))


    # One summary line per epoch.
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

In [38]:
# Plot the recorded per-epoch training and validation loss on one figure.
fig = plt.figure(figsize=(20,10))
plt.title("Loss Curve-Train vs Validation")
# One point per recorded epoch; the x axis is the position in each list.
plt.plot( loss_stats['train'], label='train')
plt.plot( loss_stats['val'], label='validation')
plt.xlabel('num_epochs', fontsize=12)
plt.ylabel('loss', fontsize=12)
plt.legend(loc='best')

# Saving the torch Model

In [39]:
# Disabled cell: the code below sits inside a string literal, so none of it runs.
# As written, it would save the model's state_dict to 'fertilizer.pth' and load
# it back into a fresh FertilizerClassification instance named `model2`.
'''
PATH_OP = 'fertilizer.pth'

print("!Begin Saving")


torch.save(model.state_dict(), PATH_OP)

print('!Model Saved')
# Model class must be defined somewhere
model2 = FertilizerClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model2.to(device)
model2.load_state_dict(torch.load(PATH_OP))

model2.eval()
'''

In [40]:
# Disabled cell: the code below sits inside a string literal, so none of it runs.
# As written, it would run `model2` over test_loader without gradients and
# collect the predicted class id of every test sample in `y_pred_list`.
'''
y_pred_list = []
with torch.no_grad():
    model2.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model2(X_batch)
        _, y_pred_tags = torch.max(y_test_pred, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
'''

In [41]:
# Disabled cell: the code below sits inside a string literal, so none of it runs.
# As written, it would print scikit-learn's classification report for
# `y_pred_list` against `y_test`.
'''
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_list))
'''

# Thank You!