# Model Training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data/raw/Crop recommendation.csv')
print(df.shape)
df.head()

(2200, 8)


Unnamed: 0,Nitrogen,Phosphorus,K(Potassium),Temperature,Humidity,Ph Value,Rainfall,Crop Name
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import yaml

In [4]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("Crop Name", axis=1), df["Crop Name"], test_size=0.25
    )

In [24]:
# Create a pipeline to normalize the data
scaler = MinMaxScaler()
pipeline = Pipeline([("scaler", scaler)])
pipeline.fit(X_train)

X_train_norm = pipeline.transform(X_train)
X_test_norm = pipeline.transform(X_test)

In [25]:
# Create a pipeline to train the models
models = []
models.append(("rf", RandomForestClassifier()))
models.append(("svc", SVC()))
models.append(("lr", LogisticRegression()))
models.append(("dt", DecisionTreeClassifier()))

In [26]:
# Train the models
for name, model in models:
    model.fit(X_train_norm, y_train)

In [27]:
# Evaluate the models
accuracies = {}
for name, model in models:
    y_pred = model.predict(X_test_norm)
    acc = accuracy_score(y_test, y_pred)
    accuracies.update({name: float(acc)})

In [28]:
# Print the accuracies
for name, acc in accuracies.items():
    print(f"{name}: {acc}")

rf: 0.9818181818181818
svc: 0.9818181818181818
lr: 0.92
dt: 0.9345454545454546


In [17]:
# Store the accuracies in a YML file
with open("accuracies_with_pca.yml", "w+") as f:
    yaml.dump(accuracies, f, indent=4)

In [18]:
accuracies

{'rf': 0.8418181818181818,
 'svc': 0.84,
 'lr': 0.7345454545454545,
 'dt': 0.7945454545454546}