In [40]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import utils
import params

import wandb
wandb.login()

True

# Initializing Weights and Biases

In [41]:
wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="ratio_model", name="random_forest_ratio_model")

# Downloading data from WandB

In [42]:
df_crop = utils.get_df_wandb("crop")
df_crop, dict_crop = utils.encode_crop(df_crop)

[34m[1mwandb[0m:   2 of 2 files downloaded.  


## Calculating N-P-K ratio values

In [43]:
df_crop["N-P"] = df_crop["N"] / df_crop["P"]
df_crop["N-K"] = df_crop["N"] / df_crop["K"]
df_crop["P-K"] = df_crop["P"] / df_crop["K"]

In [44]:
x_columns = df_crop.columns[-3:].to_list() + df_crop.columns[3:7].to_list()

x = df_crop[x_columns].values
y = df_crop["label"].values

In [45]:
x_scaled = utils.standardize(x, "ratio_model")

x_train, x_test, y_train, y_test = utils.split_data(x_scaled, y, test_size=0.2, random_state=42)

# Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

y_pred = logistic_regression.predict(x_test)
y_probas = logistic_regression.predict_proba(x_test)

y_pred_train = logistic_regression.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

test_accuracy = accuracy_score(y_test, y_pred)
labels = [x for x in dict(sorted(dict_crop.items(), key= lambda x: x[1])).keys()]

print(f"Accuracy of Logistic Regression: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(logistic_regression,
                              x_train, 
                              x_test,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="LogisticRegression"
)

utils.save_model(logistic_regression, "logistic_regression_ratio")
utils.upload_model("logistic_regression_ratio")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting LogisticRegression.
[34m[1mwandb[0m: Logged feature importances.


Accuracy of Logistic Regression: 0.9204545454545454


[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


In [32]:
wandb.finish()

0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.92045
train_accuracy,0.9375


# Decision Tree

In [39]:
from sklearn.tree import DecisionTreeClassifier

# wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="train_model", name="decision_tree_ratio_model")

decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)

y_pred = decision_tree.predict(x_test)
y_probas = decision_tree.predict_proba(x_test)

y_pred_train = decision_tree.predict(x_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of Decision Tree: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(decision_tree,
                              x_train, 
                              x_test,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="DecisionTreeClassifier"
)

utils.save_model(decision_tree, "decision_tree_ratio")
utils.upload_model("decision_tree_ratio")

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting DecisionTreeClassifier.


Accuracy of Decision Tree: 0.9545454545454546


[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.95455
train_accuracy,1.0


In [47]:
decision_tree.get_depth()

13

# Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

# wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="train_model", name="random_forest_model")

random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)

y_pred = random_forest.predict(x_test)
y_probas = random_forest.predict_proba(x_test)

y_pred_train = random_forest.predict(x_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of Random Forest: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(random_forest,
                              x_train, 
                              x_test,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="RandomForestClassifier"
)

utils.save_model(random_forest, "random_forest_ratio")
utils.upload_model("random_forest_ratio")

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RandomForestClassifier.


Accuracy of Random Forest: 0.9772727272727273


[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.97727
train_accuracy,1.0
