In [1]:
import wandb
wandb.login()

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import params

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33marihantsheth[0m ([33mtri-nit[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="baseline_model", name="get_data")

In [4]:
def get_df_wandb(name):
    data_artifact = wandb.use_artifact(f"{params.DATA_ART}:latest")
    table = data_artifact.get(f"{name}_data_table")
    df = pd.DataFrame(data=table.data, columns=table.columns)

    return df

def encode_crop(df_crop):
    label_encoder = LabelEncoder()

    encoded_crop = label_encoder.fit_transform(df_crop["label"])

    dict_crop={}
    for i in range(len(encoded_crop)):
        dict_crop[df_crop.loc[i, "label"]]=encoded_crop[i]
    
    df_crop["label"] = df_crop["label"].map(dict_crop)

    return df_crop, dict_crop

def split_data(df, test_size=0.2, random_state=42):
    x = np.array(df.iloc[:, :-1].copy())
    y = np.array(df.iloc[:, -1:].copy())

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size = test_size,
                                                    random_state = random_state)

    y_train = np.reshape(y_train, (-1,))
    y_test = np.reshape(y_test, (-1,))

    return x_train, x_test, y_train, y_test

def standardize(x):
    scaler = StandardScaler().fit(x)
    x_scaled = scaler.transform(x)

    return x_scaled

In [5]:
df_crop = get_df_wandb("crop")
df_crop, dict_crop = encode_crop(df_crop)
x_train, x_test, y_train, y_test = split_data(df_crop)

x_train_scaled = standardize(x_train)
x_test_scaled = standardize(x_test)

# wandb.log({"dict_crop": wandb.Table(dataframe=pd.DataFrame.from_dict(dict_crop, orient="index", columns=["label"]))})

wandb.finish()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


# Logistic Regression

In [9]:
wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="train_model", name="logistic_regression_model")

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train_scaled, y_train)

y_pred = logistic_regression.predict(x_test_scaled)
y_probas = logistic_regression.predict_proba(x_test_scaled)

y_pred_train = logistic_regression.predict(x_train_scaled)
train_accuracy = accuracy_score(y_train, y_pred_train)

test_accuracy = accuracy_score(y_test, y_pred)
labels = [x for x in dict(sorted(dict_crop.items(), key= lambda x: x[1])).keys()]

print(f"Accuracy of Logistic Regression: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(logistic_regression,
                              x_train_scaled, 
                              x_test_scaled,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="LogisticRegression"
)

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting LogisticRegression.
[34m[1mwandb[0m: Logged feature importances.


Accuracy of Logistic Regression: 0.9545454545454546


[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.95455
train_accuracy,0.97784


# Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="train_model", name="decision_tree_model")

decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train_scaled, y_train)

y_pred = decision_tree.predict(x_test_scaled)
y_probas = decision_tree.predict_proba(x_test_scaled)

y_pred_train = decision_tree.predict(x_train_scaled)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of Decision Tree: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(decision_tree,
                              x_train_scaled, 
                              x_test_scaled,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="DecisionTreeClassifier"
)

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting DecisionTreeClassifier.


Accuracy of Decision Tree: 0.9454545454545454


[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.94545
train_accuracy,1.0


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

wandb.init(project=params.PROJECT, entity=params.ENTITY, job_type="train_model", name="random_forest_model")

random_forest = RandomForestClassifier()
random_forest.fit(x_train_scaled, y_train)

y_pred = random_forest.predict(x_test_scaled)
y_probas = random_forest.predict_proba(x_test_scaled)

y_pred_train = random_forest.predict(x_train_scaled)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of Random Forest: {test_accuracy}")

wandb.log({
    "test_accuracy": test_accuracy,
    "train_accuracy": train_accuracy
})

wandb.sklearn.plot_classifier(random_forest,
                              x_train_scaled, 
                              x_test_scaled,
                              y_train,
                              y_test,
                              y_pred,
                              y_probas,
                              labels,
                              is_binary=False,
                              model_name="RandomForestClassifier"
)

wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting RandomForestClassifier.


Accuracy of Random Forest: 0.9795454545454545


[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


0,1
test_accuracy,▁
train_accuracy,▁

0,1
test_accuracy,0.97955
train_accuracy,1.0


In [None]:
# Saving model as a pickle file
import pickle

with open("../models/random_forest_model.pkl", "wb") as f:
    pickle.dump(random_forest, f)