## Imports

In [33]:
# DO NOT CHANGE THESE LINES.
using Suppressor
@suppress begin
    using DataFrames, CSV, Random, Statistics, Serialization, LazyJSON, StatsBase, DecisionTree, ScientificTypes, MLJ
end


## Paths

In [34]:
# DO NOT CHANGE THESE LINES 
# Filesystem layout used by this notebook. Every path below is derived from ROOT_DIR,
# which is the parent of the current working directory.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
# Inputs: INPUT_SCHEMA_DIR is scanned below for the schema JSON file and TEST_DIR for the
# test CSV file.
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
OUTPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "outputs")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")
# Model artifacts: the serialized files below are deserialized later in this notebook.
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
# NOTE(review): OHE_ENCODER_FILE is not referenced in the visible part of this notebook.
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.ser")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")
# Output: PREDICTIONS_FILE is the CSV written by the prediction step.
PREDICTIONS_DIR = joinpath(OUTPUT_DIR, "predictions")
PREDICTIONS_FILE = joinpath(PREDICTIONS_DIR, "predictions.csv")
TARGET_LEVELS = joinpath(MODEL_ARTIFACTS_PATH, "target_levels.ser")



# Make sure the artifact directories exist. `mkpath` creates any missing parent
# directories and does nothing when the directory is already present, whereas a bare
# `mkdir` throws if the parent directory has not been created yet.
mkpath(MODEL_ARTIFACTS_PATH)
mkpath(PREDICTOR_DIR_PATH)

### Reading the schema

In [35]:
# Locate the schema file (the first entry of the schema directory whose name ends in
# "json"), read it as text and parse it with LazyJSON.
file_name = first(filter(endswith("json"), readdir(INPUT_SCHEMA_DIR)))
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)
schema = LazyJSON.parse(schema_string)
features = schema["features"]

# Feature names grouped by kind. A feature is either categorical or numeric, and may
# additionally be nullable.
numeric_features, categorical_features, nullable_features = String[], String[], String[]

for spec in features
    # Anything that is not declared CATEGORICAL is treated as numeric.
    push!(
        spec["dataType"] == "CATEGORICAL" ? categorical_features : numeric_features,
        spec["name"],
    )
    spec["nullable"] && push!(nullable_features, spec["name"])
end

# Names of the id and target columns, plus the list of target classes.
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]
target_classes = schema["target"]["classes"]

19-element LazyJSON.Array{Nothing, String}:
 " 2-4-d-injury"
 " alternarialeaf-spot"
 " anthracnose"
 " bacterial-blight"
 " bacterial-pustule"
 " brown-spot"
 " brown-stem-rot"
 " charcoal-rot"
 " cyst-nematode"
 " diaporthe-pod-&-stem-blight"
 " diaporthe-stem-canker"
 " downy-mildew"
 " frog-eye-leaf-spot"
 " herbicide-injury"
 " phyllosticta-leaf-spot"
 " phytophthora-rot"
 " powdery-mildew"
 " purple-seed-stain"
 " rhizoctonia-root-rot"

### Reading test data.

In [36]:
# Load the first entry of the test directory whose name contains ".csv" into a DataFrame.
file_name = first(filter(name -> occursin(".csv", name), readdir(TEST_DIR)))
file_path = joinpath(TEST_DIR, file_name)
df = CSV.File(file_path) |> DataFrame

Unnamed: 0_level_0,id,date,plant-stand,precip,temp,hail,crop-hist
Unnamed: 0_level_1,Int64,String15,String15?,String15?,String15?,String7?,String31?
1,178,september,normal,gt-norm,norm,yes,same-lst-yr
2,603,august,normal,norm,norm,no,same-lst-sev-yrs
3,341,july,lt-normal,norm,norm,missing,same-lst-two-yrs
4,599,august,normal,gt-norm,norm,yes,same-lst-yr
5,536,july,normal,lt-norm,norm,no,same-lst-two-yrs
6,615,july,normal,gt-norm,norm,yes,same-lst-yr
7,478,july,lt-normal,gt-norm,norm,yes,same-lst-sev-yrs
8,620,september,lt-normal,gt-norm,gt-norm,yes,same-lst-sev-yrs
9,16,october,normal,lt-norm,gt-norm,no,diff-lst-year
10,224,september,lt-normal,gt-norm,norm,yes,diff-lst-year


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [37]:
# Per-feature fill values saved in IMPUTATION_FILE.
imputation_values = open(IMPUTATION_FILE) do io
    deserialize(io)
end

# Replace missing entries of every nullable feature with its stored fill value. A feature
# without a stored value falls back to `missing`, i.e. it is left as it was.
foreach(nullable_features) do name
    df[!, Symbol(name)] .= coalesce.(
        df[!, Symbol(name)], get(imputation_values, name, missing)
    )
end

# Keep the ids aside for the predictions table, then remove the id column from the
# feature frame.
ids = df[!, Symbol(id_feature)]
select!(df, Not(Symbol(id_feature)))

Unnamed: 0_level_0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged
Unnamed: 0_level_1,String15,String15,String15,String15,String7,String31,String15
1,september,normal,gt-norm,norm,yes,same-lst-yr,low-areas
2,august,normal,norm,norm,no,same-lst-sev-yrs,low-areas
3,july,lt-normal,norm,norm,yes,same-lst-two-yrs,low-areas
4,august,normal,gt-norm,norm,yes,same-lst-yr,whole-field
5,july,normal,lt-norm,norm,no,same-lst-two-yrs,whole-field
6,july,normal,gt-norm,norm,yes,same-lst-yr,scattered
7,july,lt-normal,gt-norm,norm,yes,same-lst-sev-yrs,whole-field
8,september,lt-normal,gt-norm,gt-norm,yes,same-lst-sev-yrs,low-areas
9,october,normal,lt-norm,gt-norm,no,diff-lst-year,upper-areas
10,september,lt-normal,gt-norm,norm,yes,diff-lst-year,scattered


### Encoding
We one-hot encode the categorical features using the top categories that were saved during training.

In [38]:
# Categories to encode for each feature, loaded from TOP_CATEGORIES.
loaded_top_categories = open(deserialize, TOP_CATEGORIES)

"""
    one_hot_top_categories!(df, top_categories)

Replace, in place, every feature listed in `top_categories` with Bool indicator columns.

# Arguments
- `df`: the DataFrame to modify.
- `top_categories`: iterable of `(feature, categories)` pairs, where `feature` names a
  column of `df` and `categories` is the collection of values to encode.

# Details
- A feature with exactly two categories becomes a single column `<feature>_binary`,
  which is `true` where the value equals the first category.
- Any other feature becomes one column `<feature>_<category>` per listed category.
- The original feature column is dropped afterwards.
- A `missing` entry yields `false` in every indicator, so the new columns are always
  plain Bool and never carry `missing` into the feature matrix.

# Returns
`nothing`; `df` is modified in place.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories
        column = df[!, feature]
        if length(top_cats) == 2
            # Binary case: the first category is the one encoded as `true`.
            df[!, "$(feature)_binary"] = coalesce.(column .== top_cats[1], false)
        else
            for cat in top_cats
                df[!, "$(feature)_$(cat)"] = coalesce.(column .== cat, false)
            end
        end
        select!(df, Not(Symbol(feature)))  # drop the original feature column
    end
    return nothing
end

one_hot_top_categories!(df, loaded_top_categories)

### Making predictions & Creating Predictions DataFrame
Using the model saved during training.

In [39]:
# NOTE(review): target_levels is loaded but not used below; the class labels passed to
# the forest come from the schema (target_classes). Confirm against the training notebook
# which of the two the model was actually fitted with.
target_levels = open(deserialize, TARGET_LEVELS)
model = open(deserialize, PREDICTOR_FILE_PATH)

# Class probabilities for every row of the encoded feature matrix; the result is turned
# into a table with one column per entry of target_classes.
predictions = apply_forest_proba(model, Matrix(df), target_classes)
predictions = DataFrame(predictions, Symbol.(target_classes))

# Put the ids back as the first column so each row can be matched to its input record.
insertcols!(predictions, 1, id_feature => ids)

# CSV.write does not create directories, so make sure the output directory exists.
mkpath(dirname(PREDICTIONS_FILE))
CSV.write(PREDICTIONS_FILE, predictions)

"/Users/moo/Desktop/Upwork/rt-ML/Regression/Julia/Julia-Random-Forest-Classifier-Template/model_inputs_outputs/outputs/predictions/predictions.csv"