## Imports

In [9]:
# DO NOT CHANGE THESE LINES
using Suppressor
@suppress begin
    using DataFrames, CSV, Random, Statistics, Serialization, LazyJSON, StatsBase, DecisionTree, ScientificTypes, MLJ
end

## Paths

In [2]:
# DO NOT CHANGE THESE LINES
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.ser")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")
TARGET_LEVELS = joinpath(MODEL_ARTIFACTS_PATH, "target_levels.ser")

if !isdir(MODEL_ARTIFACTS_PATH)
    mkdir(MODEL_ARTIFACTS_PATH)
end
if !isdir(PREDICTOR_DIR_PATH)
    mkdir(PREDICTOR_DIR_PATH)
end

### Reading the schema
The schema contains metadata about the datasets. We will use the schema to get the type of each feature (NUMERIC or CATEGORICAL) and the names of the id and target features; this will be helpful in the preprocessing stage.

In [3]:
# Locate the schema file, parse it, and split the feature names by data type.
# The suffix check includes the dot (".json") so it matches only real JSON
# file extensions, mirroring the ".csv" check used for the training data.
schema_files = filter(x -> endswith(x, ".json"), readdir(INPUT_SCHEMA_DIR))
isempty(schema_files) && error("No JSON schema file found in $(INPUT_SCHEMA_DIR)")
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)
schema = LazyJSON.parse(schema_string)

features = schema["features"]
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    # Any feature whose dataType is not "CATEGORICAL" is treated as numeric
    if f["dataType"] == "CATEGORICAL"
        push!(categorical_features, f["name"])
    else
        push!(numeric_features, f["name"])
    end
    # Nullable features are the ones imputed later in the notebook
    if f["nullable"]
        push!(nullable_features, f["name"])
    end
end

# Extracting ID and target features.
# Converted to plain `String`s (the same conversion `push!` performs for the
# lists above) so they do not remain lazy views into `schema_string`.
id_feature = convert(String, schema["id"]["name"])
target_feature = convert(String, schema["target"]["name"])


"disease"

### Reading training data

In [4]:
# Load the training set from the first ".csv" file listed in the training directory
file_name = filter(name -> endswith(name, ".csv"), readdir(TRAIN_DIR)) |> first
file_path = joinpath(TRAIN_DIR, file_name)
train_df = CSV.File(file_path) |> DataFrame

Unnamed: 0_level_0,id,date,plant-stand,precip,temp,hail,crop-hist
Unnamed: 0_level_1,Int64,String15?,String15?,String15?,String15?,String7?,String31?
1,268,september,normal,gt-norm,gt-norm,yes,same-lst-two-yrs
2,204,july,lt-normal,norm,gt-norm,yes,same-lst-sev-yrs
3,664,september,missing,missing,missing,missing,missing
4,391,september,normal,lt-norm,gt-norm,yes,same-lst-two-yrs
5,434,september,normal,gt-norm,norm,yes,same-lst-two-yrs
6,218,october,normal,gt-norm,gt-norm,yes,same-lst-sev-yrs
7,146,july,normal,gt-norm,norm,yes,same-lst-sev-yrs
8,509,september,normal,gt-norm,gt-norm,yes,same-lst-yr
9,262,august,normal,gt-norm,gt-norm,yes,same-lst-yr
10,508,july,normal,gt-norm,lt-norm,yes,same-lst-sev-yrs


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> The median value will be used to impute missing values of the numeric features and the mode will be used to impute categorical features.

##### You can add your own preprocessing steps such as:
<ul>
<li>Normalization</li> <br>
<li>Outlier removal</li><br>
<li>Dropping or adding features</li><br>
</ul>

### Important note:
<p> 
Saving the values used for imputation during the training step is crucial. These values will be used to impute missing data in the testing set. This is very important to avoid the well-known problem of data leakage. During testing, you should not make any assumptions about the data in hand; instead, anything needed during the testing phase should be learned from the training phase. This is why we are creating a dictionary of the values used during training, so they can be reused during testing.
</p>


In [5]:
# Learn one fill value per nullable column from the training data, apply it,
# and record it in `imputation_values`.
imputation_values = Dict{String, Any}()

for col in nullable_features
    observed = skipmissing(train_df[!, col])
    # Median for numeric columns, most frequent value (mode) for all others
    fill_value = col in numeric_features ? median(observed) : mode(observed)
    imputation_values[col] = fill_value
    train_df[!, col] = coalesce.(train_df[!, col], fill_value)
end

# Write the imputation_values dictionary to a binary file
serialize(IMPUTATION_FILE, imputation_values)


##### Encoding Categorical features
<p>
The id column is just an identifier for the training example, so we will exclude it during the encoding phase.<br>
The target feature will be label encoded in the next step.
</p>


In [6]:
# Store the target column in a variable.
# Indexing rows with `:` returns a copy, so `target` is unaffected when the
# column is removed from `train_df` below.
target = train_df[:, target_feature]

# Drop the target column from the dataframe in place so it is not encoded
# or used as an input feature
select!(train_df, Not(target_feature))

"""
    get_top_categories(df, features, n=3)

Return a `Dict` that maps each name in `features` to the (at most) `n` most
frequent values found in that column of `df`, ordered from most to least
frequent. Columns with fewer than `n` distinct values contribute all of them.
"""
function get_top_categories(df, features, n=3)
    top_cats = Dict()
    for feature in features
        frequencies = collect(StatsBase.countmap(df[!, feature]))
        ranked = sort(frequencies, by=last, rev=true)
        # `min(n, end)` caps the slice at the number of distinct values
        top_cats[feature] = map(first, ranked[1:min(n, end)])
    end
    return top_cats
end

# Get the most frequent categories (up to the function's default of 3) for
# every categorical feature, computed on the training data
top_categories = get_top_categories(train_df, categorical_features)


Dict{Any, Any} with 35 entries:
  "leaf-mild"      => String15[" absent", " lower-surf", " upper-surf"]
  "seed-discolor"  => String15[" absent", " present"]
  "fruit-pods"     => String15[" norm", " diseased", " dna"]
  "roots"          => String15[" norm", " rotted", " galls-cysts"]
  "precip"         => String15[" gt-norm", " norm", " lt-norm"]
  "lodging"        => String7[" yes", " no"]
  "stem-cankers"   => String15[" absent", " above-sec-nde", " below-soil"]
  "leaf-malf"      => String15[" absent", " present"]
  "temp"           => String15[" norm", " gt-norm", " lt-norm"]
  "area-damaged"   => String15[" low-areas", " whole-field", " upper-areas"]
  "shriveling"     => String15[" absent", " present"]
  "crop-hist"      => String31[" same-lst-two-yrs", " same-lst-sev-yrs", " same…
  "mold-growth"    => String15[" absent", " present"]
  "mycelium"       => String15[" absent", " present"]
  "seed-size"      => String15[" norm", " lt-norm"]
  "hail"           => String7[" yes", " 

In [7]:
"""
    one_hot_top_categories!(df, top_categories)

Replace, in place, every feature column of `df` listed in `top_categories`
with boolean indicator columns.

A feature with exactly two retained categories becomes a single
`<feature>_binary` column that is `true` where the value equals the first
retained category. Any other feature gets one `<feature>_<category>` column
per retained category. The original feature column is dropped afterwards.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories
        source = df[!, feature]
        if length(top_cats) == 2
            # Binary case: one indicator for the first category is sufficient
            df[!, Symbol("$(feature)_binary")] = source .== top_cats[1]
        else
            for cat in top_cats
                df[!, Symbol("$(feature)_$(cat)")] = source .== cat
            end
        end
        # The raw column is no longer needed once its indicators exist
        select!(df, Not(Symbol(feature)))
    end
    return nothing
end

# Encode the categorical columns of the training frame in place
one_hot_top_categories!(train_df, top_categories)

# Write the top_categories dictionary to a binary file
# (presumably so the same encoding can be reproduced later)
serialize(TOP_CATEGORIES, top_categories)


### Encoding the target feature

In [10]:
# Record the distinct target labels and write them to a binary file
target_levels = levels(target)
serialize(TARGET_LEVELS, target_levels)

# Convert the labels to a categorical vector, then take their integer codes
target = categorical(target)
encoded_target = int(target)


546-element Vector{UInt32}:
 0x0000000d
 0x0000000f
 0x00000001
 0x00000007
 0x00000006
 0x00000002
 0x00000006
 0x00000012
 0x0000000d
 0x00000012
 0x00000004
 0x00000007
 0x00000010
          ⋮
 0x0000000d
 0x00000002
 0x00000007
 0x00000010
 0x00000010
 0x0000000d
 0x00000006
 0x00000010
 0x00000002
 0x00000003
 0x00000010
 0x00000010

### Training the Classifier
We chose a Random Forest classifier, but feel free to try your own model and compare the results.

In [14]:
# Feature matrix: every column of the encoded training frame except the id
x_train = Matrix(train_df[:, Not([id_feature])])

# Random forest settings, passed positionally to `build_forest` in this order:
# 2 random features, 10 trees, 0.5 portion of samples per tree, maximum tree depth of 6
forest_params = (n_subfeatures = 2, n_trees = 10, partial_sampling = 0.5, max_depth = 6)
model = build_forest(target, x_train, forest_params...)

# Write the trained forest to a binary file
serialize(PREDICTOR_FILE_PATH, model)

In [15]:
# Display the target vector (converted to a categorical array earlier in the notebook)
target

546-element CategoricalArrays.CategoricalArray{String31,1,UInt32}:
 String31(" frog-eye-leaf-spot")
 String31(" phyllosticta-leaf-spot")
 String31(" 2-4-d-injury")
 String31(" brown-stem-rot")
 String31(" brown-spot")
 String31(" alternarialeaf-spot")
 String31(" brown-spot")
 String31(" purple-seed-stain")
 String31(" frog-eye-leaf-spot")
 String31(" purple-seed-stain")
 String31(" bacterial-blight")
 String31(" brown-stem-rot")
 String31(" phytophthora-rot")
 ⋮
 String31(" frog-eye-leaf-spot")
 String31(" alternarialeaf-spot")
 String31(" brown-stem-rot")
 String31(" phytophthora-rot")
 String31(" phytophthora-rot")
 String31(" frog-eye-leaf-spot")
 String31(" brown-spot")
 String31(" phytophthora-rot")
 String31(" alternarialeaf-spot")
 String31(" anthracnose")
 String31(" phytophthora-rot")
 String31(" phytophthora-rot")