## Imports

In [11]:
#DO NOT CHANGE THESE LINES
using Suppressor
@suppress begin
    using DataFrames
    using LazyJSON
    using GLM 
    using MLJ 
    using MLJBase
    using CSV
    using Serialization
    using MLJScientificTypes
    using CategoricalArrays
    # using JLD2
    using StatsBase
    using StatsModels
    using MLJLinearModels
end

## Paths

In [12]:
# DO NOT CHANGE THESE LINES
# All locations are derived from ROOT_DIR, which is the parent of the current
# working directory.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
# Input side: schema directory and the training/testing data directories.
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")
# Output side: everything below lives under <MODEL_INPUTS_OUTPUTS>/model/artifacts.
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
# NOTE(review): OHE_ENCODER_FILE is not written anywhere in the visible cells;
# presumably kept for compatibility with other scripts -- confirm.
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.ser")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.ser")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")
TARGET_LEVELS = joinpath(MODEL_ARTIFACTS_PATH, "target_levels.ser")
# NOTE(review): TARGET_MAPPING is only referenced from commented-out code in the
# visible cells.
TARGET_MAPPING = joinpath(MODEL_ARTIFACTS_PATH, "target_mapping.ser")

# Make sure the artifact directories exist before anything is written to them.
# `mkpath` also creates missing parent directories (e.g. MODEL_PATH, which a plain
# `mkdir` would require to exist already) and does nothing when the directory is
# already there, so no `isdir` guard is needed.
foreach(mkpath, (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH))

### Reading the schema
The schema contains metadata about the datasets. We will use the schema to get the type of each feature (NUMERIC or CATEGORICAL) as well as the id and target features. This information will be helpful in the preprocessing stage.

In [13]:
# Read the schema from the first `.json` file (in `readdir` order) found in
# INPUT_SCHEMA_DIR. The suffix includes the dot so that only real JSON files match,
# and a missing schema is reported explicitly instead of failing inside `first`.
schema_files = filter(x -> endswith(x, ".json"), readdir(INPUT_SCHEMA_DIR))
isempty(schema_files) && error("No .json schema file found in $(INPUT_SCHEMA_DIR)")
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)  # Read file content as a string
schema = LazyJSON.parse(schema_string)  # Parse using LazyJSON

features = schema["features"]

# Names of the numeric, categorical, and nullable features
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    # Any feature whose dataType is not "CATEGORICAL" is treated as numeric
    if f["dataType"] == "CATEGORICAL"
        push!(categorical_features, f["name"])
    else
        push!(numeric_features, f["name"])
    end
    # A feature can be nullable regardless of its data type
    if f["nullable"]
        push!(nullable_features, f["name"])
    end
end

# Extracting ID and target features
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]
target_classes = schema["target"]["classes"]

# For binary targets the first listed class is taken as the negative class and the
# second as the positive class; these two names are not defined otherwise.
if length(target_classes) == 2
    negative_class = target_classes[1]
    positive_class = target_classes[2]
end


"M"

### Reading training data

In [14]:
# Pick the training file: the first entry of TRAIN_DIR whose name ends in ".csv"
csv_names = filter(endswith(".csv"), readdir(TRAIN_DIR))
file_name = first(csv_names)
file_path = joinpath(TRAIN_DIR, file_name)

# Load the file into a DataFrame
df = DataFrame(CSV.File(file_path))

# Preview the first five rows
first(df, 5)


Unnamed: 0_level_0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean
Unnamed: 0_level_1,Int64,String1,Float64,Float64,Float64,Float64,Float64
1,859471,B,9.029,17.33,58.79,250.5,0.1066
2,873593,M,21.09,26.57,142.7,1311.0,0.1141
3,859196,B,9.173,13.86,59.2,260.9,0.07721
4,88466802,B,10.65,25.22,68.01,347.0,0.09657
5,858970,B,10.17,14.88,64.55,311.9,0.1134


## Data Preprocessing
Data preprocessing is very important before training the model, as the data may contain missing values in some cells. Moreover, most of the learning algorithms cannot work with categorical data, thus the data has to be encoded.

In this section we will impute the missing values and encode the categorical features. Afterwards the data will be ready to train the model.

##### Imputing missing data
> The median value will be used to impute missing values of the numeric features and the mode will be used to impute categorical features.

##### You can add your own preprocessing steps such as:
<ul>
<li>Normalization</li> <br>
<li>Outlier removal</li><br>
<li>Dropping or adding features</li><br>
</ul>

### Important note:
<p> 
Saving the values used for imputation during the training step is crucial. These values will be used to impute missing data in the testing set, which avoids the well-known problem of data leakage. During testing, you should not make any assumptions about the data at hand; instead, anything needed during the testing phase should be learned during the training phase. This is why we create a dictionary of the values used during training and reuse these values during testing.
</p>


In [15]:
# Learn one fill value per nullable column from the training data, apply it to
# `df`, and record it in `imputation_values` (column name => fill value).
imputation_values = Dict{String, Any}()

for col in nullable_features
    observed = skipmissing(df[!, col])
    # Median for numeric columns, most frequent value for all other columns
    fill_value = col in numeric_features ? median(observed) : mode(observed)
    df[!, col] = coalesce.(df[!, col], fill_value)
    imputation_values[col] = fill_value
end

# Persist the learned fill values to IMPUTATION_FILE in Julia's binary
# serialization format
serialize(IMPUTATION_FILE, imputation_values)


In [16]:
# Resolve the id and target column identifiers once
id_column = Symbol(id_feature)
target_column = Symbol(target_feature)

# Keep the id and target columns in their own variables
ids = df[!, id_column]
target = df[!, target_column]

# Remove both columns from the DataFrame in place
select!(df, Not([id_column, target_column]))

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64
1,9.029,17.33,58.79,250.5,0.1066,0.1413
2,21.09,26.57,142.7,1311.0,0.1141,0.2832
3,9.173,13.86,59.2,260.9,0.07721,0.08751
4,10.65,25.22,68.01,347.0,0.09657,0.07234
5,10.17,14.88,64.55,311.9,0.1134,0.08061
6,14.54,27.54,96.73,658.8,0.1139,0.1595
7,14.41,19.73,96.03,651.0,0.08757,0.1676
8,11.43,15.39,73.06,399.8,0.09639,0.06889
9,12.25,17.94,78.27,460.3,0.08654,0.06679
10,19.89,20.26,130.5,1214.0,0.1037,0.131


##### Encoding Categorical features
<p>
The id column is just an identifier for the training example, so we will exclude it during the encoding phase.<br>
Target feature will be label encoded in the next step.
</p>


In [17]:
"""
    get_top_categories(df, features, n=10)

Return a `Dict` that maps every name in `features` to the (at most) `n` most
frequent values found in that column of `df`, ordered from most to least frequent.
"""
function get_top_categories(df, features, n=10)
    top_cats = Dict()
    for name in features
        # (value => count) pairs, most frequent first
        frequencies = collect(countmap(df[!, name]))
        sort!(frequencies; by=last, rev=true)

        # Columns with fewer than `n` distinct values keep all of them
        leading = frequencies[begin:min(n, end)]

        top_cats[name] = [entry.first for entry in leading]
    end
    return top_cats
end

# Collect the 10 most frequent categories of every categorical feature
top_categories = get_top_categories(df, categorical_features, 10)

"""
    one_hot_top_categories!(df, top_categories)

Replace, in place, every feature listed in `top_categories` by indicator columns.

`top_categories` is iterated as `(feature, categories)` pairs. A feature with exactly
two categories yields a single column `<feature>_binary` that is `true` where the
value equals the first category. Any other feature yields one column
`<feature>_<category>` per listed category. The original feature column is dropped
afterwards. Returns `nothing`.
"""
function one_hot_top_categories!(df, top_categories)
    for (name, kept) in top_categories
        source = df[!, name]
        if length(kept) == 2
            # Binary case: one indicator for the first listed category is enough
            df[!, "$(name)_binary"] = source .== kept[1]
        else
            # General case: one indicator column per listed category
            for level in kept
                df[!, "$(name)_$(level)"] = source .== level
            end
        end
        # The raw column has been replaced by its indicators
        select!(df, Not(Symbol(name)))
    end
    return nothing
end

# Replace the categorical columns of `df` by their indicator columns
one_hot_top_categories!(df, top_categories)

# Persist the `top_categories` dictionary to TOP_CATEGORIES in Julia's binary
# serialization format
serialize(TOP_CATEGORIES, top_categories)


### Encoding target feature

In [18]:
# The target is turned into a categorical vector for every number of classes;
# no separate 0/1 encoding or class mapping is produced for binary targets.
target = categorical(target)

# Persist the levels of the categorical target to TARGET_LEVELS
serialize(TARGET_LEVELS, levels(target))

2

### Training the Classifier
We use a logistic regression model here, but feel free to try your own and compare the results.

In [19]:
# Load the LogisticClassifier model type from MLJLinearModels. The result is bound
# to `model` here; `model` is rebound to the machine further down.
model = MLJ.@load LogisticClassifier pkg=MLJLinearModels
logistic_model = LogisticClassifier()

# Columns with an Integer element type (this includes Bool indicator columns, as
# Bool <: Integer) are converted to Float64 vectors before training.
integer_columns = filter(name -> eltype(df[!, name]) <: Integer, names(df))
for name in integer_columns
    df[!, name] = convert(Vector{Float64}, df[!, name])
end

# Bind the classifier to the training data and fit it
model = machine(logistic_model, df, target)
fit!(model)

# Persist the fitted machine to PREDICTOR_FILE_PATH
serialize(PREDICTOR_FILE_PATH, model)



import MLJLinearModels ✔


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining [34mMachine{LogisticClassifier,…} @267[39m.
[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mSolver: LBFGS{Optim.Options{Float64, Nothing}, NamedTuple{(), Tuple{}}}
[36m[1m│ [22m[39m  optim_options: Optim.Options{Float64, Nothing}
[36m[1m└ [22m[39m  lbfgs_options: NamedTuple{(), Tuple{}} NamedTuple()
