## Imports

In [8]:
# DO NOT CHANGE THESE LINES.
using Suppressor
@suppress begin
    using DataFrames
    using LazyJSON
    using GLM 
    using MLJ 
    using MLJBase
    using CSV
    using Serialization
    using MLJScientificTypes
    using CategoricalArrays
    using MLJLinearModels
end


## Paths

In [9]:
# DO NOT CHANGE THESE LINES 
# All paths are derived from ROOT_DIR, the parent of the current working
# directory, so the notebook must be started from a direct child of the root.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
# Inputs: schema JSON plus training/testing data directories
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
OUTPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "outputs")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")
# Model artifacts: serialized files that live under model/artifacts
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.ser")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.ser")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")
# Outputs: the predictions CSV is written to outputs/predictions
PREDICTIONS_DIR = joinpath(OUTPUT_DIR, "predictions")
PREDICTIONS_FILE = joinpath(PREDICTIONS_DIR, "predictions.csv")
TARGET_LEVELS = joinpath(MODEL_ARTIFACTS_PATH, "target_levels.ser")
TARGET_MAPPING = joinpath(MODEL_ARTIFACTS_PATH, "target_mapping.ser")


# Make sure the artifact directories exist.
# `mkpath` creates any missing parent directories and does nothing when the
# directory is already there; plain `mkdir` throws if the parent (for example
# MODEL_PATH) has not been created yet.
# `foreach` returns `nothing`, so the cell still produces no output.
foreach(mkpath, (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH))

### Reading the schema

In [10]:
# Locate the schema: the first file in INPUT_SCHEMA_DIR whose name ends in "json".
schema_files = filter(x -> endswith(x, "json"), readdir(INPUT_SCHEMA_DIR))
# Fail with an actionable message instead of an opaque error from `first`
# on an empty collection when the schema file is missing.
isempty(schema_files) && error("No JSON schema file found in $(INPUT_SCHEMA_DIR)")
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)  # Read file content as a string
schema = LazyJSON.parse(schema_string)
features = schema["features"]

# Feature names grouped by role. A feature is categorical when its "dataType"
# is "CATEGORICAL"; every other data type is treated as numeric. Nullable
# features are collected independently of their data type.
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    if f["dataType"] == "CATEGORICAL"
        push!(categorical_features, f["name"])
    else
        push!(numeric_features, f["name"])
    end
    if f["nullable"]
        push!(nullable_features, f["name"])
    end
end

# Names of the id and target columns, and the list of target classes
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]
target_classes = schema["target"]["classes"]

# Only defined for binary targets: the first listed class is taken as the
# negative class and the second as the positive class.
if length(target_classes) == 2
    negative_class = target_classes[1]
    positive_class = target_classes[2]
end



"M"

### Reading test data.

In [11]:
# Pick the first entry of the testing directory whose name contains ".csv"
# and load it into a DataFrame.
csv_candidates = filter(name -> occursin(".csv", name), readdir(TEST_DIR))
file_name = csv_candidates[1]
file_path = joinpath(TEST_DIR, file_name)
df = CSV.File(file_path) |> DataFrame

Unnamed: 0_level_0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64
1,87930,12.47,18.6,81.09,481.9,0.09965
2,859575,18.94,21.31,123.6,1130.0,0.09009
3,8670,15.46,19.48,101.7,748.9,0.1092
4,907915,12.4,17.68,81.47,467.8,0.1054
5,921385,11.54,14.44,74.65,402.9,0.09984
6,927241,20.6,29.33,140.1,1265.0,0.1178
7,9012000,22.01,21.9,147.2,1482.0,0.1063
8,853201,17.57,15.05,115.0,955.1,0.09847
9,8611161,13.34,15.86,86.49,520.0,0.1078
10,911673,13.9,16.62,88.97,599.4,0.06828


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [12]:
# Load the fill values that were saved to IMPUTATION_FILE
imputation_values = open(deserialize, IMPUTATION_FILE)

# Replace missing entries in every nullable column with its stored fill value.
# A column without a stored value falls back to `missing`, i.e. is left as-is.
for column in nullable_features
    fill_value = get(imputation_values, column, missing)
    df[!, Symbol(column)] .= coalesce.(df[!, Symbol(column)], fill_value)
end

# Saving the id column in a different variable
ids = df[!, Symbol(id_feature)]

# Dropping the id (and the target, when the test file happens to contain it)
# from the DataFrame so that only feature columns reach the model.
columns_to_drop = Symbol[Symbol(id_feature)]
if Symbol(target_feature) in propertynames(df)
    push!(columns_to_drop, Symbol(target_feature))
end
select!(df, Not(columns_to_drop))

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64
1,12.47,18.6,81.09,481.9,0.09965,0.1058
2,18.94,21.31,123.6,1130.0,0.09009,0.1029
3,15.46,19.48,101.7,748.9,0.1092,0.1223
4,12.4,17.68,81.47,467.8,0.1054,0.1316
5,11.54,14.44,74.65,402.9,0.09984,0.112
6,20.6,29.33,140.1,1265.0,0.1178,0.277
7,22.01,21.9,147.2,1482.0,0.1063,0.1954
8,17.57,15.05,115.0,955.1,0.09847,0.1157
9,13.34,15.86,86.49,520.0,0.1078,0.1535
10,13.9,16.62,88.97,599.4,0.06828,0.05319


### Encoding
We one-hot encode the categorical features using the same top categories that were saved during training.

In [13]:
# Per-feature category lists read back from the TOP_CATEGORIES file
loaded_top_categories = open(TOP_CATEGORIES) do io
    deserialize(io)
end

"""
    one_hot_top_categories!(df, top_categories)

Replace categorical columns of `df` with indicator columns, in place.

`top_categories` is iterated as `(feature, categories)` pairs. For each pair:

- when exactly two categories are listed, a single column `<feature>_binary`
  is added, holding `df[!, feature] .== categories[1]`;
- otherwise one column `<feature>_<category>` is added per listed category,
  holding `df[!, feature] .== category`.

The original `feature` column is then removed from `df`. Returns `nothing`.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories
        source_column = df[!, feature]
        if length(top_cats) != 2
            # General case: one indicator column per listed category
            for cat in top_cats
                df[!, string(feature, "_", cat)] = source_column .== cat
            end
        else
            # Binary case: the first listed category is encoded as `true`
            df[!, string(feature, "_binary")] = source_column .== top_cats[1]
        end
        # The raw column is no longer needed once its indicators exist
        select!(df, Not(Symbol(feature)))
    end
    return nothing
end



one_hot_top_categories!(df, loaded_top_categories)

### Making predictions
Using the model saved during training.

In [19]:
# Load the saved model and the target levels stored alongside it
model = open(deserialize, PREDICTOR_FILE_PATH)
target_levels = open(deserialize, TARGET_LEVELS)

# Predictions for every row of the preprocessed test data
probabilities = MLJ.predict(model, df)

# Evaluate `pdf` of every prediction at each target level; each level yields
# one column, and columns follow the order of `target_levels`.
probs_matrix = hcat([pdf.(probabilities, level) for level in target_levels]...)

# Assemble the output table: one probability column per level, plus the ids
result_df = DataFrame(probs_matrix, Symbol.(target_levels))
result_df[!, id_feature] = ids

# CSV.write does not create missing directories, so make sure the output
# directory exists before writing the predictions file.
mkpath(PREDICTIONS_DIR)
CSV.write(PREDICTIONS_FILE, result_df)


"/Users/moo/Desktop/Upwork/rt-ML/Julia/Julia-Logistic-Regression-Template/model_inputs_outputs/outputs/predictions/predictions.csv"