In [11]:
using Suppressor
@suppress begin
    using DataFrames
    using LazyJSON
    using GLM 
    using MLJ 
    using MLJBase
    using CSV
    using Serialization
    using MLJScientificTypes
    using CategoricalArrays
    using JLD2
end


In [12]:
# Directory layout. Everything lives under <parent of cwd>/model_inputs_outputs.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")

# Inputs: schema JSON plus the training / testing data splits.
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")

# Model artifacts: encoder, fitted predictor, imputation values, top categories.
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.jld2")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")

# Outputs: where the predictions CSV is written.
OUTPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "outputs")
PREDICTIONS_DIR = joinpath(OUTPUT_DIR, "predictions")
PREDICTIONS_FILE = joinpath(PREDICTIONS_DIR, "predictions.csv")

# mkpath creates any missing parent directories and is a no-op when the
# directory already exists; mkdir would throw if a parent were absent.
for dir in (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH)
    mkpath(dir)
end

In [13]:
# Locate and parse the schema: the first file in INPUT_SCHEMA_DIR whose name
# ends in ".json". Fail with a clear message instead of a BoundsError when the
# directory holds no schema file.
schema_files = filter(x -> endswith(x, ".json"), readdir(INPUT_SCHEMA_DIR))
isempty(schema_files) && error("No .json schema file found in $(INPUT_SCHEMA_DIR)")
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)  # Read file content as a string
schema = LazyJSON.parse(schema_string)
features = schema["features"]

# Split the feature names by declared data type ("CATEGORICAL" vs. everything
# else) and record which features the schema marks as nullable.
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    feature_name = f["name"]
    if f["dataType"] == "CATEGORICAL"
        push!(categorical_features, feature_name)
    else
        push!(numeric_features, feature_name)
    end
    if f["nullable"]
        push!(nullable_features, feature_name)
    end
end

# ID and target column names, materialized as plain Strings (the same
# conversion push! applies to the feature names above) so they do not keep
# referring to the lazily parsed schema text.
id_feature = convert(String, schema["id"]["name"])
target_feature = convert(String, schema["target"]["name"])

"danceability"

In [14]:
# Load the test split: the first file in TEST_DIR whose name ends in ".csv".
# endswith (rather than occursin) avoids picking up names such as
# "data.csv.bak", and the explicit check replaces an opaque BoundsError when
# the directory holds no CSV file.
csv_files = filter(x -> endswith(x, ".csv"), readdir(TEST_DIR))
isempty(csv_files) && error("No .csv file found in $(TEST_DIR)")
file_name = first(csv_files)
file_path = joinpath(TEST_DIR, file_name)
df = DataFrame(CSV.File(file_path))

Unnamed: 0_level_0,id,energy,key,loudness,mode,speechiness,acousticness
Unnamed: 0_level_1,String31,Float64,Int64,Float64,Int64,Float64,Float64
1,355kXbXrCIdm27pjd4fncE,0.652,4,-11.707,1,0.0374,0.523
2,5t8M1Ktl6TkrAAKVmYlc3E,0.856,11,-4.669,1,0.0421,0.00715
3,6pYKIB7l7bLqObaJlmKIyc,0.68,0,-14.698,0,0.0547,0.0246
4,1jtJDYwDAg2FZBiLTd5GJO,0.389,9,-13.75,1,0.0255,0.0727
5,11gI5uqSKvwquuoKHPH99V,0.662,10,-7.829,0,0.0463,0.0267
6,5yuMx81o4yhOQ0LDAI3CV3,0.264,2,-13.512,1,0.0387,0.737
7,5xMeshTXi4stt1EBhwdtyW,0.527,8,-8.028,0,0.055,0.455
8,1W8EyUzQPkVyGtAkCg1HqS,0.342,1,-10.514,1,0.0269,0.794
9,5JOrWjFDzatQ1ycDJ3Tybu,0.526,0,-5.825,1,0.0407,0.323
10,63LiOsvf1w1DSbmjbxjh4b,0.797,7,-6.627,1,0.152,0.00257


In [15]:
# Fill missing entries of every nullable column with the value stored in the
# imputation artifact. A column with no stored value is left unchanged,
# because coalescing with `missing` is a no-op.
imputation_values = open(deserialize, IMPUTATION_FILE)
for column in nullable_features
    fill_value = get(imputation_values, string(column), missing)
    df[!, Symbol(column)] .= coalesce.(df[!, Symbol(column)], fill_value)
end

# Saving the id column in a different variable
ids = df[!, Symbol(id_feature)]

# Dropping the id, and the target when the test file happens to include it,
# so that only feature columns remain in the DataFrame.
columns_to_drop = [Symbol(id_feature)]
if Symbol(target_feature) in propertynames(df)
    push!(columns_to_drop, Symbol(target_feature))
end
select!(df, Not(columns_to_drop))

Unnamed: 0_level_0,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness
Unnamed: 0_level_1,Float64,Int64,Float64,Int64,Float64,Float64,Float64,Float64
1,0.652,4,-11.707,1,0.0374,0.523,0.000117,0.134
2,0.856,11,-4.669,1,0.0421,0.00715,7.61e-6,0.0982
3,0.68,0,-14.698,0,0.0547,0.0246,6.45e-5,0.429
4,0.389,9,-13.75,1,0.0255,0.0727,1.34e-6,0.0446
5,0.662,10,-7.829,0,0.0463,0.0267,0.0,0.648
6,0.264,2,-13.512,1,0.0387,0.737,0.0,0.127
7,0.527,8,-8.028,0,0.055,0.455,0.0,0.11
8,0.342,1,-10.514,1,0.0269,0.794,0.0,0.108
9,0.526,0,-5.825,1,0.0407,0.323,0.0,0.181
10,0.797,7,-6.627,1,0.152,0.00257,0.0,0.272


In [16]:
# Read back the serialized feature => categories mapping that drives the encoding below.
loaded_top_categories = open(TOP_CATEGORIES) do io
    deserialize(io)
end

"""
    one_hot_top_categories!(df, top_categories)

Replace categorical columns of `df` with indicator columns, in place.

`top_categories` is iterated as `(feature, categories)` pairs. For each pair:

- when `categories` has exactly two entries, a single column `<feature>_binary`
  is added, holding `df[!, feature] .== categories[1]`;
- otherwise one column `<feature>_<category>` is added per entry, holding
  `df[!, feature] .== category`.

The original `feature` column is then removed from `df`. Returns `nothing`.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories
        # Pair every output column name with the category value it flags.
        encodings = if length(top_cats) == 2
            ["$(feature)_binary" => top_cats[1]]
        else
            ["$(feature)_$(cat)" => cat for cat in top_cats]
        end
        for (new_col_name, category) in encodings
            df[!, new_col_name] = df[!, feature] .== category
        end
        # The raw categorical column is no longer needed once encoded.
        select!(df, Not(Symbol(feature)))
    end
    return nothing
end



# Encode the categorical columns of the test DataFrame in place, using the
# category lists loaded from the TOP_CATEGORIES artifact.
one_hot_top_categories!(df, loaded_top_categories)

In [17]:
# Read the fitted model stored under the "model" key of the predictor file
model = JLD2.jldopen(PREDICTOR_FILE_PATH, "r") do file
    file["model"]
end

# Score the prepared feature table with the loaded model
predictions = GLM.predict(model, df)

1786-element Vector{Union{Missing, Float64}}:
 0.6146136662128445
 0.6717073267021385
 0.62262870686972
 0.6631447171077764
 0.47298372140353706
 0.5095759991533177
 0.666907511316748
 0.458931931881122
 0.5992937449955925
 0.7898685508954684
 ⋮
 0.6840166594294237
 0.7112754197135341
 0.6761309679917424
 0.5582199667476604
 0.7002823391096924
 0.49638774200679464
 0.6533891978615152
 0.5432435243902574
 0.5036601650907818

In [18]:
id_column_name = id_feature
prediction_column_name = "prediction"

# Assemble the output table: one row per test record, holding the id saved
# earlier next to its prediction.
df = DataFrame()
df[!, Symbol(id_column_name)] = ids
df[!, Symbol(prediction_column_name)] = predictions

# Make sure the output directory exists; writing the file fails if any parent
# directory of PREDICTIONS_FILE is missing.
mkpath(dirname(PREDICTIONS_FILE))

# Save the DataFrame to a CSV file
CSV.write(PREDICTIONS_FILE, df)

"/Users/moo/Desktop/Upwork/rt-ML/Regression/Julia/Juila-Linear-Regression-Template/model_inputs_outputs/outputs/predictions/predictions.csv"