In [24]:
using DataFrames
using JSON
using GLM 
using MLJ 
using MLJBase
using CSV
using Serialization
using MLJScientificTypes
using CategoricalArrays
using JLD2

In [25]:
# Directory layout for the inference run.
# Every path is derived from the parent of the current working directory, so the
# notebook must be started from a direct sub-folder of the project root.
ROOT_DIR = dirname(pwd())

# Inputs: schema definition and the train/test data splits
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
OUTPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "outputs")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")

# Model artifacts: encoder, fitted predictor, imputation values, top categories
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.jld2")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.json")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.json")

# Outputs: where the predictions CSV is written
PREDICTIONS_DIR = joinpath(OUTPUT_DIR, "predictions")
PREDICTIONS_FILE = joinpath(PREDICTIONS_DIR, "predictions.csv")

# `mkpath` creates any missing parent directories and does nothing when the
# directory already exists; `mkdir` would throw if the "model" folder is absent.
foreach(mkpath, (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH))

In [26]:
# Load the first JSON file found in the schema directory and pull out its
# feature definitions.
schema_files = [name for name in readdir(INPUT_SCHEMA_DIR) if endswith(name, "json")]
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema = JSON.parsefile(schema_path)
features = schema["features"]

# Split the feature names by data type: anything that is not "CATEGORICAL" is
# treated as numeric. Nullable features are collected independently of type.
categorical_features = String[f["name"] for f in features if f["dataType"] == "CATEGORICAL"]
numeric_features = String[f["name"] for f in features if f["dataType"] != "CATEGORICAL"]
nullable_features = String[f["name"] for f in features if f["nullable"]]

# Names of the ID and target columns as declared by the schema
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]

"percent_pell_grant"

In [27]:
# Read the first file in the testing directory whose name contains ".csv"
# into a DataFrame.
csv_candidates = [name for name in readdir(TEST_DIR) if occursin(".csv", name)]
file_name = csv_candidates[1]
file_path = joinpath(TEST_DIR, file_name)
df = CSV.File(file_path) |> DataFrame


Unnamed: 0_level_0,unit_id,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange
Unnamed: 0_level_1,Int64,Float64?,Float64?,Float64?,Float64?
1,422695,missing,missing,missing,missing
2,363712,missing,missing,missing,missing
3,119331,missing,missing,missing,missing
4,476142,missing,missing,missing,missing
5,194259,missing,missing,missing,missing
6,171456,22.0,22.0,21.0,missing
7,185828,26.0,24.0,26.0,missing
8,140827,missing,missing,missing,missing
9,213589,missing,missing,missing,missing
10,201982,missing,missing,missing,missing


In [28]:
# Fill missing entries of every nullable column with the value stored for that
# column in the imputation file. A column without a stored value is coalesced
# with `missing`, i.e. its missing entries are left as they are.
imputation_values = JSON.parsefile(IMPUTATION_FILE)
for column in nullable_features
    fill_value = get(imputation_values, column, missing)
    df[!, column] = coalesce.(df[!, column], fill_value)
end

# Keep the id column aside so it can be attached to the predictions later
ids = df[!, id_feature]

# Remove the id column from the feature DataFrame
select!(df, Not(id_feature))

Unnamed: 0_level_0,act_combined_midrange,act_english_midrange,act_math_midrange,act_writing_midrange
Unnamed: 0_level_1,Float64,Float64,Float64,Float64
1,23.0,22.0,22.0,7.0
2,23.0,22.0,22.0,7.0
3,23.0,22.0,22.0,7.0
4,23.0,22.0,22.0,7.0
5,23.0,22.0,22.0,7.0
6,22.0,22.0,21.0,7.0
7,26.0,24.0,26.0,7.0
8,23.0,22.0,22.0,7.0
9,23.0,22.0,22.0,7.0
10,23.0,22.0,22.0,7.0


In [29]:
# Read the saved top-categories JSON as text, then parse it
json_content = open(io -> read(io, String), TOP_CATEGORIES)
loaded_top_categories = JSON.parse(json_content)

"""
    one_hot_top_categories!(df, top_categories)

Replace each feature listed in `top_categories` with one indicator column per
listed category, modifying `df` in place.

For every `feature => categories` entry, a column named `"<feature>_<category>"`
is added holding the element-wise `==` comparison of the feature column with
that category, after which the original feature column is dropped. Values not
among the listed categories therefore compare unequal in every indicator column.
"""
function one_hot_top_categories!(df, top_categories)
    for feature in keys(top_categories)
        source_column = df[!, feature]
        for level in top_categories[feature]
            df[!, string(feature, "_", level)] = source_column .== level
        end
        # The raw feature is no longer needed once its indicators exist
        select!(df, Not(Symbol(feature)))
    end
end


one_hot_top_categories!(df, loaded_top_categories)

In [31]:
# Read the object stored under the "model" key of the predictor file
model = JLD2.jldopen(PREDICTOR_FILE_PATH, "r") do file
    file["model"]
end

# Run the model on the preprocessed features; `predict` is qualified with GLM
# explicitly
predictions = GLM.predict(model, df)

1413-element Vector{Union{Missing, Float64}}:
 0.531043764945229
 0.4903163876569154
 0.33887493931860796
 0.5853391011665144
 0.4663625588733915
 0.3650712129637893
 0.33228876093708
 0.42920581969476007
 0.7200420756168442
 0.5901079880090312
 ⋮
 0.5305145739999522
 0.49330302563080003
 0.6226623840813286
 0.56082724046062
 0.4367230232743229
 0.5689438204764994
 0.48021193189259864
 0.594499642900781
 0.5600931748413881

In [32]:
id_column_name = id_feature
prediction_column_name = "prediction"

# Build the output table: one row per input record, holding its id and the
# predicted value. Note that this rebinds `df`, replacing the feature table.
df = DataFrame()
df[!, Symbol(id_column_name)] = ids
df[!, Symbol(prediction_column_name)] = predictions

# Make sure the target directory exists; CSV.write does not create it and
# fails on a fresh output tree otherwise.
mkpath(dirname(PREDICTIONS_FILE))

# Save the DataFrame to a CSV file
CSV.write(PREDICTIONS_FILE, df)

"/Users/moo/Desktop/Upwork/rt-ML/Regression/Julia/Juila-Linear-Regression-Template/model_inputs_outputs/outputs/predictions/predictions.csv"