## Imports

In [20]:
# DO NOT CHANGE THESE LINES.
using Suppressor
@suppress begin
    using DataFrames
    using LazyJSON
    using GLM 
    using MLJ 
    using MLJBase
    using CSV
    using Serialization
    using MLJScientificTypes
    using CategoricalArrays
end


## Paths

In [21]:
# DO NOT CHANGE THESE LINES 
# Every location below is built from ROOT_DIR, the parent of the current
# working directory.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")
# Inputs: schema directory and the training / testing data directories.
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
OUTPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "outputs")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")
# Model artifacts: everything lives under model/artifacts.
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.jld2")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.ser")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.ser")
# Outputs: the predictions CSV is written under outputs/predictions.
PREDICTIONS_DIR = joinpath(OUTPUT_DIR, "predictions")
PREDICTIONS_FILE = joinpath(PREDICTIONS_DIR, "predictions.csv")


# Make sure the artifact directories exist. `mkpath` creates any missing
# parent directories (plain `mkdir` throws when the parent is absent) and
# does nothing when the directory is already there, so no `isdir` check is
# needed. `foreach` returns `nothing`, which keeps the cell output empty.
foreach(mkpath, (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH))

### Reading the schema

In [22]:
# Locate the schema file (first directory entry whose name ends in "json"),
# read it as text and parse it
file_name = first([name for name in readdir(INPUT_SCHEMA_DIR) if endswith(name, "json")])
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema_string = read(schema_path, String)
schema = LazyJSON.parse(schema_string)
features = schema["features"]

# Feature names grouped by kind; a feature can additionally be nullable
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    # Anything that is not declared CATEGORICAL is treated as numeric
    push!(
        f["dataType"] == "CATEGORICAL" ? categorical_features : numeric_features,
        f["name"],
    )
    f["nullable"] && push!(nullable_features, f["name"])
end

# Names of the id and target fields as declared by the schema
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]

"SalePrice"

### Reading test data.

In [23]:
# Take the first entry of the testing directory whose name contains ".csv"
# and load it into a DataFrame
file_name = [name for name in readdir(TEST_DIR) if occursin(".csv", name)][1]
file_path = joinpath(TEST_DIR, file_name)
df = CSV.File(file_path) |> DataFrame

Unnamed: 0_level_0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape
Unnamed: 0_level_1,Int64,Int64,String7,Float64?,Int64,String7,String7?,String3
1,706,190,RM,70.0,5600,Pave,missing,Reg
2,39,20,RL,68.0,7922,Pave,missing,Reg
3,762,30,RM,60.0,6911,Pave,missing,Reg
4,105,50,RM,missing,7758,Pave,missing,Reg
5,347,20,RL,missing,12772,Pave,missing,IR1
6,135,20,RL,78.0,10335,Pave,missing,IR1
7,596,20,RL,69.0,11302,Pave,missing,IR1
8,1427,60,RL,81.0,10944,Pave,missing,IR1
9,52,50,RM,52.0,6240,Pave,missing,Reg
10,1366,60,FV,missing,7500,Pave,missing,Reg


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [24]:
# Restore the fill values stored in the imputation artifact
imputation_values = open(IMPUTATION_FILE) do io
    deserialize(io)
end

# Fill the gaps of every nullable feature. A feature without a stored fill
# value falls back to `missing`, which leaves its values as they are.
foreach(nullable_features) do column
    col = Symbol(column)
    fill_value = get(imputation_values, string(column), missing)
    df[!, col] .= coalesce.(df[!, col], fill_value)
end

# Keep the identifiers aside before the id column is removed
ids = df[!, Symbol(id_feature)]

# Remove the id column from the feature table (in place)
select!(df, Not(Symbol(id_feature)))

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour
Unnamed: 0_level_1,Int64,String7,Float64,Int64,String7,String7,String3,String3
1,190,RM,70.0,5600,Pave,Grvl,Reg,Lvl
2,20,RL,68.0,7922,Pave,Grvl,Reg,Lvl
3,30,RM,60.0,6911,Pave,Grvl,Reg,Lvl
4,50,RM,69.5,7758,Pave,Grvl,Reg,Lvl
5,20,RL,69.5,12772,Pave,Grvl,IR1,Lvl
6,20,RL,78.0,10335,Pave,Grvl,IR1,Lvl
7,20,RL,69.0,11302,Pave,Grvl,IR1,Lvl
8,60,RL,81.0,10944,Pave,Grvl,IR1,Lvl
9,50,RM,52.0,6240,Pave,Grvl,Reg,Lvl
10,60,FV,69.5,7500,Pave,Grvl,Reg,Lvl


### Encoding
We encode the data with the per-feature top-category lists that were saved during training, so that the test data is encoded the same way as the training data.

In [25]:
# Restore the feature => categories mapping from the serialized artifact
loaded_top_categories = open(TOP_CATEGORIES) do io
    deserialize(io)
end

"""
    one_hot_top_categories!(df, top_categories)

Replace, in place, each feature listed in `top_categories` with indicator
columns and drop the original column from `df`.

`top_categories` is iterated as `(feature, categories)` pairs:

- when a feature has exactly two categories, a single column named
  `<feature>_binary` is added, holding the comparison of the feature with the
  first listed category;
- otherwise one column named `<feature>_<category>` is added per listed
  category.

Returns `nothing`.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories
        source = df[!, feature]
        if length(top_cats) == 2
            # Two categories: one indicator against the first one is enough
            df[!, string(feature, "_binary")] = source .== top_cats[1]
        else
            for level in top_cats
                df[!, string(feature, "_", level)] = source .== level
            end
        end
        # The raw column is no longer needed once its indicators exist
        select!(df, Not(Symbol(feature)))
    end
    return nothing
end

# Encode `df` in place: every feature listed in `loaded_top_categories` is
# replaced by its indicator columns
one_hot_top_categories!(df, loaded_top_categories)

### Making predictions
Using the model saved during training.

In [27]:
# Restore the fitted predictor from its serialized file
model = open(PREDICTOR_FILE_PATH) do io
    deserialize(io)
end

# Score the encoded test rows with the restored model
predictions = GLM.predict(model, df)

292-element Vector{Union{Missing, Float64}}:
 -26934.43077931663
 137729.82041489254
 105841.82598461382
 192870.39910569758
  23759.281190360583
 169831.6483953199
 308696.0984100826
 263527.73482309363
 119748.25629049577
 214912.81537891482
 125803.20557638997
 121904.61717094912
 122185.21136286343
      ⋮
 159888.79969365234
 146138.75469297275
 126088.8159643489
  10821.005085433317
  28237.596302486716
 197633.43422867116
 135587.06054263358
 138696.63932536042
 167072.29511288772
  59889.020598648305
 191091.3627131496
 148348.7847185513

### Creating predictions DataFrame.

In [28]:
id_column_name = id_feature
prediction_column_name = "prediction"

# Build the output table: the saved ids next to the model predictions
df = DataFrame()
df[!, Symbol(id_column_name)] = ids
df[!, Symbol(prediction_column_name)] = predictions

# Writing fails when the target folder is absent, so create it (and any
# missing parents) first; `mkpath` does nothing if it already exists
mkpath(dirname(PREDICTIONS_FILE))

# Save the DataFrame to a CSV file
CSV.write(PREDICTIONS_FILE, df)

"/Users/moo/Desktop/Upwork/rt-ML/Julia/Juila-Linear-Regression-Template/model_inputs_outputs/outputs/predictions/predictions.csv"