In [40]:
using Pkg
using DataFrames
using JSON
using GLM 
using MLJ 
using MLJBase
using CSV
using Serialization
using MLJScientificTypes
using CategoricalArrays
using JLD2
using StatsBase
using StatsModels



In [41]:
# Directory layout. Everything hangs off the parent of the current working directory.
ROOT_DIR = dirname(pwd())
MODEL_INPUTS_OUTPUTS = joinpath(ROOT_DIR, "model_inputs_outputs")

# Inputs: the schema definition plus the training and testing data.
INPUT_DIR = joinpath(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = joinpath(INPUT_DIR, "schema")
DATA_DIR = joinpath(INPUT_DIR, "data")
TRAIN_DIR = joinpath(DATA_DIR, "training")
TEST_DIR = joinpath(DATA_DIR, "testing")

# Outputs: artifact files written by the cells below and the fitted predictor.
MODEL_PATH = joinpath(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = joinpath(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = joinpath(MODEL_ARTIFACTS_PATH, "ohe.jld2")
PREDICTOR_DIR_PATH = joinpath(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = joinpath(PREDICTOR_DIR_PATH, "predictor.jld2")
IMPUTATION_FILE = joinpath(MODEL_ARTIFACTS_PATH, "imputation.json")
TOP_CATEGORIES = joinpath(MODEL_ARTIFACTS_PATH, "top_categories.json")

# `mkpath` creates any missing parent directories (e.g. MODEL_PATH) and does nothing
# when the directory already exists. Plain `mkdir` throws if the parent is absent.
foreach(mkpath, (MODEL_ARTIFACTS_PATH, PREDICTOR_DIR_PATH))

In [42]:
# Locate the schema: the first `.json` file that `readdir` lists in INPUT_SCHEMA_DIR.
# The suffix includes the dot, matching how the training `.csv` file is looked up.
schema_files = filter(x -> endswith(x, ".json"), readdir(INPUT_SCHEMA_DIR))
if isempty(schema_files)
    # Fail with a message naming the directory instead of an opaque error from `first`.
    error("No .json schema file found in $(INPUT_SCHEMA_DIR)")
end
file_name = first(schema_files)
schema_path = joinpath(INPUT_SCHEMA_DIR, file_name)
schema = JSON.parsefile(schema_path)
features = schema["features"]

# Feature names grouped by role. A name appears in exactly one of
# `numeric_features` / `categorical_features`, and additionally in
# `nullable_features` when the schema flags it as nullable.
numeric_features = String[]
categorical_features = String[]
nullable_features = String[]

for f in features
    # Any dataType other than "CATEGORICAL" is treated as numeric.
    if f["dataType"] == "CATEGORICAL"
        push!(categorical_features, f["name"])
    else
        push!(numeric_features, f["name"])
    end
    if f["nullable"]
        push!(nullable_features, f["name"])
    end
end

# Names of the identifier and target columns, as declared by the schema.
id_feature = schema["id"]["name"]
target_feature = schema["target"]["name"]


"target"

In [43]:
# Pick the training file: the first `.csv` entry that `readdir` lists in TRAIN_DIR.
file_name = first([entry for entry in readdir(TRAIN_DIR) if endswith(entry, ".csv")])
file_path = joinpath(TRAIN_DIR, file_name)

# Parse the file and materialise it as a DataFrame.
df = DataFrame(CSV.File(file_path))

# Preview the leading five rows.
first(df, 5)


Unnamed: 0_level_0,id,number,color,target
Unnamed: 0_level_1,String7,Float64?,String7?,Float64
1,YDDKTO,missing,missing,84.7411
2,FPLK2Z,3.3782,Red,26.2021
3,P2YCAP,2.248,Blue,112.355
4,IMT8XP,1.9806,Blue,109.826
5,5D9Q5F,0.5048,Red,1.0144


In [44]:
# Fill the gaps in every nullable column and remember which fill value was used.
imputation_values = Dict{String, Any}()

for column in nullable_features
    # Numeric columns are filled with their median, all other columns with their
    # mode; either statistic is computed over the non-missing entries only.
    statistic = column in numeric_features ? median : mode
    fill_value = statistic(skipmissing(df[!, column]))
    df[!, column] = coalesce.(df[!, column], fill_value)
    imputation_values[column] = fill_value
end

# Write the column => fill value mapping to IMPUTATION_FILE as JSON.
open(io -> JSON.print(io, imputation_values), IMPUTATION_FILE, "w")

In [45]:
# Keep the identifier and target columns aside before removing them from `df`.
ids = df[!, id_feature]
target = df[!, target_feature]

# Strip both columns from `df` in place.
select!(df, Not(Symbol.([id_feature, target_feature])))

Unnamed: 0_level_0,number,color
Unnamed: 0_level_1,Float64,String7
1,5.05815,Blue
2,3.3782,Red
3,2.248,Blue
4,1.9806,Blue
5,0.5048,Red
6,8.1929,Red
7,5.05815,Blue
8,1.1673,Red
9,1.5919,Red
10,0.1463,Green


In [46]:
"""
    get_top_categories(df, features, n=3)

Return a `Dict` mapping each name in `features` to a vector holding that column's most
frequent values in `df`, ordered from most to least frequent.

At most `n` values are kept per feature; a column with fewer than `n` distinct values
contributes all of them instead of raising a `BoundsError`. Values with equal counts
are ordered by their string representation, so the selection does not depend on `Dict`
iteration order.

# Arguments
- `df`: table supporting `df[!, feature]` column access.
- `features`: iterable of column names.
- `n`: maximum number of categories to keep per feature (default `3`).
"""
function get_top_categories(df, features, n=3)
    top_cats = Dict()
    for feature in features
        category_counts = countmap(df[!, feature])
        # Highest count first; ties are resolved alphabetically for reproducibility.
        ranked = sort!(collect(category_counts); by=p -> (-last(p), string(first(p))))
        # Clamp so that columns with fewer than `n` distinct values are accepted.
        keep = min(n, length(ranked))
        top_cats[feature] = [first(p) for p in ranked[1:keep]]
    end
    return top_cats
end


# For every categorical feature, select its three most frequent categories.
top_categories = get_top_categories(df, categorical_features, 3)

"""
    one_hot_top_categories!(df, top_categories)

Add indicator columns to `df` in place.

For every `feature => categories` entry of `top_categories` and every category listed
for it, store a column named `<feature>_<category>` in `df` holding the element-wise
`==` comparison of `df[!, feature]` with that category. The source column is kept.
Returns `nothing`.
"""
function one_hot_top_categories!(df, top_categories)
    for (feature, top_cats) in top_categories, cat in top_cats
        df[!, string(feature, "_", cat)] = df[!, feature] .== cat
    end
    return nothing
end

# Add the indicator columns for the selected categories to `df`.
one_hot_top_categories!(df, top_categories)

# Serialise the selected categories and write them to the TOP_CATEGORIES file.
json_content = JSON.json(top_categories)
write(TOP_CATEGORIES, json_content)


32

In [47]:
# Re-attach the response next to the predictors, as a column named `target`.
df = hcat(df, DataFrame(:target => target))

# Every column except `target` is used as a predictor.
# NOTE(review): `df` still holds the original categorical columns next to their
# indicator columns, so both end up in the formula; confirm that is intended.
all_columns = names(df)
predictor_columns = [col for col in all_columns if col != "target"]

# Assemble the formula `target ~ <sum of all predictor terms>`.
predictor_terms = map(col -> Term(Symbol(col)), predictor_columns)
response_term = Term(:target)
formula_obj = FormulaTerm(response_term, sum(predictor_terms))

# Fit a linear regression on all predictor columns of `df`.
model = lm(formula_obj, df)

# BEGIN
    # Optional hook for extra work on the fitted model,
    # e.g. setting hyperparameters or running cross-validation.
    # model = ...
# END

# Store the fitted model under the key "model" so it can be loaded for predictions.
save(PREDICTOR_FILE_PATH, "model", model)
