In [1]:
# If you are missing any necessary packages, you can download them here, juts uncomment the "using Pkg" and the desired package.
#using Pkg
#Pkg.add("PyCall")
#Pkg.add("CSV")
#Pkg.add("DataFrames")
#Pkg.add("ScikitLearn")
#Pkg.add("MLJ")

In [2]:
using CSV, DataFrames, PyCall
using ScikitLearn: fit!, predict, @sk_import

In [3]:
# Importing all necessary functions from ScikitLearn
@sk_import tree : DecisionTreeClassifier
@sk_import naive_bayes : (GaussianNB, CategoricalNB, MultinomialNB)
@sk_import metrics : (accuracy_score, max_error)
@sk_import svm : (SVC, LinearSVC)
@sk_import preprocessing : (OneHotEncoder, LabelEncoder)
@sk_import neural_network: MLPClassifier
@sk_import model_selection : train_test_split

└ @ ScikitLearn.Skcore C:\Users\caput\.julia\packages\ScikitLearn\NJwUf\src\Skcore.jl:179


PyObject <function train_test_split at 0x000000004A2F84C0>

In [4]:
# Creating a pyimport object that would work as the "joblib" library
model_size = pyimport("joblib")

PyObject <module 'joblib' from 'C:\\Users\\caput\\.julia\\conda\\3\\lib\\site-packages\\joblib\\__init__.py'>

In [5]:
# Function that would Model each necessary method
function Model(model, train_x, train_y, test_x, test_y, model_type)
    println("------------------------------")
    print("Training Time:")
    
    # Fit the training data to the model
    @time fit!(model, train_x, train_y)
    
    print("Prediction Time:")
    # Predict the testing data with the model
    @time prediction = predict(model, test_x)
    
    # Find the accuracy of the model
    # If it is a classification model, use the classification accuracy_score function
    if model_type == "classification"
        println("Model Accuracy: ", accuracy_score(test_y,prediction))
    # If it is a regression model, use the regression max_error function
    elseif model_type == "regression"
        println("Model Accuracy: ", max_error(test_y, prediction))
    # Just some error handling
    else 
        println("Not a valid model_type")
    end
    
    # Find the size of the model by creating a file, checking the size and then erasing it.
    model_size.dump(model, "model")
    println("Model Size: ", (stat("model").size)/1000, " kB")
    rm("model")
    
    println("------------------------------")
end

Model (generic function with 1 method)

In [6]:
# Function that would model the full Kddcup Data Set
function KddFullModel()
    
    # Load the data to a dataframe
    kddcup = CSV.File(read("Datasets/Kddcup/kddcup.data.corrected"); header=false) |> DataFrame
    
    # Separate the data into features and labels while converting them to arrays
    kddcup_x = convert(Array, kddcup[!,1:41])
    kddcup_y = convert(Array, kddcup[!,42])
    
    # Encoding the columns that are not 100% numbers
    encKd = LabelEncoder()
    kddcup_x[:,2] .= encKd.fit_transform(kddcup_x[:,2])
    kddcup_x[:,3] .= encKd.fit_transform(kddcup_x[:,3])
    kddcup_x[:,4] .= encKd.fit_transform(kddcup_x[:,4])
    
    # Modifying the labels for better modeling
    for (cnt, value) in enumerate(kddcup_y)
        # If it is normal make it a 1
        if value == "normal."
            kddcup_y[cnt, :] .= "1"
        # If it is something else make it a 2
        else
            kddcup_y[cnt,:] .= "2"      
        end
    end
    
    # Partitioning the data between training and test, using 70% for training
    train, test = partition(1:length(kddcup_y), 0.70, shuffle = true)
    kd_x_train = kddcup_x[train, :]
    kd_y_train = kddcup_y[train, :]
    kd_x_test = kddcup_x[test, :]
    kd_y_test = kddcup_y[test, :]
    
    # Modeling Naive Bayes Classifier
    println("Naive Bayes Classifier Model")
    Model(GaussianNB(), kd_x_train, vec(kd_y_train), kd_x_test, vec(kd_y_test), "classification")    
    
    # Modeling Decision Tree Classifier
    println("Decision Tree Classifier Model")
    Model(DecisionTreeClassifier(), kd_x_train, kd_y_train, kd_x_test, kd_y_test, "classification")

end

KddFullModel (generic function with 1 method)

In [8]:
# Load Car data to a dataframe
car = CSV.File(read("Datasets/Car/car.data"); header=["buying","maint","doors","persons","lug_boot","safety","class"]) |> DataFrame

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class
Unnamed: 0_level_1,String,String,String,String,String,String,String
1,vhigh,vhigh,2,2,small,low,unacc
2,vhigh,vhigh,2,2,small,med,unacc
3,vhigh,vhigh,2,2,small,high,unacc
4,vhigh,vhigh,2,2,med,low,unacc
5,vhigh,vhigh,2,2,med,med,unacc
6,vhigh,vhigh,2,2,med,high,unacc
7,vhigh,vhigh,2,2,big,low,unacc
8,vhigh,vhigh,2,2,big,med,unacc
9,vhigh,vhigh,2,2,big,high,unacc
10,vhigh,vhigh,2,4,small,low,unacc


In [9]:
# Separate the data into features and labels while converting them to arrays
car_x = convert(Array,car[:,1:6])
car_y = convert(Array,car[:,7])

4-element Array{Array,1}:
 [1.0 0.0 … 1.0 0.0; 0.0 0.0 … 0.0 1.0; … ; 1.0 0.0 … 1.0 0.0; 0.0 1.0 … 0.0 0.0]
 [0.0 0.0 … 1.0 0.0; 0.0 0.0 … 0.0 1.0; … ; 0.0 1.0 … 0.0 1.0; 0.0 0.0 … 1.0 0.0]
 ["unacc", "acc", "unacc", "unacc", "unacc", "acc", "acc", "unacc", "unacc", "acc"  …  "acc", "unacc", "unacc", "good", "unacc", "unacc", "acc", "unacc", "unacc", "unacc"]
 ["unacc", "acc", "unacc", "unacc", "acc", "unacc", "unacc", "unacc", "unacc", "unacc"  …  "good", "unacc", "unacc", "unacc", "acc", "unacc", "unacc", "unacc", "acc", "unacc"]

In [None]:
# Encode the car data usinf OneHotEncoder because the data is relatively small
encCar = OneHotEncoder()
car_x_encoded = encCar.fit_transform(car_x).toarray()

In [None]:
# Partitioning the data between training and test, using 70% for training using the seed 100
car_x_train, car_x_test, car_y_train, car_y_test = train_test_split(car_x_encoded,car_y,test_size=0.3, random_state=100)

In [10]:
# Modeling Naive Bayes Classifier
println("Naive Bayes Classifier Model")
Model(CategoricalNB(), car_x_train, car_y_train, car_x_test, car_y_test, "classification")

Naive Bayes Classifier Model
------------------------------
Training Time:  0.036964 seconds (33.10 k allocations: 1.714 MiB)
Prediction Time:  0.134726 seconds (226.36 k allocations: 11.516 MiB)
Model Accuracy: 0.8574181117533719
Model Size: 5.341 kB
------------------------------


In [11]:
# Modeling Decision Tree Classifier
println("Decision Tree Classifier Model")
Model(DecisionTreeClassifier(), car_x_train, car_y_train, car_x_test, car_y_test, "classification")

Decision Tree Classifier Model
------------------------------
Training Time:  0.003869 seconds (1.23 k allocations: 19.734 KiB)
Prediction Time:  0.001298 seconds (3.16 k allocations: 98.812 KiB)
Model Accuracy: 0.9691714836223507
Model Size: 17.36 kB
------------------------------


In [12]:
# Modeling Suport Vector Machine
println("Suport Vector Machines Model")
Model(SVC(), car_x_train, car_y_train, car_x_test, car_y_test, "classification")

Suport Vector Machines Model
------------------------------
Training Time:  0.064502 seconds (1.23 k allocations: 19.734 KiB)
Prediction Time:  0.040510 seconds (3.16 k allocations: 98.812 KiB)
Model Accuracy: 0.9633911368015414
Model Size: 130.856 kB
------------------------------


In [13]:
# Modeling Neural Networks
println("Neural Networks Machine Model")
Model(MLPClassifier(max_iter = 1000), car_x_train, car_y_train, car_x_test, car_y_test, "classification")

Neural Networks Machine Model
------------------------------
Training Time:  3.093932 seconds (1.23 k allocations: 19.734 KiB)
Prediction Time:  0.001774 seconds (3.16 k allocations: 98.812 KiB)
Model Accuracy: 0.9942196531791907
Model Size: 96.64 kB
------------------------------


In [14]:
# Load Abalone data to a dataframe
abalone = CSV.File(read("Datasets/Abalone/abalone.data"); header=["sex","length","diameter","height","whole_weight","shucked_weight","viscera_weight","shell_weight","rings"]) |> DataFrame

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight
Unnamed: 0_level_1,String,Float64,Float64,Float64,Float64,Float64,Float64
1,M,0.455,0.365,0.095,0.514,0.2245,0.101
2,M,0.35,0.265,0.09,0.2255,0.0995,0.0485
3,F,0.53,0.42,0.135,0.677,0.2565,0.1415
4,M,0.44,0.365,0.125,0.516,0.2155,0.114
5,I,0.33,0.255,0.08,0.205,0.0895,0.0395
6,I,0.425,0.3,0.095,0.3515,0.141,0.0775
7,F,0.53,0.415,0.15,0.7775,0.237,0.1415
8,F,0.545,0.425,0.125,0.768,0.294,0.1495
9,M,0.475,0.37,0.125,0.5095,0.2165,0.1125
10,F,0.55,0.44,0.15,0.8945,0.3145,0.151


In [15]:
# Encoding the columns that are not 100% numbers and convert the dataframes to arrays
encAb = LabelEncoder()
abalone_x = convert(Array,abalone[:,1:8])
abalone_x[:,1] .= encAb.fit_transform(abalone_x[:,1])
abalone_y = convert(Array,abalone[:,9])

4177-element Array{Int64,1}:
 15
  7
  9
 10
  7
  8
 20
 16
  9
 19
 14
 10
 11
  ⋮
  7
 10
  9
  8
 10
 10
  8
 11
 10
  9
 10
 12

In [16]:
# Adding 1.5 to all the ring labels in order to convert them to years
abalone_y = abalone_y .+ 1.5

4177-element Array{Float64,1}:
 16.5
  8.5
 10.5
 11.5
  8.5
  9.5
 21.5
 17.5
 10.5
 20.5
 15.5
 11.5
 12.5
  ⋮
  8.5
 11.5
 10.5
  9.5
 11.5
 11.5
  9.5
 12.5
 11.5
 10.5
 11.5
 13.5

In [17]:
# Modifying the labels for better modeling
for (cnt, value) in enumerate(abalone_y)
    # if the age is between 1 and 9 make it a 1
    if value > 1 && value < 9
        abalone_y[cnt, :] .= 1
    # if the age is between 9 and 10 make it a 2
    elseif value < 10
        abalone_y[cnt,:] .= 2
    # if the age is greater then 10 it a 3
    else
        abalone_y[cnt,:] .= 3      
    end
end

In [18]:
# Partitioning the data between training and test, using 70% for training using the seed 100
abalone_x_train, abalone_x_test, abalone_y_train, abalone_y_test = train_test_split(abalone_x,abalone_y,test_size=0.3, random_state=100)

4-element Array{Array,1}:
 Real[1 0.53 … 0.108 0.195; 2 0.56 … 0.1795 0.285; … ; 0 0.615 … 0.2355 0.345; 1 0.5 … 0.156 0.18]
 Real[1 0.615 … 0.195 0.345; 0 0.695 … 0.3005 0.44; … ; 2 0.555 … 0.1525 0.26; 2 0.395 … 0.066 0.09]
 [3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 2.0, 3.0, 1.0, 3.0  …  3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 3.0, 2.0, 3.0, 3.0]
 [3.0, 3.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 2.0, 1.0  …  1.0, 3.0, 3.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0]

In [19]:
# Modeling Naive Bayes Classifier
println("Naive Bayes Classifier Model")
Model(GaussianNB(), abalone_x_train, abalone_y_train, abalone_x_test, abalone_y_test, "classification")

Naive Bayes Classifier Model
------------------------------
Training Time:  0.092860 seconds (210.07 k allocations: 8.957 MiB)
Prediction Time:  0.025894 seconds (49.59 k allocations: 1.791 MiB)
Model Accuracy: 0.7081339712918661
Model Size: 1.067 kB
------------------------------


In [20]:
# Modeling Decision Tree Classifier
println("Decision Tree Classifier Model")
Model(DecisionTreeClassifier(), abalone_x_train, abalone_y_train, abalone_x_test, abalone_y_test, "classification")

Decision Tree Classifier Model
------------------------------
Training Time:  0.029358 seconds (52.63 k allocations: 822.969 KiB)
Prediction Time:  0.006642 seconds (22.61 k allocations: 364.047 KiB)
Model Accuracy: 0.7256778309409888
Model Size: 87.335 kB
------------------------------


In [21]:
# Modeling Suport Vector Machine
println("Suport Vector Machines Model")
Model(LinearSVC(), abalone_x_train, abalone_y_train, abalone_x_test, abalone_y_test, "classification")

Suport Vector Machines Model
------------------------------
Training Time:  0.077978 seconds (52.63 k allocations: 822.969 KiB)
Prediction Time:  0.008913 seconds (22.61 k allocations: 364.047 KiB)
Model Accuracy: 0.7910685805422647
Model Size: 0.927 kB
------------------------------


In [22]:
# Modeling Neural Networks
println("Neural Networks Model")
Model(MLPClassifier(max_iter = 1000), abalone_x_train, abalone_y_train, abalone_x_test, abalone_y_test, "classification")

Neural Networks Model
------------------------------
Training Time:  2.405724 seconds (52.63 k allocations: 822.969 KiB)
Prediction Time:  0.009349 seconds (22.61 k allocations: 364.047 KiB)
Model Accuracy: 0.8022328548644339
Model Size: 46.842 kB
------------------------------


In [23]:
# Load the madelon train, test and valid data into dataframes
madelon_test = CSV.File(read("Datasets/Madelon/madelon_test.data"); header=false) |> DataFrame
madelon_train_x = CSV.File(read("Datasets/Madelon/madelon_train.data"); header=false) |> DataFrame
madelon_train_y = CSV.File(read("Datasets/Madelon/madelon_train.labels"); header=false) |> DataFrame
madelon_valid_x = CSV.File(read("Datasets/Madelon/madelon_valid.data"); header=[false]) |> DataFrame
madelon_valid_y = CSV.File(read("Datasets/Madelon/madelon_valid.labels"); header=[false]) |> DataFrame

Unnamed: 0_level_0,-1
Unnamed: 0_level_1,Int64
1,-1
2,-1
3,1
4,-1
5,1
6,-1
7,-1
8,-1
9,1
10,1


In [24]:
# Convert the dataframes to Array
# Erase 1 columns from the features because its a placeholder
madelon_test = convert(Array,madelon_test[:,1:size(madelon_test)[2]-1])
madelon_train_x = convert(Array,madelon_train_x[:,1:size(madelon_train_x)[2]-1])
madelon_valid_x = convert(Array,madelon_valid_x[:,1:size(madelon_valid_x)[2]-1])
madelon_valid_y = convert(Array,madelon_valid_y)
madelon_train_y = convert(Array,madelon_train_y)

2000×1 Array{Int64,2}:
 -1
 -1
 -1
  1
  1
  1
  1
 -1
  1
 -1
 -1
 -1
  1
  ⋮
 -1
  1
 -1
 -1
 -1
 -1
  1
  1
 -1
 -1
  1
  1

In [25]:
# Modeling Naive Bayes Classifier
println("Naive Bayes Classifier Model")
Model(GaussianNB(), madelon_train_x, vec(madelon_train_y), madelon_valid_x, vec(madelon_valid_y), "classification")

Naive Bayes Classifier Model
------------------------------
Training Time:  0.169461 seconds (330.01 k allocations: 16.220 MiB)
Prediction Time:  0.024442 seconds (25.36 k allocations: 1.372 MiB)
Model Accuracy: 0.5926544240400667
Model Size: 16.696 kB
------------------------------


In [26]:
# Modeling Decision Tree Classifier
println("Decision Tree Classifier Model")
Model(DecisionTreeClassifier(), madelon_train_x, vec(madelon_train_y), madelon_valid_x, vec(madelon_valid_y), "classification")

Decision Tree Classifier Model
------------------------------
Training Time:  0.556864 seconds (22 allocations: 1.125 KiB)
Prediction Time:  0.003155 seconds (55 allocations: 7.656 KiB)
Model Accuracy: 0.7495826377295493
Model Size: 22.987 kB
------------------------------


In [27]:
# Modeling Suport Vector Machine
println("Suport Vector Machines Model")
Model(SVC(), madelon_train_x, vec(madelon_train_y), madelon_valid_x, vec(madelon_valid_y), "classification")

Suport Vector Machines Model
------------------------------
Training Time:  1.371578 seconds (22 allocations: 1.125 KiB)
Prediction Time:  0.490231 seconds (55 allocations: 7.656 KiB)
Model Accuracy: 0.6861435726210351
Model Size: 7301.657 kB
------------------------------


In [28]:
# Modeling Neural Networks
println("Neural Networks Model")
Model(MLPClassifier(max_iter = 1000), madelon_train_x, vec(madelon_train_y), madelon_valid_x, vec(madelon_valid_y), "classification")

Neural Networks Model
------------------------------
Training Time:  1.067421 seconds (22 allocations: 1.125 KiB)
Prediction Time:  0.004232 seconds (55 allocations: 7.656 KiB)
Model Accuracy: 0.5041736227045075
Model Size: 1611.807 kB
------------------------------


In [29]:
# Load the data to a dataframe
kddcup_10 = CSV.File(read("Datasets/Kddcup/kddcup.data_10_percent_corrected"); header=false) |> DataFrame

Unnamed: 0_level_0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9
Unnamed: 0_level_1,Int64,String,String,String,Int64,Int64,Int64,Int64,Int64
1,0,tcp,http,SF,181,5450,0,0,0
2,0,tcp,http,SF,239,486,0,0,0
3,0,tcp,http,SF,235,1337,0,0,0
4,0,tcp,http,SF,219,1337,0,0,0
5,0,tcp,http,SF,217,2032,0,0,0
6,0,tcp,http,SF,217,2032,0,0,0
7,0,tcp,http,SF,212,1940,0,0,0
8,0,tcp,http,SF,159,4087,0,0,0
9,0,tcp,http,SF,210,151,0,0,0
10,0,tcp,http,SF,212,786,0,0,0


In [30]:
# Separate the data into features and labels while converting them to arrays
kddcup_10_x = convert(Array, kddcup_10[!,1:41])
kddcup_10_y = convert(Array, kddcup_10[!,42])

494021-element Array{String,1}:
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 ⋮
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."
 "normal."

In [31]:
# Encoding the columns that are not 100% numbers
encKd_10 = LabelEncoder()
kddcup_10_x[:,2] .= encKd_10.fit_transform(kddcup_10_x[:,2])
kddcup_10_x[:,3] .= encKd_10.fit_transform(kddcup_10_x[:,3])
kddcup_10_x[:,4] .= encKd_10.fit_transform(kddcup_10_x[:,4])

494021-element view(::Array{Any,2}, :, 4) with eltype Any:
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 ⋮
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9
 9

In [32]:
# Modifying the labels for better modeling
for (cnt, value) in enumerate(kddcup_10_y)
    # If it is "normal." make it a "1"
    if value == "normal."
        kddcup_10_y[cnt, :] .= "1"
    # If it is something else make it a "2"
    else
        kddcup_10_y[cnt,:] .= "2"      
    end
end

In [33]:
# Partitioning the data between training and test, using 70% for training using the seed 100
kdd10_x_train, kdd10_x_test, kdd10_y_train, kdd10_y_test = train_test_split(kddcup_10_x,kddcup_10_y,test_size=0.3, random_state=100)

4-element Array{Array,1}:
 Real[0 0 … 0.0 0.0; 0 0 … 0.0 0.0; … ; 0 1 … 0.0 0.0; 0 0 … 0.0 0.0]
 Real[0 0 … 0.0 0.0; 0 0 … 0.0 0.0; … ; 0 0 … 0.0 0.0; 0 1 … 0.0 0.0]
 ["2", "2", "1", "2", "2", "2", "2", "2", "1", "1"  …  "2", "1", "2", "2", "2", "2", "2", "2", "2", "2"]
 ["2", "2", "1", "1", "2", "2", "2", "2", "2", "2"  …  "2", "2", "2", "1", "2", "2", "2", "2", "2", "1"]

In [34]:
# Modeling Naive Bayes Classifier
println("Naive Bayes Classifier Model")
Model(GaussianNB(), kdd10_x_train, vec(kdd10_y_train), kdd10_x_test, vec(kdd10_y_test), "classification")

Naive Bayes Classifier Model
------------------------------
Training Time: 13.127009 seconds (29.42 M allocations: 449.967 MiB)
Prediction Time:  6.002948 seconds (13.34 M allocations: 215.579 MiB)
Model Accuracy: 0.9217513342824563
Model Size: 1.991 kB
------------------------------


In [35]:
# Modeling Decision Tree Classifier
println("Decision Tree Classifier Model")
Model(DecisionTreeClassifier(), kdd10_x_train, kdd10_y_train, kdd10_x_test, kdd10_y_test, "classification")

Decision Tree Classifier Model
------------------------------
Training Time: 20.498642 seconds (29.39 M allocations: 448.520 MiB, 13.28% gc time)
Prediction Time:  5.277085 seconds (13.34 M allocations: 215.579 MiB)
Model Accuracy: 0.9997031179363998
Model Size: 21.419 kB
------------------------------


In [36]:
# Modeling Suport Vector Machines
println("Suport Vector Machines Model")
Model(LinearSVC(dual=false), kdd10_x_train, kdd10_y_train, kdd10_x_test, kdd10_y_test, "classification")

Suport Vector Machines Model
------------------------------
Training Time: 17.945133 seconds (29.39 M allocations: 448.520 MiB, 9.17% gc time)
Prediction Time:  8.751269 seconds (13.34 M allocations: 215.579 MiB, 17.65% gc time)
Model Accuracy: 0.9844001970217331
Model Size: 1.057 kB
------------------------------


In [37]:
# Modeling Neural Networks
println("Neural Networks Model")
Model(MLPClassifier(max_iter = 1000), kdd10_x_train, kdd10_y_train, kdd10_x_test, kdd10_y_test, "classification")

Neural Networks Model
------------------------------
Training Time: 61.681551 seconds (29.39 M allocations: 448.520 MiB)
Prediction Time:  9.716917 seconds (13.34 M allocations: 215.579 MiB, 13.00% gc time)
Model Accuracy: 0.9964239205975426
Model Size: 142.868 kB
------------------------------


In [39]:
# Kddcup Full Data set modeling, takes a lot of time to run, only run if you have extra time 
# Neccesary Package: MLJ
#using MLJ: partition
#KddFullModel()

Naive Bayes Classifier Model
------------------------------
Training Time:175.422732 seconds (291.49 M allocations: 4.345 GiB, 12.28% gc time)
Prediction Time: 71.944270 seconds (132.28 M allocations: 2.076 GiB, 6.41% gc time)
Model Accuracy: 0.9423012407376785
Model Size: 1.991 kB
------------------------------
Decision Tree Classifier Model
------------------------------
Training Time:232.248612 seconds (294.91 M allocations: 4.396 GiB, 7.11% gc time)
Prediction Time: 67.061262 seconds (132.26 M allocations: 2.075 GiB, 6.52% gc time)
Model Accuracy: 0.9999407973575207
Model Size: 45.899 kB
------------------------------
