In [1]:
# Install every package this notebook loads, resolved in a single Pkg operation
# (one resolver pass instead of one per package).
import Pkg
Pkg.add([
    "DataFrames",
    "CSV",
    "Flux",
    "MLJ",
    "MLJModels",
    # Loaded by later cells (`using MLJBase`, `using ... ScikitLearn`, `@load ... pkg=`)
    # but absent from the original install list.
    "MLJBase",
    "MLJLIBSVMInterface",
    "MLJScikitLearnInterface",
    "ScikitLearn",
])

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m InlineStrings ─────────────── v1.4.2
[32m[1m   Installed[22m[39m Crayons ───────────────────── v4.1.1
[32m[1m   Installed[22m[39m SentinelArrays ────────────── v1.4.5
[32m[1m   Installed[22m[39m PooledArrays ──────────────── v1.4.3
[32m[1m   Installed[22m[39m TableTraits ───────────────── v1.0.1
[32m[1m   Installed[22m[39m DataAPI ───────────────────── v1.16.0
[32m[1m   Installed[22m[39m Tables ────────────────────── v1.12.0
[32m[1m   Installed[22m[39m PrettyTables ──────────────── v2.3.2
[32m[1m   Installed[22m[39m IteratorInterfaceExtensions ─ v1.0.0
[32m[1m   Installed[22m[39m DataValueInterfaces ───────── v1.0.0
[32m[1m   Installed[22m[39m InvertedIndices ───────────── v1.3.0
[32m[1m   Installed[22m[39m OrderedCollections ────────── v1.6.3
[32m[1m   Installed[22m[39m DataFra

In [2]:
#getting modules
using DataFrames
using CSV
using Flux
using Statistics, LinearAlgebra

In [3]:
# Load the training set, show per-column summary statistics, then report its shape.
train_path = "./data/train.csv"
train_df = CSV.read(train_path, DataFrame)
train_df |> describe |> display

size(train_df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,PassengerId,446.0,1,446.0,891,0,Int64
2,Survived,0.383838,0,0.0,1,0,Int64
3,Pclass,2.30864,1,3.0,3,0,Int64
4,Name,,"Abbing, Mr. Anthony",,"van Melkebeke, Mr. Philemon",0,String
5,Sex,,female,,male,0,String7
6,Age,29.6991,0.42,28.0,80.0,177,"Union{Missing, Float64}"
7,SibSp,0.523008,0,0.0,8,0,Int64
8,Parch,0.381594,0,0.0,6,0,Int64
9,Ticket,,110152,,WE/P 5735,0,String31
10,Fare,32.2042,0.0,14.4542,512.329,0,Float64


(891, 12)

Tratamento de dados

In [4]:
# Missing-value handling:
#   - Embarked has only a few missing rows -> drop those rows.
#   - Age has a moderate number of missing rows -> impute with the column median.
#   - Cabin is mostly missing -> drop the column.
train_df = dropmissing(train_df, :Embarked)

# Compute the median from the data instead of hard-coding it, so the imputation
# stays correct if the input file changes. `coalesce.` also narrows the column's
# element type from Union{Missing, Float64} to Float64.
age_median = median(skipmissing(train_df.Age))
train_df.Age = coalesce.(train_df.Age, age_median)

# PassengerId and Name are only identifiers and do not help classification,
# so they are removed together with Cabin.
train_df = select(train_df, Not([:Cabin, :PassengerId, :Name]))

describe(train_df)


Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,Survived,0.382452,0,0.0,1,0,Int64
2,Pclass,2.31159,1,3.0,3,0,Int64
3,Sex,,female,,male,0,String7
4,Age,29.3152,0.42,28.0,80.0,0,Float64
5,SibSp,0.524184,0,0.0,8,0,Int64
6,Parch,0.382452,0,0.0,6,0,Int64
7,Ticket,,110152,,WE/P 5735,0,String31
8,Fare,32.0967,0.0,14.4542,512.329,0,Float64
9,Embarked,,C,,S,0,String1


In [5]:
# Inspect the distinct values (and their frequencies) of each non-numeric column.
combine(groupby(train_df, :Sex), nrow => :count)

Row,Sex,count
Unnamed: 0_level_1,String7,Int64
1,male,577
2,female,312


In [6]:
# Row count per distinct Ticket value.
combine(groupby(train_df, :Ticket), nrow => :count)

Row,Ticket,count
Unnamed: 0_level_1,String31,Int64
1,A/5 21171,1
2,PC 17599,1
3,STON/O2. 3101282,1
4,113803,2
5,373450,1
6,330877,1
7,17463,1
8,349909,4
9,347742,3
10,237736,2


In [7]:
# Row count per distinct Embarked value.
combine(groupby(train_df, :Embarked), nrow => :count)

Row,Embarked,count
Unnamed: 0_level_1,String1,Int64
1,S,644
2,C,168
3,Q,77


In [8]:
# Sex and Embarked have only a handful of categories, so they are mapped to integers.
sex_codes = ("female" => 1, "male" => 2)
embarked_codes = ("S" => 1, "C" => 2, "Q" => 3)

train_df.Sex = Int64.(replace(train_df.Sex, sex_codes...))
train_df.Embarked = Int64.(replace(train_df.Embarked, embarked_codes...))

# Ticket has too many distinct values to be useful, so the column is dropped.
train_df = select(train_df, Not(:Ticket))
describe(train_df) # clean dataset!

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Real,Float64,Real,Int64,DataType
1,Survived,0.382452,0.0,0.0,1.0,0,Int64
2,Pclass,2.31159,1.0,3.0,3.0,0,Int64
3,Sex,1.64904,1.0,2.0,2.0,0,Int64
4,Age,29.3152,0.42,28.0,80.0,0,Float64
5,SibSp,0.524184,0.0,0.0,8.0,0,Int64
6,Parch,0.382452,0.0,0.0,6.0,0,Int64
7,Fare,32.0967,0.0,14.4542,512.329,0,Float64
8,Embarked,1.3622,1.0,1.0,3.0,0,Int64


Criação e Treinamento dos Modelos (SVC e KNN)

In [9]:
# Features: every column except the label. Target: the Survived column itself.
X_train = train_df[:, Not(:Survived)]
y_train = train_df[!, :Survived]

889-element Vector{Int64}:
 0
 1
 1
 1
 0
 0
 0
 0
 1
 1
 ⋮
 0
 0
 0
 0
 0
 1
 0
 1
 0

In [16]:
using MLJ
# ScikitLearn is loaded by module name only: its exported `fit!`/`predict` would
# otherwise compete with the MLJ functions of the same name used in later cells.
# NOTE(review): nothing visible in this notebook calls ScikitLearn directly; the KNN
# model below comes through MLJScikitLearnInterface. Confirm before dropping it.
import ScikitLearn

# Load the model types
SVC = @load SVC pkg="LIBSVM"
KNN = @load KNeighborsClassifier pkg="MLJScikitLearnInterface"

# Instantiate the models with their default hyperparameters
svc_model = SVC()
knn_model = KNN()

# Put the data in the form MLJ expects.
# The DataFrame is already a table; what matters to MLJ is the scientific type of
# each column. Integer columns are interpreted as `Count`, so they are coerced to
# `Continuous` (presumably what both models require of their inputs — see the
# models' `input_scitype`).
X_train_mlj = coerce(X_train, Count => Continuous)

# The target must be categorical. Marking it as ordered fixes the level order
# (0 < 1), so binary measures treat `1` (survived) as the positive class.
y_train_mlj = categorical(y_train; ordered=true)

import MLJLIBSVMInterface ✔
import MLJScikitLearnInterface

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/thamya/.julia/packages/MLJModels/8W54X/src/loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/thamya/.julia/packages/MLJModels/8W54X/src/loading.jl:159


In [None]:
# Train the models: bind each model to the training data in a machine, then fit it.
svc_machine = machine(svc_model, X_train_mlj, y_train_mlj)
knn_machine = machine(knn_model, X_train_mlj, y_train_mlj)

# `fit!` is qualified because the notebook also loads ScikitLearn, which exports a
# function with the same name; the qualified call is unambiguous either way.
MLJ.fit!(svc_machine)
MLJ.fit!(knn_machine)

Testar e Avaliar os Modelos

In [None]:
using MLJBase

# Reduce model output to one class label per row.
# Probabilistic classifiers return a vector of distributions, which is collapsed to
# the most likely class; any other prediction vector is assumed to already hold
# point predictions and is returned unchanged.
point_labels(predictions::AbstractVector{<:UnivariateFinite}) = mode.(predictions)
point_labels(predictions::AbstractVector) = predictions

# Predictions
# NOTE(review): the models are scored on the same rows they were trained on, so
# these numbers are training-set metrics, not an estimate of generalization.
# `predict` is qualified because ScikitLearn exports a function of the same name.
svc_predictions = MLJBase.predict(svc_machine, X_train_mlj)
knn_predictions = MLJBase.predict(knn_machine, X_train_mlj)

# Convert predictions to class labels
svc_predictions_labels = point_labels(svc_predictions)
knn_predictions_labels = point_labels(knn_predictions)

# MLJ measures are called as `measure(ŷ, y)`. For the binary measures below the
# positive class is the second level of the target (here `1`); there is no third
# positional "positive class" argument. `ppv` is precision, `f1score` is F1.

# SVC evaluation (the accuracy previously used the KNN predictions by mistake)
svc_accuracy = accuracy(svc_predictions_labels, y_train_mlj)
svc_precision = ppv(svc_predictions_labels, y_train_mlj)
svc_recall = recall(svc_predictions_labels, y_train_mlj)
svc_f1 = f1score(svc_predictions_labels, y_train_mlj)

# KNN evaluation
knn_accuracy = accuracy(knn_predictions_labels, y_train_mlj)
knn_precision = ppv(knn_predictions_labels, y_train_mlj)
knn_recall = recall(knn_predictions_labels, y_train_mlj)
knn_f1 = f1score(knn_predictions_labels, y_train_mlj)

In [None]:

# Report the metrics of both models, one labelled section per model.
metric_labels = ("Acurácia: ", "Precisão: ", "Recall: ", "F1-Score: ")
report_sections = (
    "-------- Desempenho do modelo SVC -------- " =>
        (svc_accuracy, svc_precision, svc_recall, svc_f1),
    "\n-------- Desempenho do modelo KNN: --------" =>
        (knn_accuracy, knn_precision, knn_recall, knn_f1),
)

for (header, values) in report_sections
    println(header)
    for (label, value) in zip(metric_labels, values)
        println(label, value)
    end
end