diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md index b8631f521..42c58cd2f 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTE.md @@ -93,11 +93,11 @@ Julia Computing and New Zealand eScience Infrastructure. **Julia language consultants.** Mike Innes, Avik Sengupta -**Other contributors, past and present.** Diego Arenas, Edoardo Barp, -Gergö Bohner, Michael K. Borregaard, Valentin Churavy, Harvey -Devereux, Mosè Giordano, Thibaut Lienart, Mohammed Nook, Annika -Stechemesser, Ayush Shridar, Yiannis Simillides - +**Other contributors, past and present.** Dilum Aluthge, Diego + Arenas, Edoardo Barp, Gergö Bohner, Michael K. Borregaard, + Valentin Churavy, Harvey Devereux, Mosè Giordano, Thibaut Lienart, + Mohammed Nook, Piotr Oleśkiewicz, Julian Samaroo, Ayush Shridar, + Yiannis Simillides, Annika Stechemesser diff --git a/Project.toml b/Project.toml index e76d0f323..f835d6840 100644 --- a/Project.toml +++ b/Project.toml @@ -26,15 +26,18 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -MLJBase = "0.2.6" +CSV = "0.5" +MLJBase = "0.2.6, 0.3" MLJModels = "0.2.0" julia = "1" [extras] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" +MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" [targets] -test = ["DecisionTree", "RDatasets", "Test", "UnicodePlots"] +test = ["CSV", "DecisionTree", "MLJBase", "RDatasets", "Test", "UnicodePlots"] diff --git a/README.md b/README.md index d8553f39b..59ba9c370 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,7 @@ A pure Julia machine learning framework. ## `join!(MLJ, YourModel)` -**Call for help.** MLJ is [getting -attention](https://github.com/trending/julia?since=monthly) but its -small project team needs help to ensure its success. 
This depends +**Call for help.** MLJ needs your help to ensure its success. This depends crucially on: - Existing and developing ML algorithms implementing the MLJ model interface diff --git a/examples/JuliaCon2019/Manifest.toml b/examples/JuliaCon2019/Manifest.toml new file mode 100644 index 000000000..24bbf9675 --- /dev/null +++ b/examples/JuliaCon2019/Manifest.toml @@ -0,0 +1,415 @@ +# This file is machine-generated - editing it directly is not advised + +[[Arpack]] +deps = ["BinaryProvider", "Libdl", "LinearAlgebra"] +git-tree-sha1 = "07a2c077bdd4b6d23a40342a8a108e2ee5e58ab6" +uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +version = "0.3.1" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinDeps]] +deps = ["Compat", "Libdl", "SHA", "URIParser"] +git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9" +uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +version = "0.8.10" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "c7361ce8a2129f20b0e05a89f7070820cfed6648" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.6" + +[[CSV]] +deps = ["CategoricalArrays", "DataFrames", "Dates", "Mmap", "Parsers", "PooledArrays", "Profile", "Tables", "Unicode", "WeakRefStrings"] +git-tree-sha1 = "a7df9250dff3aba96436580dd6ac00d712364cab" +uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +version = "0.5.9" + +[[CategoricalArrays]] +deps = ["Compat", "Future", "Missings", "Printf", "Reexport", "Requires"] +git-tree-sha1 = "94d16e77dfacc59f6d6c1361866906dbb65b6f6b" +uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" +version = "0.5.2" + +[[CodecZlib]] +deps = ["BinaryProvider", "Libdl", "Test", "TranscodingStreams"] +git-tree-sha1 = "36bbf5374c661054d41410dc53ff752972583b9b" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.5.2" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "10050a24b09e8e41b951e9976b109871ce98d965" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.8.0" + +[[Compat]] 
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "2.1.0" + +[[DataFrames]] +deps = ["CategoricalArrays", "Compat", "IteratorInterfaceExtensions", "Missings", "PooledArrays", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "StatsBase", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "7c0f86a01be0f77cc7f3f9096ed875f1217487e1" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "0.18.4" + +[[DataStructures]] +deps = ["InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "0809951a1774dc724da22d26e4289bbaab77809a" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.17.0" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DecisionTree]] +deps = ["DelimitedFiles", "Distributed", "LinearAlgebra", "Random", "ScikitLearnBase", "Statistics", "Test"] +git-tree-sha1 = "594057e7171467e2983ab49739a3019ce2eae67f" +uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" +version = "0.8.3" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distances]] +deps = ["LinearAlgebra", "Printf", "Random", "Statistics", "Test"] +git-tree-sha1 = "a135c7c062023051953141da8437ed74f89d767a" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.8.0" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", 
"StatsFuns"] +git-tree-sha1 = "56a158bc0abe4af5d4027af2275fde484261ca6d" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.19.2" + +[[EzXML]] +deps = ["BinaryProvider", "Libdl", "Printf"] +git-tree-sha1 = "724e13b7522563a18ae4a5cc4a9792ae3b0da3e6" +uuid = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +version = "0.9.3" + +[[FileIO]] +deps = ["Pkg"] +git-tree-sha1 = "351f001a78aa1b7ad2696e386e110b5abd071c71" +uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +version = "1.0.7" + +[[FixedPointNumbers]] +git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.6.1" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[HTTP]] +deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"] +git-tree-sha1 = "03ddc88af7f2d963fac5aa9f3ac8e11914d68a78" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "0.8.4" + +[[IniFile]] +deps = ["Test"] +git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + +[[LIBLINEAR]] +deps = ["DelimitedFiles", "Libdl", "SparseArrays", "Test"] +git-tree-sha1 = "42cacc29d9b4ae77b6702c181bbfa58f14d8ef7a" +uuid = "2d691ee1-e668-5016-a719-b2531b85e0f5" +version = "0.5.1" + +[[LIBSVM]] +deps = ["Compat", "DelimitedFiles", "LIBLINEAR", "Libdl", "ScikitLearnBase", "SparseArrays", "Test"] +git-tree-sha1 = "f17068e3f13a83da68c05f36c47a696b22129cff" +uuid = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b" +version = "0.3.1" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = 
"56ddb016-857b-54e1-b83d-db4d58db5568" + +[[MLJ]] +deps = ["CSV", "CategoricalArrays", "DataFrames", "Dates", "Distributed", "Distributions", "InteractiveUtils", "LinearAlgebra", "MLJBase", "MLJModels", "Pkg", "ProgressMeter", "Random", "RecipesBase", "RemoteFiles", "Statistics", "StatsBase", "Tables"] +git-tree-sha1 = "8b5de48dda61ba6f1d7e0c22c38ade273155cafa" +repo-rev = "master" +repo-url = "https://github.com/alan-turing-institute/MLJ.jl.git" +uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" +version = "0.2.5" + +[[MLJBase]] +deps = ["CSV", "CategoricalArrays", "ColorTypes", "Distributions", "InteractiveUtils", "Random", "SparseArrays", "Statistics", "StatsBase", "Tables"] +git-tree-sha1 = "fab2cfd5f28850133b686d55cdbddadf8dd3f5a9" +uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +version = "0.2.6" + +[[MLJModels]] +deps = ["CategoricalArrays", "Distances", "Distributions", "LIBSVM", "LinearAlgebra", "MLJBase", "Pkg", "Random", "Requires"] +git-tree-sha1 = "641995f074fa5a790301c56e57c888ab938c58fc" +uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7" +version = "0.2.5" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS]] +deps = ["BinaryProvider", "Dates", "Distributed", "Libdl", "Random", "Sockets", "Test"] +git-tree-sha1 = "2d94286a9c2f52c63a16146bb86fd6cdfbf677c6" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "0.6.8" + +[[Missings]] +deps = ["SparseArrays", "Test"] +git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Mocking]] +deps = ["Compat", "Dates"] +git-tree-sha1 = "4bf69aaf823b119b034e091e16b18311aa191663" +uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" +version = "0.5.7" + +[[MultivariateStats]] +deps = ["Arpack", "LinearAlgebra", "Printf", "Random", "SparseArrays", "Statistics", "StatsBase", "Test"] +git-tree-sha1 = "cf1c990020bc4a52ff34ba2ee058b7cb677141f2" +uuid = 
"6f286f6a-111f-5878-ab1e-185364afe411" +version = "0.6.0" + +[[OrderedCollections]] +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.1.0" + +[[PDMats]] +deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] +git-tree-sha1 = "8b68513175b2dc4023a564cb0e917ce90e74fd69" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.9.7" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.6" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PooledArrays]] +git-tree-sha1 = "6e8c38927cb6e9ae144f7277c753714861b27d14" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "0.5.2" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[Profile]] +deps = ["Printf"] +uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" + +[[ProgressMeter]] +deps = ["Distributed", "Printf"] +git-tree-sha1 = "0f08e0e74e5b160ca20d3962a2620038b75881c7" +uuid = "92933f4c-e287-5a05-a399-4b506db050ca" +version = "1.0.0" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra", "Test"] +git-tree-sha1 = "3ce467a8e76c6030d4c3786e7d3a73442017cdc0" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.0.3" + +[[RData]] +deps = ["CategoricalArrays", "CodecZlib", "DataFrames", "Dates", "FileIO", "TimeZones"] +git-tree-sha1 = "3b0fc2f7df61b8890502851281c1eb0d2407d6ac" +uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" +version = "0.6.2" + +[[RDatasets]] +deps = ["CSV", "CodecZlib", "DataFrames", "FileIO", "Printf", "RData", "Reexport"] +git-tree-sha1 = "f701bd7dc55cba37dd81a7053c20aadfde425ad0" +uuid = "ce6b1742-4840-55fa-b093-852dadbb1d8b" +version = "0.6.2" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = 
"3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[RecipesBase]] +git-tree-sha1 = "7bdce29bc9b2f5660a6e5e64d64d91ec941f6aa2" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "0.7.0" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[RemoteFiles]] +deps = ["Dates", "HTTP", "Test"] +git-tree-sha1 = "0bf57958308f3e3a6dcce1d34ec8225d6bc247ea" +uuid = "cbe49d4c-5af1-5b60-bb70-0a60aa018e1b" +version = "0.2.1" + +[[Requires]] +deps = ["Test"] +git-tree-sha1 = "f6fbf4ba64d295e146e49e021207993b6b48c7d1" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "0.5.2" + +[[Rmath]] +deps = ["BinaryProvider", "Libdl", "Random", "Statistics", "Test"] +git-tree-sha1 = "9a6c758cdf73036c3239b0afbea790def1dabff9" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.5.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[ScikitLearnBase]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "7877e55c1523a4b336b433da39c8e8c08d2f221f" +uuid = "6e75b9c4-186b-50bd-896f-2d2496a4843e" +version = "0.5.0" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["BinDeps", "BinaryProvider", "Libdl", "Test"] +git-tree-sha1 = "0b45dc2e45ed77f445617b99ff2adf0f5b0f23ea" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "0.7.2" + 
+[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = "2b6ca97be7ddfad5d9f16a13fe277d29f3d11c23" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.31.0" + +[[StatsFuns]] +deps = ["Rmath", "SpecialFunctions", "Test"] +git-tree-sha1 = "b3a4e86aa13c732b8a8c0ba0c3d3264f55e6bb3e" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.8.0" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.0" + +[[Tables]] +deps = ["DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "Requires", "TableTraits", "Test"] +git-tree-sha1 = "2e5d1a0d9b574ee2ed0c1a2fe32807de022376dd" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "0.2.9" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimeZones]] +deps = ["Dates", "EzXML", "Mocking", "Printf", "Serialization", "Unicode"] +git-tree-sha1 = "859bfc1832ea52e413c96fa5c92130516db62bdb" +uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" +version = "0.9.1" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "a25d8e5a28c3b1b06d3859f30757d43106791919" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.4" + +[[URIParser]] +deps = ["Test", "Unicode"] +git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.0" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[WeakRefStrings]] +deps = 
["Random", "Test"] +git-tree-sha1 = "9a0bb82eede528debe631b642eeb48a631a69bc2" +uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" +version = "0.6.1" diff --git a/examples/JuliaCon2019/Project.toml b/examples/JuliaCon2019/Project.toml new file mode 100644 index 000000000..a3c9ab383 --- /dev/null +++ b/examples/JuliaCon2019/Project.toml @@ -0,0 +1,7 @@ +[deps] +DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" +LIBSVM = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b" +MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" +MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" +MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" diff --git a/examples/JuliaCon2019/demo.ipynb b/examples/JuliaCon2019/demo.ipynb new file mode 100644 index 000000000..dd9b51d75 --- /dev/null +++ b/examples/JuliaCon2019/demo.ipynb @@ -0,0 +1,1027 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m registry at `~/.julia/registries/General`\n", + "\u001b[32m\u001b[1m Updating\u001b[22m\u001b[39m git-repo `https://github.com/JuliaRegistries/General.git`\n", + "\u001b[?25l\u001b[2K\u001b[?25h" + ] + } + ], + "source": [ + "using Pkg\n", + "Pkg.activate(@__DIR__)\n", + "Pkg.instantiate()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Loading model metadata\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/MLJ.jl:114\n" + ] + } + ], + "source": [ + "using MLJ" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting some data:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

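<p>4 rows × 4 columns</p><table><thead><tr><th></th><th>SepalLength</th><th>SepalWidth</th><th>PetalLength</th><th>PetalWidth</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><tr><th>1</th><td>7.2</td><td>3.2</td><td>6.0</td><td>1.8</td></tr><tr><th>2</th><td>5.0</td><td>3.5</td><td>1.3</td><td>0.3</td></tr><tr><th>3</th><td>5.0</td><td>3.5</td><td>1.6</td><td>0.6</td></tr><tr><th>4</th><td>5.7</td><td>2.9</td><td>4.2</td><td>1.3</td></tr></tbody></table>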
" + ], + "text/latex": [ + "\\begin{tabular}{r|cccc}\n", + "\t& SepalLength & SepalWidth & PetalLength & PetalWidth\\\\\n", + "\t\\hline\n", + "\t& Float64 & Float64 & Float64 & Float64\\\\\n", + "\t\\hline\n", + "\t1 & 7.2 & 3.2 & 6.0 & 1.8 \\\\\n", + "\t2 & 5.0 & 3.5 & 1.3 & 0.3 \\\\\n", + "\t3 & 5.0 & 3.5 & 1.6 & 0.6 \\\\\n", + "\t4 & 5.7 & 2.9 & 4.2 & 1.3 \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "4×4 DataFrame\n", + "│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │\n", + "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", + "├─────┼─────────────┼────────────┼─────────────┼────────────┤\n", + "│ 1 │ 7.2 │ 3.2 │ 6.0 │ 1.8 │\n", + "│ 2 │ 5.0 │ 3.5 │ 1.3 │ 0.3 │\n", + "│ 3 │ 5.0 │ 3.5 │ 1.6 │ 0.6 │\n", + "│ 4 │ 5.7 │ 2.9 │ 4.2 │ 1.3 │" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "using RDatasets\n", + "iris = dataset(\"datasets\", \"iris\"); # a DataFrame\n", + "scrambled = shuffle(1:size(iris, 1))\n", + "X = iris[scrambled, 1:4];\n", + "y = iris[scrambled, 5];\n", + "\n", + "first(X, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5-element CategoricalArray{String,1,UInt8}:\n", + " \"virginica\" \n", + " \"setosa\" \n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"setosa\" " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y[1:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic fit and predict:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import MLJModels ✔\n", + "import LIBSVM ✔\n", + "import MLJModels.LIBSVM_.SVC ✔\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ 
Info: Training \u001b[34mMachine{SVC} @ 1…12\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*\n", + "optimization finished, #iter = 33\n", + "nu = 0.038907\n", + "obj = -1.945147, rho = -0.167869\n", + "nSV = 10, nBSV = 0\n", + "*\n", + "optimization finished, #iter = 48\n", + "nu = 0.293514\n", + "obj = -21.377494, rho = -0.144367\n", + "nSV = 33, nBSV = 26\n", + "*\n", + "optimization finished, #iter = 35\n", + "nu = 0.046521\n", + "obj = -2.403410, rho = 0.039522\n", + "nSV = 10, nBSV = 2\n", + "Total nSV = 44\n" + ] + }, + { + "data": { + "text/plain": [ + "150-element Array{CategoricalString{UInt8},1}:\n", + " \"virginica\" \n", + " \"setosa\" \n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"virginica\" \n", + " \"virginica\" \n", + " \"virginica\" \n", + " ⋮ \n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"virginica\" \n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"versicolor\"" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@load SVC()\n", + "classifier_ = SVC()\n", + "classifier = machine(classifier_, X, y)\n", + "fit!(classifier)\n", + "ŷ = predict(classifier, X) # or some Xnew" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating the model:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Evaluating using a holdout set. 
\n", + "│ fraction_train=0.8 \n", + "│ shuffle=false \n", + "│ measure=MLJ.misclassification_rate \n", + "│ operation=StatsBase.predict \n", + "│ Resampling from all rows. \n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/resampling.jl:100\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import MLJModels ✔\n", + "import MultivariateStats ✔\n", + "import MLJModels.MultivariateStats_.PCA ✔\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{PCA} @ 1…98\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n" + ] + }, + { + "data": { + "text/html": [ + "

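<p>3 rows × 3 columns</p><table><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><tr><th>1</th><td>-2.61409</td><td>0.560901</td><td>0.205535</td></tr><tr><th>2</th><td>2.7701</td><td>0.263528</td><td>-0.0772477</td></tr><tr><th>3</th><td>2.40561</td><td>0.188871</td><td>-0.263868</td></tr></tbody></table>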
" + ], + "text/latex": [ + "\\begin{tabular}{r|ccc}\n", + "\t& x1 & x2 & x3\\\\\n", + "\t\\hline\n", + "\t& Float64 & Float64 & Float64\\\\\n", + "\t\\hline\n", + "\t1 & -2.61409 & 0.560901 & 0.205535 \\\\\n", + "\t2 & 2.7701 & 0.263528 & -0.0772477 \\\\\n", + "\t3 & 2.40561 & 0.188871 & -0.263868 \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "3×3 DataFrame\n", + "│ Row │ x1 │ x2 │ x3 │\n", + "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", + "├─────┼──────────┼──────────┼────────────┤\n", + "│ 1 │ -2.61409 │ 0.560901 │ 0.205535 │\n", + "│ 2 │ 2.7701 │ 0.263528 │ -0.0772477 │\n", + "│ 3 │ 2.40561 │ 0.188871 │ -0.263868 │" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate!(classifier,\n", + " resampling=Holdout(fraction_train=0.8),\n", + " measure=misclassification_rate)\n", + "# ## Adding dimension reduction:\n", + "@load PCA\n", + "dim_reducer_ = PCA()\n", + "dim_reducer = machine(dim_reducer_, X)\n", + "fit!(dim_reducer)\n", + "Xsmall = transform(dim_reducer, X);\n", + "\n", + "first(Xsmall, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*\n", + "optimization finished, #iter = 23\n", + "nu = 0.038664\n", + "obj = -1.933164, rho = -0.165650\n", + "nSV = 8, nBSV = 0\n", + "*\n", + "optimization finished, #iter = 38\n", + "nu = 0.293883\n", + "obj = -21.597810, rho = -0.082448\n", + "nSV = 34, nBSV = 26\n", + "*\n", + "optimization finished, #iter = 30\n", + "nu = 0.045664\n", + "obj = -2.380751, rho = 0.053250\n", + "nSV = 9, nBSV = 2\n", + "Total nSV = 45\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mMachine{SVC} @ 1…52\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n" + ] + }, + { + "data": { + "text/plain": [ + "150-element 
Array{CategoricalString{UInt8},1}:\n", + " \"virginica\" \n", + " \"setosa\" \n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"virginica\" \n", + " \"virginica\" \n", + " \"virginica\" \n", + " ⋮ \n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"virginica\" \n", + " \"virginica\" \n", + " \"versicolor\"\n", + " \"setosa\" \n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"versicolor\"\n", + " \"versicolor\"" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier = machine(classifier_, Xsmall, y)\n", + "fit!(classifier)\n", + "ŷ = predict(classifier, Xsmall)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building a composite model:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Method 1: Compact syntax (but not generalizable):" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(not implemented at time of talk)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# composite_ = @pipeline dim_reducer_ classifier_\n", + "\n", + "# composite = machine(composite_, X, y)\n", + "# evaluate!(composite, measure=misclassification_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Method 2: Re-interpret unstreamlined code:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[34mNode @ 1…92\u001b[39m = predict(\u001b[0m\u001b[1m1…90\u001b[22m, transform(\u001b[0m\u001b[1m1…02\u001b[22m, \u001b[34m5…24\u001b[39m))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Xraw = X;\n", + "yraw = y;\n", + 
"\n", + "X = source(Xraw)\n", + "y = source(yraw)\n", + "\n", + "dim_reducer = machine(dim_reducer_, X)\n", + "Xsmall = transform(dim_reducer, X)\n", + "\n", + "classifier = machine(classifier_, Xsmall, y)\n", + "ŷ = predict(classifier, Xsmall)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Training \u001b[34mNodalMachine{PCA} @ 1…02\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n", + "┌ Info: Training \u001b[34mNodalMachine{SVC} @ 1…90\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*\n", + "optimization finished, #iter = 23\n", + "nu = 0.038664\n", + "obj = -1.933164, rho = -0.165650\n", + "nSV = 8, nBSV = 0\n", + "*\n", + "optimization finished, #iter = 38\n", + "nu = 0.293883\n", + "obj = -21.597810, rho = -0.082448\n", + "nSV = 34, nBSV = 26\n", + "*\n", + "optimization finished, #iter = 30\n", + "nu = 0.045664\n", + "obj = -2.380751, rho = 0.053250\n", + "nSV = 9, nBSV = 2\n", + "Total nSV = 45\n" + ] + }, + { + "data": { + "text/plain": [ + "\u001b[34mNode @ 1…92\u001b[39m = predict(\u001b[0m\u001b[1m1…90\u001b[22m, transform(\u001b[0m\u001b[1m1…02\u001b[22m, \u001b[34m5…24\u001b[39m))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit!(ŷ)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{CategoricalString{UInt8},1}:\n", + " \"setosa\" \n", + " \"versicolor\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ŷ(rows=3:4)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "┌ Info: Updating \u001b[34mNodalMachine{PCA} @ 1…02\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:152\n", + "┌ Info: Training \u001b[34mNodalMachine{SVC} @ 1…90\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:140\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*\n", + "optimization finished, #iter = 13\n", + "nu = 0.030533\n", + "obj = -1.526884, rho = -0.270704\n", + "nSV = 4, nBSV = 1\n", + "*\n", + "optimization finished, #iter = 20\n", + "nu = 0.355841\n", + "obj = -30.258034, rho = 0.019778\n", + "nSV = 36, nBSV = 34\n", + "*\n", + "optimization finished, #iter = 8\n", + "nu = 0.048815\n", + "obj = -2.645552, rho = 0.204566\n", + "nSV = 7, nBSV = 4\n", + "Total nSV = 44\n" + ] + }, + { + "data": { + "text/plain": [ + "\u001b[34mNode @ 1…92\u001b[39m = predict(\u001b[0m\u001b[1m1…90\u001b[22m, transform(\u001b[0m\u001b[1m1…02\u001b[22m, \u001b[34m5…24\u001b[39m))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dim_reducer_.ncomp = 1 # maximum output dimension\n", + "fit!(ŷ)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{CategoricalString{UInt8},1}:\n", + " \"setosa\" \n", + " \"versicolor\"" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ŷ(rows=3:4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Changing classifier hyperparameter does not retrigger retraining of\n", + " upstream dimension reducer:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*\n", + "optimization finished, #iter = 13\n", + "nu = 0.033696\n", + "obj = -1.838789, rho = -0.128178\n", + "nSV = 5, nBSV = 
2\n", + "*\n", + "optimization finished, #iter = 24\n", + "nu = 0.429648\n", + "obj = -35.588638, rho = -0.040530\n", + "nSV = 44, nBSV = 42\n", + "*\n", + "optimization finished, #iter = 5\n", + "nu = 0.080000\n", + "obj = -4.676483, rho = -0.106043\n", + "nSV = 8, nBSV = 8\n", + "Total nSV = 53\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Not retraining \u001b[34mNodalMachine{PCA} @ 1…02\u001b[39m.\n", + "│ It appears up-to-date. Use force=true to force retraining.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:146\n", + "┌ Info: Updating \u001b[34mNodalMachine{SVC} @ 1…90\u001b[39m.\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/machines.jl:152\n" + ] + }, + { + "data": { + "text/plain": [ + "\u001b[34mNode @ 1…92\u001b[39m = predict(\u001b[0m\u001b[1m1…90\u001b[22m, transform(\u001b[0m\u001b[1m1…02\u001b[22m, \u001b[34m5…24\u001b[39m))" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier_.gamma = 0.1\n", + "fit!(ŷ)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{CategoricalString{UInt8},1}:\n", + " \"setosa\" \n", + " \"versicolor\"" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ŷ(rows=3:4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predicting on new data (`Xraw` in `source(Xraw)` is substituted for `Xnew`):" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{CategoricalString{UInt8},1}:\n", + " \"setosa\"\n", + " \"setosa\"" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Xnew = (SepalLength = [4.0, 5.2],\n", + " SepalWidth = [3.2, 3.0],\n", + " PetalLength = 
[1.2, 1.5],\n", + " PetalWidth = [0.1, 0.4],)\n", + "ŷ(Xnew)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Exporting network as stand-alone reusable model:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(pca = (ncomp = 1,\n", + " method = :auto,\n", + " pratio = 0.99,\n", + " mean = nothing,),\n", + " svc = (kernel = RadialBasis::KERNEL = 2,\n", + " gamma = 0.1,\n", + " weights = nothing,\n", + " cost = 1.0,\n", + " degree = 3,\n", + " coef0 = 0.0,\n", + " tolerance = 0.001,\n", + " shrinking = true,\n", + " probability = false,),)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "composite_ = @from_network Composite(pca=dim_reducer_, svc=classifier_) <= (X, y, ŷ)\n", + "params(composite_)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Evaluating using cross-validation. \n", + "│ nfolds=6. \n", + "│ shuffle=false \n", + "│ measure=MLJ.misclassification_rate \n", + "│ operation=StatsBase.predict \n", + "│ Resampling from all rows. 
\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/resampling.jl:151\n", + "\u001b[33mCross-validating: 100%[=========================] Time: 0:00:02\u001b[39m\n" + ] + }, + { + "data": { + "text/plain": [ + "6-element Array{Float64,1}:\n", + " 0.08\n", + " 0.08\n", + " 0.0 \n", + " 0.12\n", + " 0.08\n", + " 0.04" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "composite = machine(composite_, Xraw, yraw)\n", + "evaluate!(composite, measure=misclassification_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating a \"self-tuning\" random forest (nested resampling):" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dict{Any,Any} with 6 entries:\n", + " \"MultivariateStats\" => Any[\"RidgeRegressor\"]\n", + " \"MLJ\" => Any[\"MLJ.Constant.DeterministicConstantRegressor\", \"ML…\n", + " \"DecisionTree\" => Any[\"DecisionTreeRegressor\"]\n", + " \"ScikitLearn\" => Any[\"SVMLRegressor\", \"ElasticNet\", \"ElasticNetCV\", \"SV…\n", + " \"LIBSVM\" => Any[\"EpsilonSVR\", \"NuSVR\"]\n", + " \"XGBoost\" => Any[\"XGBoostRegressor\"]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "task = load_boston()\n", + "models(task)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluating a single tree:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import MLJModels ✔\n", + "import DecisionTree ✔\n", + "import MLJModels.DecisionTree_.DecisionTreeRegressor ✔\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Evaluating using a holdout set. 
\n", + "│ fraction_train=0.7 \n", + "│ shuffle=false \n", + "│ measure=Function[rms, mav] \n", + "│ operation=StatsBase.predict \n", + "│ Resampling from all rows. \n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/resampling.jl:100\n" + ] + }, + { + "data": { + "text/plain": [ + "(MLJ.rms = 8.795939100833767,\n", + " MLJ.mav = 5.785953164160401,)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@load DecisionTreeRegressor # load code\n", + "\n", + "tree_ = DecisionTreeRegressor(n_subfeatures=3)\n", + "tree = machine(tree_, task)\n", + "evaluate!(tree,\n", + " resampling=Holdout(fraction_train=0.7),\n", + " measure=[rms, mav])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use ensembling wrapper to create a random forest:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.DeterministicEnsembleModel(atom = \u001b[34mDecisionTreeRegressor @ 7…75\u001b[39m,\n", + " weights = Float64[],\n", + " bagging_fraction = 0.8,\n", + " rng = MersenneTwister(UInt32[0x08804db9, 0xfc38831f, 0xd5683001, 0x444075ec]),\n", + " n = 10,\n", + " parallel = true,\n", + " out_of_bag_measure = Any[],)\u001b[34m @ 9…74\u001b[39m" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "forest_ = EnsembleModel(atom=tree_, n=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrapping in a tuning strategy creates a \"self_tuning\" random forest:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MLJ.DeterministicTunedModel(model = \u001b[34mDeterministicEnsembleModel{DecisionTreeRegressor} @ 9…74\u001b[39m,\n", + " tuning = \u001b[34mGrid @ 2…87\u001b[39m,\n", + " resampling = \u001b[34mCV @ 1…01\u001b[39m,\n", + " measure = MLJ.rms,\n", + 
" operation = StatsBase.predict,\n", + " ranges = MLJ.NumericRange{T,Symbol} where T[\u001b[34mNumericRange @ 1…81\u001b[39m, \u001b[34mNumericRange @ 1…80\u001b[39m],\n", + " minimize = true,\n", + " full_report = true,\n", + " train_best = true,)\u001b[34m @ 6…25\u001b[39m" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r1 = range(forest_, :bagging_fraction, lower=0.4, upper=1.0);\n", + "r2 = range(forest_, :(atom.n_subfeatures), lower=1, upper=12)\n", + "\n", + "self_tuning_forest_ = TunedModel(model=forest_,\n", + " tuning=Grid(),\n", + " resampling=CV(),\n", + " ranges=[r1,r2],\n", + " measure=rms)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the self_tuning_forest (nested resampling):" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Info: Evaluating using cross-validation. \n", + "│ nfolds=6. \n", + "│ shuffle=false \n", + "│ measure=Function[rms, rmslp1] \n", + "│ operation=StatsBase.predict \n", + "│ Resampling from all rows. 
\n", + "└ @ MLJ /Users/anthony/.julia/packages/MLJ/tod7z/src/resampling.jl:151\n", + "\u001b[33mCross-validating: 100%[=========================] Time: 0:00:18\u001b[39m\n" + ] + }, + { + "data": { + "text/plain": [ + "(MLJ.rms = [2.91827, 3.40544, 4.60971, 4.54709, 8.12081, 3.79819],\n", + " MLJ.rmslp1 = [0.148546, 0.119118, 0.148812, 0.134863, 0.345141, 0.221093],)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "self_tuning_forest = machine(self_tuning_forest_, task)\n", + "\n", + "evaluate!(self_tuning_forest,\n", + " resampling=CV(),\n", + " measure=[rms,rmslp1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.1.0", + "language": "julia", + "name": "julia-1.1" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.1.1" + } + }, + "nbformat": 4, + "nbformat_minor": 3 +} diff --git a/examples/JuliaCon2019/demo.jl b/examples/JuliaCon2019/demo.jl new file mode 100644 index 000000000..1e8e5857f --- /dev/null +++ b/examples/JuliaCon2019/demo.jl @@ -0,0 +1,174 @@ +using Pkg +Pkg.activate(@__DIR__) +Pkg.instantiate() + +#- + +using MLJ + + +# ## Getting some data: + +using RDatasets +iris = dataset("datasets", "iris"); # a DataFrame +scrambled = shuffle(1:size(iris, 1)) +X = iris[scrambled, 1:4]; +y = iris[scrambled, 5]; + +first(X, 4) + +#- + +y[1:5] + + +# ## Basic fit and predict: + +@load SVC() +classifier_ = SVC() +classifier = machine(classifier_, X, y) +fit!(classifier) +ŷ = predict(classifier, X) # or some Xnew + +#- + +# ## Evaluating the model: +evaluate!(classifier, + resampling=Holdout(fraction_train=0.8), + measure=misclassification_rate) + + +# ## Adding dimension reduction: + +@load PCA +dim_reducer_ = PCA() +dim_reducer = machine(dim_reducer_, X) +fit!(dim_reducer) +Xsmall = transform(dim_reducer, X); + +first(Xsmall, 3) + +#- + +classifier = machine(classifier_, Xsmall, y) +fit!(classifier) +ŷ = 
predict(classifier, Xsmall) + + +# ## Building a composite model: + +# ### Method 1: Compact syntax (but not generalizable): + +# (not implemented at time of talk) + +## composite_ = @pipeline dim_reducer_ classifier_ + +## composite = machine(composite_, X, y) +## evaluate!(composite, measure=misclassification_rate) + + +# ### Method 2: Re-interpret unstreamlined code: + +Xraw = X; +yraw = y; + +X = source(Xraw) +y = source(yraw) + +dim_reducer = machine(dim_reducer_, X) +Xsmall = transform(dim_reducer, X) + +classifier = machine(classifier_, Xsmall, y) +ŷ = predict(classifier, Xsmall) + +#- + +fit!(ŷ) + +#- + +ŷ(rows=3:4) + +#- + +dim_reducer_.ncomp = 1 # maximum output dimension +fit!(ŷ) + +#- + +ŷ(rows=3:4) + +# Changing classifier hyperparameter does not retrigger retraining of +# upstream dimension reducer: + +classifier_.gamma = 0.1 +fit!(ŷ) + +#- + +ŷ(rows=3:4) + +# Predicting on new data (`Xnew` is substituted for `Xraw` in `source(Xraw)`): + +Xnew = (SepalLength = [4.0, 5.2], + SepalWidth = [3.2, 3.0], + PetalLength = [1.2, 1.5], + PetalWidth = [0.1, 0.4],) +ŷ(Xnew) + + +# #### Exporting network as stand-alone reusable model: + +composite_ = @from_network Composite(pca=dim_reducer_, svc=classifier_) <= (X, y, ŷ) +params(composite_) + +#- + +composite = machine(composite_, Xraw, yraw) +evaluate!(composite, measure=misclassification_rate) + +# ## Evaluating a "self-tuning" random forest (nested resampling): + +task = load_boston() +models(task) + +#- + +# ### Evaluating a single tree: + +@load DecisionTreeRegressor # load code + +tree_ = DecisionTreeRegressor(n_subfeatures=3) +tree = machine(tree_, task) +evaluate!(tree, + resampling=Holdout(fraction_train=0.7), + measure=[rms, mav]) + +# ### Use ensembling wrapper to create a random forest: + +forest_ = EnsembleModel(atom=tree_, n=10) + + +# ### Wrapping in a tuning strategy creates a "self_tuning" random forest: + +r1 = range(forest_, :bagging_fraction, lower=0.4, upper=1.0); +r2 = range(forest_, 
:(atom.n_subfeatures), lower=1, upper=12) + +self_tuning_forest_ = TunedModel(model=forest_, + tuning=Grid(), + resampling=CV(), + ranges=[r1,r2], + measure=rms) + +# ### Evaluate the self_tuning_forest (nested resampling): + +self_tuning_forest = machine(self_tuning_forest_, task) + +evaluate!(self_tuning_forest, + resampling=CV(), + measure=[rms,rmslp1]) + + + + + diff --git a/examples/JuliaCon2019/talk.pdf b/examples/JuliaCon2019/talk.pdf new file mode 100644 index 000000000..0770a43d9 Binary files /dev/null and b/examples/JuliaCon2019/talk.pdf differ diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..31cfca5cf --- /dev/null +++ b/examples/README.md @@ -0,0 +1,3 @@ +To ensure loading of correct package versions, be sure to run +scripts/notebooks in the same directory containing the Project.toml +files that accompany them in this repo. diff --git a/src/tuning.jl b/src/tuning.jl index a1eca5000..8c49e921f 100644 --- a/src/tuning.jl +++ b/src/tuning.jl @@ -92,6 +92,7 @@ function TunedModel(;model=nothing, !isempty(ranges) || error("You need to specify ranges=... ") model !== nothing || error("You need to specify model=... ") + model isa Supervised || error("model must be a SupervisedModel. 
") message = clean!(model) isempty(message) || @info message diff --git a/test/Transformers.jl b/test/Transformers.jl index e75056ae0..6aba3f77a 100644 --- a/test/Transformers.jl +++ b/test/Transformers.jl @@ -1,7 +1,9 @@ module TestTransformer # using Revise -using MLJ, MLJBase +using MLJ +using MLJBase +using CSV using Test using Statistics using DataFrames diff --git a/test/datasets.jl b/test/datasets.jl index ad806a486..0a7ea56a2 100644 --- a/test/datasets.jl +++ b/test/datasets.jl @@ -2,6 +2,8 @@ module TestDatasets # using Revise using MLJ +using MLJBase +using CSV load_ames() load_boston() diff --git a/test/ensembles.jl b/test/ensembles.jl index bafe09aa6..ee2cfe8e5 100644 --- a/test/ensembles.jl +++ b/test/ensembles.jl @@ -10,7 +10,8 @@ module TestEnsembles using Test using Random using MLJ -import MLJBase +using MLJBase +using CSV using CategoricalArrays using Distributions diff --git a/test/machines.jl b/test/machines.jl index c50122c7c..ca09a0eab 100644 --- a/test/machines.jl +++ b/test/machines.jl @@ -2,7 +2,8 @@ module TestMachines # using Revise using MLJ -import MLJBase +using MLJBase +using CSV using Test using Statistics diff --git a/test/networks.jl b/test/networks.jl index 854d97f76..bb8a6ba80 100644 --- a/test/networks.jl +++ b/test/networks.jl @@ -3,7 +3,8 @@ module TestLearningNetworks # using Revise using Test using MLJ -import MLJBase +using MLJBase +using CSV using CategoricalArrays # TRAINABLE MODELS diff --git a/test/resampling.jl b/test/resampling.jl index 4d154a0c8..981394faf 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -4,6 +4,7 @@ module TestResampling using Test using MLJ using MLJBase +using CSV using DataFrames x1 = ones(4) diff --git a/test/runtests.jl b/test/runtests.jl index a7f005e36..44cec0e71 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,8 @@ # eg, `module TestDatasets` for code testing `datasets.jl`. 
using MLJ +using MLJBase +using CSV using Test @constant junk=KNNRegressor() diff --git a/test/tuning.jl b/test/tuning.jl index b3ed54799..cddac2c96 100644 --- a/test/tuning.jl +++ b/test/tuning.jl @@ -4,7 +4,8 @@ module TestTuning using Test using MLJ # using UnicodePlots -import MLJBase +using MLJBase +using CSV x1 = rand(100); x2 = rand(100);