In [2]:
%classpath add jar ./scicloj-ml.jar

In [3]:
(require '[scicloj.ml.core :as ml]
         '[scicloj.ml.metamorph :as mm]
         '[scicloj.ml.dataset :as ds])

(import '[smile.data.formula Formula]
        '[smile.data.type StructField StructType DataType]
        '[smile.data DataFrame])

..instrumented #'scicloj.metamorph.ml/model
..instrumented #'scicloj.metamorph.ml/explain
..instrumented #'scicloj.metamorph.ml/evaluate-pipelines
..instrumented #'scicloj.metamorph.ml/define-model!
..instrumented #'scicloj.metamorph.ml/train
..instrumented #'scicloj.metamorph.ml/thaw-model
..instrumented #'scicloj.metamorph.ml/default-loss-fn
..instrumented #'scicloj.metamorph.ml/predict
Register model:  :smile.classification/linear-discriminant-analysis
Register model:  :smile.classification/fld
Register model:  :smile.classification/random-forest
Register model:  :smile.classification/ada-boost
Register model:  :smile.classification/knn
Register model:  :smile.classification/decision-tree
Register model:  :smile.classification/gradient-tree-boost
Register model:  :smile.classification/regularized-discriminant-analysis
Register model:  :smile.classification/quadratic-discriminant-analysis
Register model:  :smile.classification/logistic-regression
Register model:  :smile.regression/or



interface smile.data.DataFrame

In [4]:
(import 'jupyter.Displayer)

(import 'jupyter.Displayers)

;; Marker interface for values that should be rendered as HTML by the notebook.
;; It declares a single zero-argument method `html` returning a java.lang.String.
(definterface Html
    (^java.lang.String html []))

interface beaker_clojure_shell_00d159ab_acfd_4247_b55e_19029c52e556.Html

In [5]:
;; Register a Displayer for the Html interface: `display` returns a map from the
;; "text/html" MIME type to whatever the value's `.html` method returns.
;; `setMimeTypes` is implemented as a no-op that ignores its arguments.
(Displayers/register Html
                     (proxy [Displayer] []
                         (display [var1] {"text/html" (.html var1)})
                         (setMimeTypes [& types])))

;; Wrap a value in an object implementing the Html interface; its `html` method
;; simply returns the wrapped value (expected to be an HTML string, since the
;; interface method is declared to return java.lang.String).
(defn render-html [html]
    (reify Html
        ;; Inside this method body `html` refers to the function argument.
        (html [this]
              html)))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/render-html

In [6]:
(defn to-number-and-symbol
  "Return a lazy sequence of rows (maps) in which every string value has been
  replaced by the result of `read-string` on it; all other values are kept.

  NOTE(review): `read-string` is the full Clojure reader and can execute code
  through reader macros when `*read-eval*` is enabled. It is kept here to
  preserve behavior, but should only be fed trusted data."
  [ds]
  (let [parse-value (fn [v] (if (string? v) (read-string v) v))
        parse-row   (fn [row]
                      (into {}
                            (map (fn [[k v]] [k (parse-value v)]))
                            row))]
    (map parse-row ds)))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/to-number-and-symbol

In [7]:
;; Load the data: slurp "titanic-enhanced-data.txt", read it as a single Clojure
;; form with read-string, then convert string values via to-number-and-symbol.
;; NOTE(review): presumably the file holds one literal sequence of maps — confirm.
(def enhanced-titanic ((comp to-number-and-symbol read-string slurp) "titanic-enhanced-data.txt"))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/enhanced-titanic

In [8]:
(defn train-test-split
  "Split `dataset` (a sequence of row maps) at position `train-n` and return
  a two-element vector `[train test]`, each part converted with `ds/dataset`.

  The first `train-n` rows form the training part; the remainder is the test
  part. No shuffling is done here; callers shuffle beforehand if needed."
  [train-n dataset]
  ;; The previous version also bound an unused `ks` (distinct keys of all rows);
  ;; it was never read, so it has been removed.
  (let [[train tst] (split-at train-n dataset)]
    [(ds/dataset train) (ds/dataset tst)]))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/train-test-split

In [9]:
;; Destructuring `def`: interns one var in the current namespace (*ns*) for each
;; binding produced by destructuring `value` against the pattern `names`.
;;
;; How it works at runtime of the expanded form:
;;   1. `destructure` expands [names value] into a flat binding vector, which is
;;      partitioned into [symbol init-expr] pairs.
;;   2. Each pair is interned with `intern`, using `eval` on the init expression.
;;      This includes the intermediate (generated) symbols of the expansion, so
;;      later init expressions can resolve the earlier ones as vars.
;;   3. A `let` form binding the first pair and containing the interned vars as
;;      its body is eval'ed, so the overall result is the last interned var.
;;
;; NOTE(review): the first pair's init expression (i.e. `value`) appears to be
;; evaluated twice — once in step 2 and once more in the final `let` of step 3 —
;; so side effects or expensive computations in `value` would run twice. Confirm.
(defmacro with-destructuring-def [names value]
    `(let [binds# (destructure '~[names value])
           part# (partition 2 binds#)]
         (eval (list* 'let (vec (first part#))
                (mapv #(intern *ns* (first %) (eval (last %))) part#)))))
           
;; Smoke test of the macro: defines the vars `foo` and `bar` from a two-element list.
(with-destructuring-def [foo bar] (list "foo" "bar"))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/bar

In [10]:
;; Define the top-level vars `train` and `tst` from a split of enhanced-titanic
;; whose training part is the first 346 rows.
(with-destructuring-def [train tst] (train-test-split 346 enhanced-titanic))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/tst

In [11]:
;; Build a metamorph pipeline with ml/pipeline from these steps, in order:
;;   - mm/select-columns on `cols`
;;   - mm/add-column, applied to the argument vector `add-column`
;;     (callers in this file pass [column-name fn], e.g. ["Survived" y-n])
;;   - mm/categorical->number on the same `cols`
;;   - mm/set-inference-target on `target`
;;   - a literal {:metamorph/id :model} map followed by mm/model configured with
;;     {:model-type model}
;;     NOTE(review): presumably the map assigns the id :model to the model step — confirm.
;;   - ds/dataset lifted into a pipeline step with ml/lift
(defn ->pipe-fn [cols add-column target model]
    (ml/pipeline
     (mm/select-columns cols)
     (apply mm/add-column add-column)
     (mm/categorical->number cols)
     (mm/set-inference-target target)
     {:metamorph/id :model}
     (mm/model {:model-type model})
     (ml/lift ds/dataset)))

(defn y-n
  "Map the \"Survived\" column of `ds` to labels: 1 or \"1\" becomes \"yes\",
  0 or \"0\" becomes \"no\", and false or nil becomes the empty string.

  Any other value raises, because the `case` has no default clause."
  [ds]
  (let [label (fn [v]
                (case v
                  ("1" 1)     "yes"
                  ("0" 0)     "no"
                  (false nil) ""))]
    (map label (get ds "Survived"))))

;; Call `pipe` as a one-argument function on `data` and return its result.
(defn apply-pipe [pipe data]
    (pipe data))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/apply-pipe

In [34]:
(defn accuracy
  "Fit `pipe` on `train`, run it in :transform mode on `tst`, and return the
  fraction (as a float) of rows whose predicted \"Survived\" value equals the
  actual one after coercing both to double.

  `pipe` is called as a function of a metamorph context map. The denominator is
  the number of actual labels in `tst`; an empty test set yields 0.0.

  Fix: the previous continuation-passing loop computed the denominator from the
  destructured *rest* of the labels, i.e. N-1, so results were too large (and
  could exceed 1.0)."
  [pipe train tst]
  (let [fitted (pipe {:metamorph/data train :metamorph/mode :fit})
        tested (pipe (assoc fitted :metamorph/data tst :metamorph/mode :transform))
        ;; As before, stop at the first nil/false prediction.
        preds  (take-while identity (get-in tested [:metamorph/data "Survived"]))
        survs  (get tst "Survived")
        total  (count survs)
        hits   (count (filter true?
                              (map (fn [pred surv]
                                     (= (double pred) (double surv)))
                                   preds
                                   survs)))]
    (if (zero? total)
      (float 0)
      (float (/ hits total)))))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/accuracy

In [13]:
;; (require '[smile.regression :refer [cart]])

null

In [14]:
;; Convert a sequence of maps into a smile.data.DataFrame.
;;
;; The column types are taken from the classes of the values of ONE randomly
;; chosen row (rand-nth), so the resulting schema can vary between calls if rows
;; hold values of different classes.
;; NOTE(review): column names come from the distinct keys of all maps while the
;; row data comes from `vals` of each map; this presumably assumes every map has
;; the same keys in the same iteration order — confirm.
(defn ->DataFrame [seq-of-maps]
    (let [;; Build a StructType from a collection of [field-name dtype] pairs.
          make-struct  (fn [IN]
                        (StructType.
                         (java.util.ArrayList.
                          (mapv
                           (fn [[field-name dtype]]
                            (StructField. field-name
                             (DataType/of dtype)))
                           IN))))
          ;; Turn each row (a collection of values) into a Tuple with the given
          ;; structure and collect the tuples in an ArrayList.
          make-tuples  (fn [structure coll]
                        (java.util.ArrayList.
                         (map
                          (fn [x]
                           (smile.data.Tuple/of (into-array java.lang.Object x)
                            structure))
                          coll)))
          apply-conv   (fn [make-struct make-tuples]
                        (let [ks ((comp distinct mapcat) keys seq-of-maps)
                              ;; `dtype` is not used anywhere below.
                              dtype #(DataType/infer (str \" % \"))
                              random-row (rand-nth seq-of-maps)
                              rand-values (vals random-row)
                              types (map class rand-values)
                              values (map vals seq-of-maps)
                              ;; Tuples are first built against a structure with
                              ;; generated (gensym) field names ...
                              row-fields (map (fn [_] (str (gensym))) rand-values)
                              pairs (map vector row-fields types)
                              struc (make-struct pairs)
                              tuples (make-tuples struc values)
                              ;; ... and the DataFrame is then created with a second
                              ;; structure that carries the stringified map keys.
                              col-pairs (map (fn [k dtype] [(str k) dtype]) ks types)
                              col-struc (make-struct col-pairs)]
                          (DataFrame/of tuples col-struc)))]   
      (apply-conv make-struct make-tuples)))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/->DataFrame

In [35]:
;; (require '[clojure.set :refer [map-invert]])

null

In [16]:
;;(letfn [(train-cart [ivariables dvariable model-data]
  ;;       (assoc model-data :metamorph/id
    ;;      (cart
      ;     (Formula/of ((comp str first keys) dvariable)
       ;     (let [ks (keys ivariables)]
        ;      (into-array String ks)))
         ;  (let [to-seq (fn [ds]
          ;                (let [ks (keys ds)
           ;                     vs (vals ds)]
            ;                (apply map
             ;                (fn [& args] (zipmap ks args))
              ;               vs)))]
               ;(->DataFrame
               ; (to-seq (merge ivariables dvariable))))
          ;45
          ;21
          ;67)))
        ;; (predict-cart [ds model-data mode]
           ;           (let [model (:metamorph/id model-data)
            ;                ks (.keySet ds)
             ;               rows (.columns ds)
              ;              data (map seq rows)
               ;             SEQ (apply map
                ;                 (fn [& args] (zipmap ks args))
                 ;                data)
                  ;          dframed (->DataFrame SEQ)
                   ;         predicted (.predict model dframed)
                    ;        cat-map (:target-categorical-maps mode)
                     ;       target (first (:target-columns mode))
                      ;      labels (get-in cat-map [target :lookup-table])]
                        ;  (loop [inverted (map-invert labels)
                         ;        [x & xs] predicted
                          ;       ret (let [ks (keys labels)
                           ;                size (count ks)]
                            ;             (zipmap (concat ks [target])
                             ;                    (repeat (inc size) [])))]
                              ;(if (not x)
                               ;   ret
                                ;  (recur inverted
                                 ;        xs
                                  ;       (let [r (Math/round x)
                                   ;            rema (- 1 x)]
                                    ;         (merge-with conj
                                     ;                    ret
                                      ;                   (zipmap (keys ret)
                                       ;                          [rema x r]))))))))]
    
  ;  (ml/define-model! :cart1
   ;                   train-cart
    ;                  predict-cart
     ;                 {}))

null

In [17]:
(defn mean
  "Arithmetic mean of the numbers in `coll`; returns 0 for an empty collection.
  With integer inputs the result may be a ratio."
  [coll]
  (let [n (count coll)]
    (if (pos? n)
      (/ (reduce + coll) n)
      0)))

(defn mse
  "Mean squared error between `real` and `preds`: the sum of squared pairwise
  differences divided by the number of predictions. Returns 0 when `preds` is
  empty. With integer inputs the result may be a ratio.

  Fix: the previous version averaged absolute differences (mean absolute
  error) although the function is named, and reported as, MSE. For 0/1 labels
  both definitions coincide, so existing results on such data are unchanged."
  [real preds]
  (let [N (count preds)
        errors (map (fn [x y]
                      (let [d (- x y)]
                        ;; plain multiplication keeps exact integer/ratio arithmetic
                        (* d d)))
                    real
                    preds)]
    (if (zero? N)
      0
      (/ (reduce + errors) N))))

(defn rmse
  "Root mean squared error between `real` and `preds`: the square root of the
  mean (via `mean`) of the squared pairwise differences. Pairs are formed up to
  the shorter of the two sequences.

  The unused `N (count preds)` binding was removed; it forced a full
  realization of `preds` without contributing to the result."
  [real preds]
  (let [errors (map (fn [x y]
                      (Math/pow (- x y) 2))
                    real
                    preds)]
    (Math/sqrt (mean errors))))

(defn squared-differences
  "Lazy sequence of (x - mean-value)^2, as doubles, for every x in `data`."
  [data mean-value]
  (for [x data]
    (Math/pow (- x mean-value) 2)))

(defn total-sum-of-squares
  "Sum of the squared deviations of `actual` from its own mean."
  [actual]
  (->> (mean actual)
       (squared-differences actual)
       (reduce +)))

(defn sum-of-squared-residuals
  "Sum of (a - p)^2 over the paired elements of `actual` and `predicted`.
  Pairing stops at the shorter sequence; two empty inputs give 0."
  [actual predicted]
  (let [squared-residual (fn [a p] (Math/pow (- a p) 2))
        residuals (map squared-residual actual predicted)]
    (reduce + residuals)))

(defn r-squared
  "Coefficient of determination: 1 - SSR/TSS, where TSS comes from
  `total-sum-of-squares` on `actual` and SSR from `sum-of-squared-residuals`."
  [actual predicted]
  (let [baseline    (total-sum-of-squares actual)
        residual    (sum-of-squared-residuals actual predicted)
        unexplained (/ residual baseline)]
    (- 1 unexplained)))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/r-squared

In [18]:
;; Build an enlarged dataset from `data`: for every [k values] entry, the first
;; `seed` items of (cycle values) are added to `values`, and the resulting
;; entries are turned into a dataset with ds/dataset.
;; Despite its name, `seed` is used as an item count (argument to `take`), not
;; as a random seed.
;; NOTE(review): `(apply merge values vs)` is used as a repeated `conj` onto
;; `values`; this presumably relies on the column type supporting `conj` — confirm.
(defn mock [data seed]
    (ds/dataset
     (into {}
           (mapv
            (fn [[k values]]
                (let [vs (take seed (cycle values))
                      shuf (apply merge values vs)]
                    [k shuf]))
            data))))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/mock

In [19]:
(defn return-thunk
  "Return a zero-argument function that, on every call, splits `data` at `n`
  with `train-test-split`, fits `model` on the training part, enlarges the test
  part with `(mock test validate-n)`, runs `model` on it in :transform mode, and
  applies `metric` to the actual and predicted \"Survived\" columns."
  [model data n metric validate-n]
  (fn []
    (let [[train-part test-part] (train-test-split n data)
          fitted-ctx (model {:metamorph/data train-part
                             :metamorph/mode :fit})
          validation (mock test-part validate-n)
          overrides  {:metamorph/data validation
                      :metamorph/mode :transform}
          scored-ctx (model (merge fitted-ctx overrides))
          actual     (get validation "Survived")
          predicted  (get-in scored-ctx [:metamorph/data "Survived"])]
      (metric actual predicted))))

(defn multiple-thunks
  "Return a function of a collection of metrics. Each call shuffles `data` once
  and yields one `return-thunk` per metric, all sharing that shuffled order."
  [model data n validate-n]
  (fn [metrics]
    (let [shuffled (shuffle data)]
      (for [metric metrics]
        (return-thunk model shuffled n metric validate-n)))))

(defn finify
  "Return a vector with the items of `coll` up to, but not including, the first
  item that has already been seen. This lets a possibly infinite sequence be
  cut off at its first repetition.

  Membership is tested by calling the set of seen items as a function, so nil
  and false are never recognized as repeats."
  [coll]
  (loop [pending (seq coll)
         seen    #{}
         kept    []]
    (if (nil? pending)
      kept
      (let [item (first pending)]
        (if (seen item)
          kept
          ;; only advance the sequence after deciding to keep the item
          (recur (next pending) (conj seen item) (conj kept item)))))))

(defn show-perf
  "Transpose per-metric result sequences into per-run maps.

  `coll` holds one sequence of values per metric, in the order
  :accuracy, :mse, :rmse, :r-squared. The result is a lazy sequence of maps,
  one per position, each keyed by those metric names. The length of the result
  is that of the shortest input sequence.

  Changes: removed the unused `group`/`MAX` computations, and an empty `coll`
  now yields an empty sequence instead of the transducer that
  `(apply map f [])` would return."
  [coll]
  (let [ks [:accuracy :mse :rmse :r-squared]]
    (if (empty? coll)
      ()
      (apply map
             (fn [& args] (zipmap ks args))
             coll))))

(defn new-performance
  "Evaluate `model` repeatedly and report four metrics per run.

  For each metric (accuracy, mse, rmse, r-squared, each coerced to double) a
  thunk is built through `multiple-thunks`; every thunk is called repeatedly and
  its results are cut at the first repeated value by `finify`. `show-perf`
  then combines the four result vectors into per-run maps."
  [model n data validate-n]
  (let [hit-rate    (fn [real pred]
                      (let [total (count real)
                            flags (map (fn [r i]
                                         (if (= (double r) (double i)) 1 0))
                                       real
                                       pred)
                            hits  (count (filter #{1} flags))]
                        (double (/ hits total))))
        metrics     [hit-rate
                     (comp double mse)
                     (comp double rmse)
                     (comp double r-squared)]
        make-thunks (multiple-thunks model data n validate-n)]
    (->> (make-thunks metrics)
         (map repeatedly)
         (map finify)
         show-perf)))

(defn out-model
  "Partially apply `->pipe-fn`: return a function of a model type that builds
  the pipeline from `params` plus that model.

  When fewer than three params are given, the inference target is taken to be
  the first element of the last param and is inserted before the model."
  [& params]
  (let [derive-target? (< (count params) 3)]
    (fn [model]
      (let [trailing (if derive-target?
                       [(first (last params)) model]
                       [model])]
        (apply ->pipe-fn (concat params trailing))))))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/out-model

In [20]:
;; Pipeline factory still awaiting a model type: it selects the listed columns and
;; passes ["Survived" y-n] as the add-column arguments. Only two params are given,
;; so out-model uses "Survived" (first of the last param) as the inference target.
(def awaiting-model (out-model ["Survived" "Pclass" "Sex" "Age" "Embarked"] ["Survived" y-n]))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/awaiting-model

In [21]:
;; Performance of the KNN classifier: split at row 400, validate-n of 3000.
(let [KNN (awaiting-model :smile.classification/knn)]
    (new-performance KNN 400 enhanced-titanic 3000))

In [22]:
;; Performance of the random-forest classifier: split at row 400, validate-n of 3000.
(let [rand-forest (awaiting-model :smile.classification/random-forest)]
    (new-performance rand-forest 400 enhanced-titanic 3000))

In [23]:
;; Performance of the random-forest *regression* model used as a classifier:
;; in :fit mode the pipeline is called unchanged; in any other mode the values of
;; the "Survived" column of the result are rounded with Math/round so they can be
;; compared with the actual labels.
(let [randF (awaiting-model :smile.regression/random-forest)]
    (new-performance (fn [{:metamorph/keys [data mode] :as ctx}] 
                     (if (= mode :fit) 
                         (randF ctx) 
                         (update-in (randF ctx) [:metamorph/data "Survived"] #(map (fn [_] (Math/round _)) %))))
                 400
                 enhanced-titanic
                 3000))

In [24]:
;; Performance of the decision-tree classifier: split at row 400, validate-n of 3000.
(let [decision-tree (awaiting-model :smile.classification/decision-tree)]
    (new-performance decision-tree
                 400
                 enhanced-titanic
                 3000))

In [25]:
;; Performance of logistic regression: split at row 400, validate-n of 3000.
(let [logit (awaiting-model :smile.classification/logistic-regression)]
    (new-performance logit 400 enhanced-titanic 3000))

In [26]:
;(let [reg-tree (awaiting-model :cart1)]
 ;   (new-performance reg-tree 400 enhanced-titanic 3000))

null

In [28]:
(defn get-false-neg
  "Tally a binary confusion matrix from actual (`real`) and predicted (`preds`)
  values, both coerced to double. Returns a map with the counts
  :true-pos, :false-pos, :true-neg and :false-neg.

  A zero prediction counts as negative: true-neg when it equals the actual
  value, false-neg otherwise. A positive prediction equal to the actual value
  is a true-pos; every remaining case is a false-pos. Pairs are formed up to
  the shorter of the two inputs."
  [real preds]
  (let [classify (fn [actual predicted]
                   (cond
                     (and (zero? predicted) (= predicted actual)) :true-neg
                     (zero? predicted)                            :false-neg
                     (and (pos? predicted) (= predicted actual))  :true-pos
                     :else                                        :false-pos))
        pairs    (map vector (map double real) (map double preds))]
    (reduce (fn [tally [actual predicted]]
              (update tally (classify actual predicted) inc))
            {:true-pos 0 :false-pos 0 :true-neg 0 :false-neg 0}
            pairs)))

;; Color ramp used for the confusion-matrix cells and legend, ordered from the
;; first (lowest) bucket to the last; the final two entries are the same color.
(def colors ["#F5F904"
             "#F9E404"
             "#F9BB04"
             "#F99F04"
             "#F97E04"
             "#F95104"
             "#F94004"
             "#F92804"
             "#F91C04"
             "#F90404"
             "#F90404"])

(defn decide-color
  "Build a map from bucket thresholds to colors for the given counts.

  Each value is rounded up to the next number made of a leading digit followed
  by zeros (e.g. 37 -> 40, 120 -> 200). The bucket step is the gap between the
  two smallest *distinct* rounded values (or the smallest value itself when
  all are equal). Thresholds run from 0 in increments of the step up to the
  largest rounded value, plus two extra steps, and are paired in order with
  `colors` (so at most `(count colors)` buckets are returned).

  Fixes: when the two smallest rounded values were equal the step was 0 and
  the threshold sequence never terminated; an empty `values` threw a
  NullPointerException and now returns an empty map. The unused `size` local
  was removed."
  [values]
  (if (empty? values)
    {}
    (let [helper (fn [x]
                   (let [read-first (comp read-string str first)
                         f (read-first x)
                         rst (rest x)]
                     (str (inc f)
                          (apply str (repeat (count rst) 0)))))
          round (comp read-string helper str)
          all (sort (map round values))
          ;; distinct guarantees x2 > x1 whenever x2 exists, so the step is never 0
          [x1 x2] (distinct all)
          difference (if x2 (- x2 x1) x1)
          sorted (iterate #(+ difference %) 0)
          ret-seq (take-while #(<= % (last all)) sorted)]
      (zipmap (concat ret-seq [(+ difference (last ret-seq)) (+ (* 2 difference) (last ret-seq))])
              (take (+ 2 (count ret-seq)) colors)))))

(defn grab-colors
  "Pick the color for `label` from `colors`, a map of threshold -> color.

  Thresholds are visited in ascending order as consecutive pairs [lower upper];
  the color of the first pair with lower <= label <= upper is the color stored
  at `lower`. If no pair matches, the color of the highest threshold is
  returned. An empty map yields nil."
  [colors label]
  (let [ordered (seq (into (sorted-map) colors))]
    (loop [[lower shade] (first ordered)
           pending       (next ordered)]
      (cond
        (nil? pending)                     shade
        (<= lower label (ffirst pending))  shade
        :else (recur (first pending) (next pending))))))

(defn produce-coords
  "Infinite lazy sequence of [x y] positions for a two-column grid starting at
  `start`: from the left column (x equal to the starting x) move 200 to the
  right; otherwise return to the starting x and move 100 down."
  [[x-orig y :as start]]
  (letfn [(advance [[col row]]
            (if (not= col x-orig)
              [x-orig (+ row 100)]
              [(+ col 200) row]))]
    (iterate advance start)))

;; Render one legend entry as an SVG fragment: a 10x100 <rect> stroked and filled
;; with `color`, followed by a <text> element containing `text`.
;; The rect is placed at (x + 300, y - 20) and the text at (x + 340, y + 20).
(defn gen-square-leg [x y color text]
    (format "<rect x=\"%s\" y=\"%s\" stroke=\"%s\" fill=\"%s\" stroke-width=\"1px\"
             width=\"10\" height=\"100\" /><text stroke=\"white\" x=\"%s\" y=\"%s\" width=\"10\" height=\"10\"
             alignment-baseline=\"middle\" text-anchor=\"middle\">%s</text>"
            (+ x 300)
            (- y 20)
            color
            color
            (+ 340 x)
            (+ y 20)
            text))
                  
(defn make-legend
  "Render the legend for `colors` (threshold -> color entries) as one SVG string.

  Entries are emitted in reverse order of `colors`, one `gen-square-leg`
  fragment each, starting at [x y] and moving 50 down per entry."
  [colors [x y]]
  (let [step-down (fn [[col row]] [col (+ 50 row)])
        anchors   (take (count colors) (iterate step-down [x y]))
        fragments (map (fn [[label shade] [col row]]
                         (gen-square-leg col row shade label))
                       (reverse colors)
                       anchors)]
    (apply str fragments)))
          

(defn plot-confusion
  "Render a confusion matrix as an HTML string: an <h3> title followed by an
  <svg> holding four 200x100 cells and a color legend.

  The argument is a map with the counts :true-pos, :false-pos, :true-neg and
  :false-neg. Cells are laid out by `produce-coords` starting at [200 200], in
  the order true-pos, false-pos, true-neg, false-neg; each cell is filled with
  the color chosen by `grab-colors` from the buckets of `decide-color`, and
  shows its count as text.

  Fixes: the <text> attributes were emitted as `alignment-baseline\"middle\"`
  and `text-anchor\"middle\"` (missing `=`), which is malformed markup; the
  stray `:as values` inside the `:keys` vector was removed."
  [{:keys [true-pos false-pos true-neg false-neg]}]
  (let [counts  [true-pos false-pos true-neg false-neg]
        squares (take 4 (produce-coords [200 200]))
        colors  (into (sorted-map) (decide-color counts))
        cell    (fn [[x y] label]
                  (str "<rect x="
                       x
                       " y="
                       y
                       " width=\"200\" height=\"100\" stroke=\"black\""
                       " stroke-width=\"1px\" fill=\"" (grab-colors colors label) "\"/>
                                      <text x="
                       (+ 80 x)
                       " y="
                       (+ 60 y)
                       " alignment-baseline=\"middle\" text-anchor=\"middle\">" label "</text>"))
        plotted (map cell squares counts)]
    (format (str "<h3 style=\"position: relative; transform: translate(320px,100px);\">
                  Confusion Matrix</h3>" "<svg width=\"1000\" height=\"800\">%s</svg>")
            ;; the legend is anchored at the second cell's position
            (str (apply str plotted) (make-legend colors (second squares))))))

(defn accurate
  "Fraction of positions at which `real` and `preds` are equal after coercion
  to double, relative to the number of elements in `real`. With integer counts
  the result may be a ratio."
  [real preds]
  (let [matches (filter true?
                        (map (fn [r p] (= (double r) (double p)))
                             real
                             preds))]
    (/ (count matches)
       (count real))))

;; Concatenate all arguments with `str` and wrap the result with render-html so
;; the notebook shows it as HTML.
(def display (comp render-html str))

(defn confusion-matrix
  "Train and evaluate `model` once and display the outcome as HTML.

  `data` is shuffled and split at `n`; the model is fitted on the training
  part and run in :transform mode on the test part enlarged by
  `(mock test validate-n)`. The rendered confusion matrix and an accuracy line
  are passed to `display`."
  [model n data validate-n]
  (let [[train-part test-part] (train-test-split n (shuffle data))
        fitted-ctx    (model {:metamorph/data train-part
                              :metamorph/mode :fit})
        validation    (mock test-part validate-n)
        overrides     {:metamorph/data validation
                       :metamorph/mode :transform}
        scored-ctx    (model (merge fitted-ctx overrides))
        actual        (get validation "Survived")
        predicted     (get-in scored-ctx [:metamorph/data "Survived"])
        matrix-html   (plot-confusion (get-false-neg actual predicted))
        accuracy-html (format "<p>Accuracy: %s</p>"
                              (float (accurate actual predicted)))]
    (display matrix-html accuracy-html)))

#'beaker_clojure_shell_00d159ab-acfd-4247-b55e-19029c52e556/confusion-matrix

In [29]:
;; Confusion matrix for the random-forest *regression* model: outside :fit mode
;; the values of the "Survived" column of the result are rounded with Math/round
;; before being compared with the actual labels.
(let [randF (awaiting-model :smile.regression/random-forest)]
    (confusion-matrix (fn [{:metamorph/keys [data mode] :as ctx}] 
                     (if (= mode :fit) 
                         (randF ctx) 
                         (update-in (randF ctx) [:metamorph/data "Survived"] #(map (fn [_] (Math/round _)) %))))
                 400
                 enhanced-titanic
                 3000))

In [30]:
;; Confusion matrix for the KNN classifier: split at row 400, validate-n of 3000.
(let [KNN (awaiting-model :smile.classification/knn)]
    (confusion-matrix KNN 400 enhanced-titanic 3000))

In [31]:
;; Confusion matrix for the random-forest classifier: split at row 400, validate-n of 3000.
(let [rand-forest (awaiting-model :smile.classification/random-forest)]
    (confusion-matrix rand-forest 400 enhanced-titanic 3000))

In [32]:
;; Confusion matrix for the decision-tree classifier: split at row 400, validate-n of 3000.
(let [decision-tree (awaiting-model :smile.classification/decision-tree)]
    (confusion-matrix decision-tree
                 400
                 enhanced-titanic
                 3000))

In [33]:
;; Confusion matrix for logistic regression: split at row 400, validate-n of 3000.
(let [logit (awaiting-model :smile.classification/logistic-regression)]
    (confusion-matrix logit 400 enhanced-titanic 3000))