# Práctica 1 - CART Splitting
El objetivo de esta práctica consiste en implemetar el algoritmo de CART Splitting, el cuál se basa en iterar por las variables del conjunto de entrada de forma que se minimice la siguiente expresión:

![Figura 1. Expresión a minimizar](./img/expresion.png)

## Algoritmo
El algoritmo para minimizar esta suma es el siguiente:
1. Elegimos un valor de X1 que no sea ni el primero ni el último. Ese valor determinará las regiones (s).
2. Aplicamos la suma con ese valor de X1.
3. Repetimos para todos los valores de X1.
4. El valor que nos dé la suma mínima será el que elegiremos.

## Código
Primero, cargamos el dataframe que vamos a usar para crear el árbol

In [6]:
library(tidyverse)
df <- read.csv("./datasets/datos.csv")
creditcard_2023 <- read.csv("./datasets/creditcard_2023.csv")
creditcard_test <- read.csv("./datasets/creditcard_test.csv")
creditcard_test <- creditcard_test[, -1]
cd_size <- dim(creditcard_2023)[1] / 4
creditcard_2023 <- creditcard_2023[, -1]
creditcard_2023
df

Posteriormente, creamos la función que implementa el algoritmo en cuestión.

In [3]:
cart_splitting <- function(data, class_variable,
                           verbose = FALSE) {
  data <- data.frame(lapply(data, sort))
  data_size <- dim(data)[1]
  min_global_sum <- Inf
  varnames <- names(data)[names(data) != class_variable]
  best_var <- NA
  best_s <- Inf
  for (var in varnames) {
    min_sum <- Inf
    for (i in 2:(data_size - 1)) {
      aux_s <- data[[var]][i]
      y_vector_r1 <- data[[class_variable]][data[[var]] < aux_s]
      y_vector_r2 <- data[[class_variable]][data[[var]] >= aux_s]
      aux_sum <- sum((y_vector_r1 - mean(y_vector_r1))^2)
      + sum((y_vector_r2 - mean(y_vector_r2))^2)

      if (aux_sum <= min_sum) {
        min_sum <- aux_sum
        s <- aux_s
      }
      if (verbose) {
        print(sprintf("Variable: %f; Valor: %f; Suma mínima: %f", aux_s,
                      data[[class_variable]][data[[var]] == aux_s],
                      aux_sum))
      }
    }
    if (min_sum < min_global_sum) {
      min_global_sum <- min_sum
      best_s <- s
      best_var <- var
    }
    if (min_global_sum == 0) {
      break
    }
  }
  r1 <- data[data[[best_var]] < best_s, ]
  r2 <- data[data[[best_var]] >= best_s, ]
  return(list(R1 = r1, R2 = r2, value = best_s,
              best_var = best_var, min_sum = min_global_sum))
}

Una vez encontrado el valor que garantiza la suma mínima, obtenemos de la lista resultado las regiones, la suma mínima y el valor de *s*.

In [4]:
split_df <- cart_splitting(df, "y")
split_creditcard_2023 <- cart_splitting(creditcard_2023, "Class")
split_df
split_creditcard_2023

Unnamed: 0_level_0,x1,x2,y
Unnamed: 0_level_1,<dbl>,<dbl>,<int>
1,1.728571,0.4766834,0
2,2.771245,1.1697614,0
3,2.999209,1.7847839,0
4,3.67832,2.2090142,0
5,3.961043,2.6199503,0

Unnamed: 0_level_0,x1,x2,y
Unnamed: 0_level_1,<dbl>,<dbl>,<int>
6,6.642287,2.812814,1
7,7.444542,3.162954,1
8,7.497546,3.234551,1
9,9.002203,3.319984,1
10,10.124939,3.339047,1


Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,-2.637662,-25.480046,-2.518308,-4.468314,-4.298320,-18.642598,-3.038728,-6.595785,-2.170915,-2.041556,⋯,-10.699713,-6.262872,-19.297443,-3.938205,-10.230967,-3.733169,-5.536809,-9.015933,50.41,0
2,-2.609364,-20.936274,-2.493666,-4.148959,-3.461897,-12.900643,-2.626328,-4.074350,-2.068588,-1.980392,⋯,-5.956417,-6.238628,-14.452018,-3.888742,-6.810179,-3.621793,-5.273815,-8.923980,50.53,0
3,-2.580623,-15.016940,-2.468736,-3.996094,-3.301638,-12.015046,-2.480375,-4.067282,-1.942540,-1.940162,⋯,-5.775451,-6.220250,-11.690438,-3.674306,-3.961696,-3.582814,-4.850184,-8.758956,50.70,0
4,-2.551425,-14.821249,-2.443510,-3.920552,-3.254039,-9.726991,-2.452595,-4.049661,-1.935719,-1.929278,⋯,-5.696745,-6.192213,-9.277415,-3.651065,-3.875061,-3.448838,-4.677707,-7.623396,53.43,0
5,-2.543593,-14.605105,-2.417978,-3.868896,-3.206065,-7.780302,-2.424652,-4.018033,-1.928932,-1.920683,⋯,-5.670307,-6.173698,-8.428863,-3.619889,-3.745031,-3.430118,-4.641842,-7.262726,54.04,0
6,-2.521749,-11.294675,-2.393498,-3.823482,-3.157705,-6.907847,-2.396545,-4.007709,-1.927792,-1.918385,⋯,-5.649456,-6.151633,-6.799407,-3.516737,-3.523117,-3.410041,-4.618920,-6.837813,55.39,0
7,-2.513872,-11.123808,-2.361764,-3.809880,-3.108947,-5.805149,-2.368268,-3.984524,-1.922172,-1.913524,⋯,-5.577835,-4.363603,-6.620316,-3.470262,-3.386582,-3.398027,-4.461079,-6.830490,57.48,0
8,-2.490300,-9.442053,-2.343357,-3.796197,-3.080113,-4.902848,-2.351729,-3.959842,-1.915436,-1.907480,⋯,-5.572485,-3.484114,-5.895783,-3.372470,-3.364507,-3.361095,-4.399786,-6.826173,59.19,0
9,-2.466559,-8.544918,-2.316555,-3.662857,-2.968957,-4.816992,-2.319007,-3.943795,-1.911040,-1.896561,⋯,-5.568043,-3.367893,-5.775965,-3.335895,-3.297102,-3.302606,-4.308811,-6.330663,60.86,0
10,-2.435370,-8.282162,-2.289398,-3.563524,-2.960017,-4.753573,-2.277167,-3.923012,-1.908719,-1.888460,⋯,-3.215811,-3.308682,-4.957436,-3.287210,-3.116552,-3.235496,-4.223734,-6.292595,61.10,0

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
19917,1.254621,1.214578,3.529216,1.124065,2.687874,2.739893,2.032587,1.225636,5.068460,5.310404,⋯,1.527863,1.395286,2.335420,2.360359,2.200561,2.480461,2.296380,1.857360,23933.62,1
19918,1.255629,1.214728,3.533889,1.125636,2.730450,2.749717,2.034673,1.232213,5.076955,5.330836,⋯,1.551733,1.395286,2.349898,2.361216,2.225705,2.481218,2.336310,1.878270,23933.70,1
19919,1.255651,1.217614,3.534544,1.129042,2.734133,2.749931,2.044989,1.233206,5.095439,5.342060,⋯,1.555205,1.396989,2.366374,2.361336,2.236987,2.482220,2.373107,1.881083,23933.77,1
19920,1.256539,1.220464,3.536421,1.131062,2.745173,2.752106,2.057710,1.238864,5.096248,5.362743,⋯,1.555259,1.398995,2.381883,2.361713,2.242818,2.483833,2.376459,1.909247,23935.35,1
19921,1.256888,1.226622,3.542548,1.136884,2.760307,2.752340,2.080448,1.257855,5.113236,5.382188,⋯,1.568435,1.399913,2.424644,2.362093,2.250436,2.494613,2.388152,1.939783,23935.86,1
19922,1.256934,1.236953,3.545182,1.142646,2.774251,2.754327,2.097063,1.262403,5.123627,5.420375,⋯,1.592931,1.402195,2.439909,2.363940,2.253393,2.499828,2.416843,1.940592,23936.54,1
19923,1.258013,1.244313,3.546323,1.146971,2.787657,2.754338,2.098395,1.265627,5.140240,5.425197,⋯,1.597404,1.402312,2.447737,2.365106,2.253835,2.501485,2.451806,1.944958,23937.29,1
19924,1.258405,1.248035,3.553549,1.148364,2.796341,2.755341,2.107765,1.289284,5.143859,5.430741,⋯,1.606015,1.402710,2.451128,2.365212,2.258094,2.503053,2.457481,1.947801,23937.98,1
19925,1.258688,1.261596,3.553775,1.151326,2.826177,2.756121,2.110583,1.299837,5.167549,5.470249,⋯,1.633988,1.407928,2.458566,2.365354,2.261052,2.516822,2.491317,1.953048,23939.92,1
19926,1.262127,1.263765,3.563971,1.154047,2.829809,2.756150,2.119633,1.315378,5.187520,5.517208,⋯,1.634839,1.411270,2.464390,2.365583,2.274761,2.523791,2.498398,1.956742,23941.13,1


Ahora, podemos probar el modelo creado

In [30]:
creditcard_test <- data.frame(lapply(creditcard_test, sort))
split_creditcard2023_test_r1 <- creditcard_test[creditcard_test$V1 <
                                                  split_creditcard_2023$value, ]
split_creditcard2023_test_r2 <- creditcard_test[creditcard_test$V1 >=
                                                  split_creditcard_2023$value, ]
split_creditcard_test <- list(R1 = split_creditcard2023_test_r1,
                              R2 = split_creditcard2023_test_r2)
split_creditcard_test

accuracy_r1 <- sum(dim(split_creditcard2023_test_r1[split_creditcard2023_test_r1
                                                    $Class == 0, ])[1])
accuracy_r2 <- sum(dim(split_creditcard2023_test_r2[split_creditcard2023_test_r2
                                                    $Class == 1, ])[1])
accuracy <- (accuracy_r1 / dim(split_creditcard2023_test_r1)[1] +
               accuracy_r2 / dim(split_creditcard2023_test_r2)[1]) / 2
accuracy * 100

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,-2.735211,-11.006236,-2.600419,-4.140699,-3.450680,-6.402244,-2.681276,-3.397355,-2.740952,-2.515992,⋯,-3.938117,-3.047244,-11.273871,-3.671693,-4.530496,-3.807622,-4.809814,-14.861503,51.44,0
2,-2.707827,-9.750012,-2.578182,-4.088214,-3.422336,-5.615938,-2.660446,-2.309111,-2.598234,-2.334690,⋯,-3.042176,-2.902875,-10.270414,-3.567598,-4.304435,-3.420628,-3.458518,-8.902624,54.87,0
3,-2.686628,-9.279405,-2.549988,-4.002641,-3.316655,-4.839882,-2.636296,-1.984652,-2.547855,-2.157019,⋯,-1.823877,-2.753699,-8.710230,-3.381105,-3.282465,-3.414544,-3.276941,-6.525335,61.93,0
4,-2.184222,-8.544413,-1.954368,-3.869528,-2.943699,-4.285727,-2.572037,-1.901134,-2.536363,-2.103428,⋯,-1.782453,-2.527554,-6.957472,-3.177406,-2.747233,-3.399889,-3.127901,-6.378206,74.03,0
5,-2.086423,-8.231273,-1.943180,-3.850495,-2.560038,-3.869002,-2.562091,-1.851613,-2.424235,-2.089028,⋯,-1.766336,-2.438652,-6.081819,-3.175618,-2.563958,-3.330823,-3.113267,-5.867298,74.72,0
6,-2.034729,-8.180905,-1.863725,-3.772298,-2.496764,-3.305728,-2.249872,-1.591153,-2.380712,-2.048877,⋯,-1.742665,-2.203543,-5.425204,-3.168827,-2.547873,-3.324770,-3.080165,-5.299102,83.92,0
7,-1.930032,-7.887634,-1.814920,-3.618630,-2.481960,-3.229557,-2.116285,-1.513143,-2.372762,-2.034626,⋯,-1.730462,-2.043034,-5.104587,-3.140737,-2.542601,-3.227324,-2.917630,-5.141293,87.11,0
8,-1.852223,-7.494327,-1.748613,-3.569770,-2.421195,-2.797838,-2.060551,-1.512404,-2.322493,-1.982970,⋯,-1.681567,-1.933262,-4.969868,-2.761435,-2.537909,-3.144424,-2.897917,-4.610703,88.05,0
9,-1.840303,-6.909646,-1.714206,-3.504095,-2.325498,-2.728906,-1.985076,-1.467021,-2.317736,-1.972634,⋯,-1.641707,-1.924906,-4.061471,-2.748895,-2.502113,-3.087600,-2.881941,-4.606365,98.19,0
10,-1.832318,-6.801722,-1.710238,-3.384719,-2.295824,-2.658252,-1.906186,-1.452551,-2.281456,-1.944978,⋯,-1.632975,-1.883169,-3.763214,-2.738655,-2.496671,-3.019603,-2.863150,-4.372090,107.76,0

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
4971,1.254809,1.004649,2.925638,1.110115,2.576339,2.684919,2.544238,1.03875,2.942562,2.787432,⋯,0.8770711,1.413925,2.971143,2.433147,2.150772,2.488847,1.472504,1.469367,23948.02,1
4972,1.255145,1.00933,2.93201,1.203061,2.589637,2.686097,2.57686,1.06623,3.25328,2.904545,⋯,0.8870777,1.414125,2.973631,2.434855,2.155393,2.49571,1.555173,1.472876,23948.76,1
4973,1.256191,1.020436,2.936693,1.216367,2.596623,2.688537,2.579378,1.115209,3.415679,2.955246,⋯,0.8889493,1.415441,2.990899,2.44,2.18885,2.513176,1.621776,1.695637,23950.21,1
4974,1.256511,1.06719,2.941906,1.292377,2.638269,2.698939,2.660889,1.11675,3.496005,2.975715,⋯,0.904225,1.419253,3.063108,2.446674,2.215502,2.51542,1.624976,1.723819,23950.65,1
4975,1.256793,1.099684,2.95611,1.297224,2.960196,2.699786,2.697115,1.121546,3.566979,2.99488,⋯,0.9197898,1.428884,3.119278,2.459457,2.21916,2.519194,1.657103,1.750536,23953.96,1
4976,1.257163,1.10207,2.964951,1.300179,2.979155,2.701358,2.731924,1.185442,3.607534,3.048559,⋯,0.9261468,1.428989,3.189444,2.470859,2.224454,2.523778,1.695514,1.820895,23960.44,1
4977,1.25721,1.129769,2.971337,1.30143,2.979326,2.714139,2.826622,1.208676,3.62108,3.122016,⋯,0.9430223,1.429566,3.216252,2.474099,2.227473,2.530806,1.765793,1.906782,23964.75,1
4978,1.257255,1.182575,2.973003,1.324944,3.124185,2.719659,2.858409,1.217233,3.834103,3.14273,⋯,0.9505875,1.435888,3.300032,2.48193,2.275675,2.541804,1.772452,1.938301,23970.25,1
4979,1.26247,1.184438,2.993895,1.377905,3.142669,2.743247,2.908922,1.236403,4.080252,3.194359,⋯,0.9615423,1.447941,3.553921,2.482866,2.281455,2.573711,1.869869,2.101354,23972.18,1
4980,1.266925,1.195508,3.028086,1.411185,3.224495,2.767663,2.9403,1.268758,4.309327,3.215848,⋯,0.9636948,1.458595,3.593284,2.48659,2.393503,2.594974,1.976113,2.231411,23984.18,1
