# SVM

### Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from joblib import dump, load

In [2]:
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [3]:
from preprocessing import reemplazarNulls,reemplazarCategoricas,reemplazarFechas,targetBooleano,regularizar

[###] All Done                                              


In [4]:
from utilities import score

### Codigo a correr

In [5]:
# Solver iteration cap handed to every SVC built in this notebook.
MAX_ITER = 1_000_000
# n_jobs value for the hyperparameter searches (-1 = use every available core).
MAX_JOBS = -1

**Run :** Entrenar Nuevamente o Cargar Entrenado

In [6]:
# One switch per model: True retrains it in its training cell, False loads the
# previously saved model from models/SVM/ instead.
runLinearSimple = False
runLinear = False
runPoly = False
runRadial = False

**Save :** Guardar Modelo (pisa anterior)

In [7]:
# One switch per model, listed in training order. A flag is only consulted when
# the matching run* flag retrains the model; saving overwrites the stored file.
saveLinearSimple = True
saveLinear = True
savePoly = True
saveRadial = True

### Dataset

In [8]:
# Training features, read straight from the CSV.
df_feat = pd.read_csv(
    "datasets/train_features.csv",
    low_memory=False,
)
# Training target, passed through the project helper targetBooleano.
df_targ = targetBooleano(
    pd.read_csv("datasets/train_target.csv")
)

### Preprocesamiento

In [9]:
# Clean df_feat in place with the project helpers from `preprocessing`
# (nulls, categorical columns, date columns); their exact transformations
# are not visible in this notebook.
reemplazarNulls(df_feat , inplace=True)
reemplazarCategoricas(df_feat , inplace=True)
reemplazarFechas(df_feat , inplace=True)
# Frame returned by regularizar with drop=0, taken before df_feat itself is
# regularised; it is the input of the svm_lineal_allfeat model further down.
# NOTE(review): presumably drop=0 keeps every feature — confirm in preprocessing.regularizar.
df_allfeat = regularizar(df_feat , drop=0)
# NOTE(review): all calls above and below mutate df_feat, so re-running this cell
# without reloading the dataset would apply the steps twice — verify that is safe.
# NOTE(review): the displayed result still contains an `id` column among the
# features — confirm it is meant to be a model input.
# Last expression of the cell: its return value is the table shown as output.
regularizar(df_feat , inplace=True)

Unnamed: 0,id,direccion_viento_tarde,direccion_viento_temprano,horas_de_sol,humedad_tarde,humedad_temprano,mm_lluvia_dia,nubosidad_tarde,presion_atmosferica_tarde,rafaga_viento_max_direccion,rafaga_viento_max_velocidad,velocidad_viendo_tarde
0,0.495844,0.736537,0.645996,-2.018030,1.277923,0.270523,7.914307,0.916539,1.269394,0.802215,0.589120,0.834079
1,0.183431,0.317434,-1.382279,-2.018030,-1.806334,-0.675093,0.417359,-1.288420,-2.406972,0.393094,1.544505,1.402051
2,-0.113001,-0.730326,-0.570969,-1.330311,1.326115,-1.147901,-0.281713,1.284033,-2.094694,-0.834272,1.250541,1.629239
3,-1.678190,1.574745,0.848823,-2.018030,1.952605,1.426276,0.320935,1.284033,0.020281,-1.447954,1.030067,1.402051
4,0.930508,-1.149430,-1.179451,-2.018030,-1.806334,-1.673244,-0.281713,-0.920927,-1.257220,-0.834272,0.736103,0.038918
...,...,...,...,...,...,...,...,...,...,...,...,...
102351,-0.264327,-0.101670,-0.570969,-0.298733,0.555051,-1.305504,-0.233501,0.549046,-0.490719,-0.834272,2.352908,2.083617
102352,-1.317418,0.946089,1.457306,-1.462565,0.651434,0.585729,0.345041,0.916539,0.318365,1.211337,0.589120,1.402051
102353,-1.594150,0.736537,1.660133,-2.018030,1.422498,-0.359888,-0.281713,0.916539,-0.064885,-1.447954,-0.366265,-1.892186
102354,0.293898,-1.149430,-0.773796,-1.779974,1.326115,-0.202285,-0.281713,1.284033,-2.052110,-0.629711,1.177050,0.606890


In [10]:
# Sanity check after preprocessing: column list, dtypes and non-null counts.
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102356 entries, 0 to 102355
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           102356 non-null  float64
 1   direccion_viento_tarde       102356 non-null  float64
 2   direccion_viento_temprano    102356 non-null  float64
 3   horas_de_sol                 102356 non-null  float64
 4   humedad_tarde                102356 non-null  float64
 5   humedad_temprano             102356 non-null  float64
 6   mm_lluvia_dia                102356 non-null  float64
 7   nubosidad_tarde              102356 non-null  float64
 8   presion_atmosferica_tarde    102356 non-null  float64
 9   rafaga_viento_max_direccion  102356 non-null  float64
 10  rafaga_viento_max_velocidad  102356 non-null  float64
 11  velocidad_viendo_tarde       102356 non-null  float64
dtypes: float64(12)
memory usage: 9.4 MB


## Entrenamiento

### Lineal

#### Simple

In [11]:
if not runLinearSimple:
    # Reuse the estimator persisted by an earlier run.
    svm_simple_lineal = load('models/SVM/svm_simple_lineal.sk')
else:
    # Baseline: linear-kernel SVC with a fixed C and no hyperparameter search.
    svm_simple_lineal = SVC(
        C=0.1,
        kernel='linear',
        shrinking=False,
        max_iter=MAX_ITER,
        verbose=1,
    )
    svm_simple_lineal.fit(df_feat, df_targ)

    # Saving replaces whatever model is already stored at this path.
    if saveLinearSimple:
        dump(svm_simple_lineal, 'models/SVM/svm_simple_lineal.sk')

In [12]:
%%time
# Predict on df_feat, the same frame the training cell fits on; %%time reports the cost.
linsim_pred = svm_simple_lineal.predict(df_feat)

CPU times: user 1min 20s, sys: 258 ms, total: 1min 20s
Wall time: 1min 20s


In [13]:
# `score` is the project helper from `utilities`; here it reports the accuracy
# of the baseline predictions against df_targ.
score(df_targ, linsim_pred)

ACCURACY: 0.8385927546992848


#### Optimizando Hiperparametros

In [14]:
# Search grid shared by the hyperparameter searches: mantissas 1, 4, 7 over the
# decades 1e-2, 1e-1 and 1e0.
# The values are built from integers and divided once, so every entry is the
# correctly rounded decimal. The previous np.arange(0.01, 0.09, 0.03) accumulated
# float error and yielded 0.06999999999999999 instead of 0.07.
mantissas = np.array([1, 4, 7])
base_arrange = mantissas / 100
hipr_arrange = np.concatenate([mantissas * decade for decade in (1, 10, 100)]) / 100
hipr_arrange

array([0.01, 0.04, 0.07, 0.1 , 0.4 , 0.7 , 1.  , 4.  , 7.  ])

In [15]:
if runLinear:
    # Linear-kernel SVC; C is the only hyperparameter being searched.
    svm_lineal = SVC(
        kernel='linear',
        max_iter=MAX_ITER, shrinking=False,
    )
    params = {'C': hipr_arrange}

    svm_lineal_grid = HalvingGridSearchCV(
        svm_lineal,
        params,
        scoring='accuracy',
        n_jobs=MAX_JOBS,
        return_train_score=True,
        # Fixed seed: successive halving evaluates candidates on subsamples of the
        # data, and without a seed those subsamples change on every retrain.
        random_state=0,
        verbose=2,
    ).fit(df_feat, df_targ)

    # Persist the whole search object (overwrites the previous file).
    if saveLinear:
        dump(svm_lineal_grid, 'models/SVM/svm_lineal_grid.sk')
else:
    # Reuse the search persisted by an earlier run.
    # NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
    svm_lineal_grid = load('models/SVM/svm_lineal_grid.sk')

In [16]:
# Cross-validated score and winning parameters of the linear search.
print(
    f"Best score: {svm_lineal_grid.best_score_}",
    f"Best params {svm_lineal_grid.best_params_}",
    sep="\n",
)

Best score: 0.8386942323663009
Best params {'C': 1.0}


In [17]:
# Keep only the winning estimator of the search for prediction and export.
svm_lineal = svm_lineal_grid.best_estimator_

In [18]:
%%time
# Predict on df_feat, the same frame the search is fitted on; %%time reports the cost.
lin_pred = svm_lineal.predict(df_feat)

CPU times: user 1min 19s, sys: 286 ms, total: 1min 20s
Wall time: 1min 20s


In [19]:
# Accuracy of the tuned linear model's predictions against df_targ.
score(df_targ, lin_pred)

ACCURACY: 0.8385927546992848


#### Otras Corridas

SVM lineal, C=10, sin eliminar features poco significativos ni limitar iteraciones, t ~= 10 hs \
El entrenamiento fue removido debido a que tarda mucho y ni siquiera se está optimizando el hiperparámetro

In [20]:
# Linear SVM trained outside this notebook (its training cell was removed);
# only the persisted model is loaded here.
# NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
svm_lineal_allfeat = load('models/SVM/svm_lineal_allfeat.sk')

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [21]:
# Evaluate the all-features model on df_allfeat (the drop=0 frame built during
# preprocessing) and report its accuracy against df_targ.
pred = svm_lineal_allfeat.predict(df_allfeat)
score(df_targ, pred)



ACCURACY: 0.8404294814177967


### Poly

In [22]:
if runPoly:
    # Polynomial-kernel SVC; the iteration cap is 10x the one used for the
    # other kernels.
    svm_poly = SVC(
        kernel='poly',
        max_iter=MAX_ITER*10, shrinking=False,
    )
    # C and coef0 are searched one decade above the shared grid.
    params = {
        'C': hipr_arrange*10,
        'degree': np.arange(1,5,1),
        'gamma': hipr_arrange,
        'coef0': hipr_arrange*10,
    }

    svm_poly_grid = HalvingGridSearchCV(
        svm_poly,
        params,
        scoring='accuracy',
        n_jobs=MAX_JOBS,
        return_train_score=True,
        # Fixed seed: successive halving evaluates candidates on subsamples of the
        # data, and without a seed those subsamples change on every retrain.
        random_state=0,
        verbose=2,
    ).fit(df_feat, df_targ)

    # Persist the whole search object (overwrites the previous file).
    if savePoly:
        dump(svm_poly_grid, 'models/SVM/svm_poly_grid.sk')
else:
    # Reuse the search persisted by an earlier run.
    # NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
    svm_poly_grid = load('models/SVM/svm_poly_grid.sk')

In [23]:
# Cross-validated score and winning parameters of the polynomial search.
print(
    f"Best score: {svm_poly_grid.best_score_}",
    f"Best params {svm_poly_grid.best_params_}",
    sep="\n",
)

Best score: 0.8387192890566743
Best params {'C': 69.99999999999999, 'coef0': 4.0, 'degree': 1, 'gamma': 0.01}


In [24]:
# Keep only the winning estimator of the polynomial search.
svm_poly = svm_poly_grid.best_estimator_

In [25]:
%%time
# Predict on df_feat, the same frame the search is fitted on; %%time reports the cost.
pol_pred = svm_poly.predict(df_feat)

CPU times: user 1min 28s, sys: 261 ms, total: 1min 28s
Wall time: 1min 28s


In [26]:
# Accuracy of the polynomial model's predictions against df_targ.
score(df_targ, pol_pred)

ACCURACY: 0.83853413576146


### Radial

In [27]:
if runRadial:
    # RBF-kernel SVC; C and gamma are searched over the shared grid.
    svm_radial = SVC(
        kernel='rbf',
        max_iter=MAX_ITER, shrinking=False,
    )
    params = {
        'C': hipr_arrange,
        'gamma': hipr_arrange,
    }

    svm_radial_grid = HalvingGridSearchCV(
        svm_radial,
        params,
        scoring='accuracy',
        n_jobs=MAX_JOBS,
        return_train_score=True,
        # Fixed seed: successive halving evaluates candidates on subsamples of the
        # data, and without a seed those subsamples change on every retrain.
        random_state=0,
        verbose=1,
    ).fit(df_feat, df_targ)

    # Persist the whole search object (overwrites the previous file).
    if saveRadial:
        dump(svm_radial_grid, 'models/SVM/svm_radial_grid.sk')
else:
    # Reuse the search persisted by an earlier run.
    # NOTE(review): joblib.load unpickles the file — only load artifacts you trust.
    svm_radial_grid = load('models/SVM/svm_radial_grid.sk')

In [28]:
# Cross-validated score and winning parameters of the RBF search.
print(
    f"Best score: {svm_radial_grid.best_score_}",
    f"Best params {svm_radial_grid.best_params_}",
    sep="\n",
)

Best score: 0.8438724989194568
Best params {'C': 0.4, 'gamma': 0.06999999999999999}


In [29]:
# Keep only the winning estimator of the RBF search.
svm_radial = svm_radial_grid.best_estimator_

In [30]:
%%time
# Predict on df_feat, the same frame the search is fitted on; %%time reports the cost.
rad_pred = svm_radial.predict(df_feat)

CPU times: user 6min 3s, sys: 733 ms, total: 6min 4s
Wall time: 6min 4s


In [31]:
# Accuracy of the RBF model's predictions against df_targ.
score(df_targ, rad_pred)

ACCURACY: 0.8459201219273906


## Resultados

#### Lineal

Best score: 0.8386942323663009 \
Best params {'C': 1.0}

ACCURACY: 0.8385927546992848

PRED TIME: 1min 20s

#### Polinomial

Best score: 0.8387192890566743 \
Best params {'C': 69.99999999999999, 'coef0': 4.0, 'degree': 1, 'gamma': 0.01}

ACCURACY: 0.83853413576146

PRED TIME: 1min 28s

- La búsqueda polinomial eligió grado 1, es decir, un modelo lineal, sin mejorar las métricas

#### Radial

Best score: 0.8438724989194568 \
Best params {'C': 0.4, 'gamma': 0.06999999999999999}

PRED TIME: 6min 4s

ACCURACY: 0.8459201219273906

### Coincidencia

In [36]:
# Percentage of samples on which the linear and polynomial models agree.
np.mean(lin_pred == pol_pred) * 100

99.98241431865254

Corroboramos que los modelos lineal y polinomial, además de ser ambos lineales, \
tuvieron casi las mismas predicciones (99.98% de coincidencia); debido a esto descartamos el polinomial

In [37]:
# Percentage of samples on which the linear and RBF models agree.
np.mean(lin_pred == rad_pred) * 100

97.0065262417445

Notamos que hay una pequeña diferencia (alrededor del 3%) entre las predicciones del lineal y del radial, lo que podría significar cierto margen de mejora

In [38]:
# Percentage of samples that at least one of the two models (linear or RBF)
# classifies correctly, i.e. the ceiling for any combination of the two.
( (df_targ == lin_pred) | (df_targ == rad_pred) ).mean() *100

85.72238071046152

In [39]:
# Gap, in percentage points, between that ceiling and the RBF model's own accuracy.
( ( (df_targ == lin_pred) | (df_targ == rad_pred) ).mean() - (df_targ == rad_pred).mean() ) *100

1.1303685177224576

Un ensamble óptimo de estos dos modelos podría mejorar la accuracy del radial en, como máximo, 1.13 puntos porcentuales \
No nos parece lo suficientemente significativo, ya que la mejora real sería considerablemente menor \
y sería más efectivo entrenar modelos con el objetivo de ser ensamblados

### Conclusion

Descartamos el polinomial ya que es básicamente lineal sin ninguna mejora.

Entre el lineal y el radial, nos quedamos con ambos, \
ya que el radial tiene métricas mejores pero es más complejo y tarda más en predecir

In [40]:
# Export the selected linear estimator on its own (without the search object).
# dump returns the list of files written, which the cell shows as its output.
dump(svm_lineal, 'models/SVM/svm_lineal.sk')

['models/SVM/svm_lineal.sk']

In [41]:
# Export the selected RBF estimator on its own (without the search object).
# dump returns the list of files written, which the cell shows as its output.
dump(svm_radial, 'models/SVM/svm_radial.sk')

['models/SVM/svm_radial.sk']