En el apartado "Loading Data" de esta URL:

https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

Se explica cómo se cargan una serie de datos: 

1. Utiliza esa misma forma para cargar los datos.
2. Limpia los datos si es necesario
3. Dibuja con plotly los que creas necesarios gráficos para entender los datos.
4. Utiliza los métodos de clasificación vistos hasta ahora para clasificar el target de los datos, ¿cuál da mejores resultados? 
5. Intenta superarte en el score cambiando las features de los algoritmos.

In [1]:
# Load libraries
import numpy as np
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 
# Set random seed
np.random.seed(0)
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import pickle # Para cargar una variable local

In [2]:
cancer = datasets.load_breast_cancer()

In [3]:
type(cancer)

sklearn.utils.Bunch

In [4]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
print("data: ", cancer.data)
print("target: ", cancer.target)
#print("frame: ", cancer.frame)
print("target_names: or Labels ", cancer.target_names)
#print("DESCR: ", cancer.DESCR)
print("feature_names: ", cancer.feature_names)
#print("filename: ", cancer.filename)

data:  [[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
target:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 

In [6]:
cancer.data.shape

(569, 30)

In [7]:
X = cancer.data
y = cancer.target

In [8]:
to_test = np.arange(1, 10)

In [10]:
#%%time 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

pipe = Pipeline(steps=[('classifier', RandomForestClassifier())])

logistic_params = { 
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': np.logspace(0, 4, 10)
    }

random_forest_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__max_features': [1, 2, 3]
    }

svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('linear', 'rbf', 'sigmoid'), 
    'classifier__C':[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
    'classifier__degree': to_test,
    'classifier__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
    'classifier__gamma': ('scale', 'auto')
    }

search_space = [
    logistic_params,
    random_forest_params,
    svm_params
    ]


cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1) 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=cv, verbose=5, n_jobs=-1)
best_model = clf.fit(X_train, y_train)
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier']) 
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'finished_model.sav' #Guardamos el modelo una vez que ya hemos entrenado el modelo
pickle.dump(best_model, open(filename, 'wb')) #Lo guardamos en una variable

Fitting 10 folds for each of 3465 candidates, totalling 34650 fits

############################

best estimator: RandomForestClassifier(max_features=3, n_estimators=10)

############################

clf.best_params_ {'classifier': RandomForestClassifier(max_features=3, n_estimators=10), 'classifier__max_features': 3, 'classifier__n_estimators': 10}

############################

clf.best_score 0.9670048309178745


In [None]:
loaded_model = pickle.load(open("finished_model.sav", "rb")) #cargamos el modelo, pero si que el tenia en clases 

In [None]:
# Predict target vector
best_model.score(X_test, y_test) * 100