In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ann"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# para leer el archivo
with open('../datasets/hepatitis.dat', 'r') as lector:
    lista = lector.readlines()

In [3]:
def mostFrequent(arr):
    n = len(arr)
    # Insert all elements in Hash.
    Hash = dict()
    for i in range(n):
        if arr[i] in Hash.keys():
            Hash[arr[i]] += 1
        else:
            Hash[arr[i]] = 1
    # find the max frequency
    max_count = 0
    res = -1
    for i in Hash:
        if (max_count < Hash[i]):
            res = i
            max_count = Hash[i]
    return res

def obtienePromedio(arr):
    if(len(arr) == 0):
        return 0
    suma = 0
    for element in arr:
        suma += element
    return suma // len(arr)

In [6]:
# convertir los numeros
import numpy as np


Xt = [entrada.rstrip('\n').split(',')[:-1]for entrada in lista if not entrada.startswith('@')]

X = []

for elemento in Xt:
    tempArr = []
    for ele in elemento:
        try:
            tempArr.append(int(ele))
        except:
            tempArr.append(-100)
    X.append(tempArr)


index = 0
for i in range (len(X[0])):
    arr = []
    for fold in X:
        if (fold[i] != -100):
            arr.append(fold[i])

    ## Mandat a calcular la moda para los datos categoricos
    if(i in range(1, 13) or i == 18):
        moda = mostFrequent(arr)
        ## Substituir los -100 por la moda
        for fold in X:
            if (fold[i] == -100):
                fold[i] = moda
    elif(i in range(13,18)):
        promedio = obtienePromedio(arr)
        ## Substituir los -100 por el promedio
        for fold in X:
            if (fold[i] == -100):
                fold[i] = promedio
                
for folds in X:
    print(folds)
    
    
        
    

[30, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 85, 18, 0, 61, 1]
[50, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 0, 135, 42, 0, 61, 1]
[78, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 96, 32, 0, 61, 1]
[31, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 46, 52, 0, 80, 1]
[34, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 105, 200, 0, 61, 1]
[34, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 95, 28, 0, 75, 1]
[51, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 0, 105, 85, 0, 61, 1]
[23, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 105, 85, 0, 61, 1]
[39, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 105, 48, 0, 61, 1]
[30, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 105, 120, 0, 61, 1]
[39, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 0, 78, 30, 0, 85, 1]
[32, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 0, 59, 249, 0, 54, 1]
[41, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 81, 60, 0, 52, 1]
[30, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 57, 144, 0, 78, 1]
[47, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 105, 60, 0, 61, 1]
[38, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 0, 72, 89, 0, 46, 

In [5]:
# para sacar y
yt = [entrada.rstrip('\n').split(',')[-1]for entrada in lista if not entrada.startswith('@')]

y = [float(i) for i in yt]
y = np.asarray(y)
print(y)

[2. 2. 2. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 1. 1. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2. 2. 2. 1.
 2. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 2. 2. 1. 2. 2. 1. 2.
 2. 2. 1. 2. 1. 2. 2. 2. 1. 2. 1. 2. 2. 1. 2. 1. 2. 2. 2. 2. 2. 2. 1. 2.
 1. 2. 2. 2. 2. 2. 2. 1. 2. 1. 2. 1. 2. 2. 1. 2. 2. 2. 1. 2. 2. 1. 2. 1.
 1. 2. 1. 1. 2. 2. 1. 2. 2. 2. 1.]


In [6]:
# vamos a generar los conjuntos de prueba
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42)

# Support Vector machine

### Normal Classifier

In [7]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Crear maquina de soporte vectorial
poly_kernel_svm_clf=Pipeline([
    ("scaler", StandardScaler()), 
    ("SVM", SVC(kernel="poly", degree=3, C=5))])
poly_kernel_svm_clf.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('SVM',
                 SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [8]:
# para predecir los test con poly_kernel_svm_clf
y_pred1=poly_kernel_svm_clf.predict(X_test)

### Best configuration for the classifier

In [9]:
from sklearn.model_selection import GridSearchCV
param_grid = [
  {'C': [1,10], 'kernel': ['linear', 'poly']},
  {'C': [1, 10, 20], 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'poly']},
 ]
parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}
bestClfSVM = GridSearchCV(poly_kernel_svm_clf, param_grid = parameteres, cv = 5)

bestClfSVM.fit(X_train,y_train)
print(bestClfSVM.best_estimator_)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('SVM',
                 SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=0.1,
                     kernel='poly', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)




In [10]:
# para predecir los test con bestClfSVM
y_pred2=bestClfSVM.predict(X_test)

In [11]:
###### Vamos a generar los resultados
from sklearn.metrics import accuracy_score

#Maquina de soporte vectorial con grado 3
result1SVM=accuracy_score(y_test,y_pred1)
print("The Accuracy for SVM with degree 3 is:" + str(result1SVM))


result2SVM=accuracy_score(y_test,y_pred2)
print("The Accuracy for SVM with GridSearch is:" + str(result2SVM))

The Accuracy for SVM with degree 3 is:0.7435897435897436
The Accuracy for SVM with GridSearch is:0.717948717948718


# Perceptron Multilayer

### Normal Classifier

In [12]:
from sklearn.neural_network import MLPClassifier

clf_perceptron = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(6,), random_state=1,activation='relu')
clf_perceptron.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(6,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [13]:
# para predecir los test con poly_kernel_svm_clf
y_pred1_perceptron = clf_perceptron.predict(X_test)

### Best configuration for the classifier

In [14]:
params = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
          'solver': ['lbfgs', 'sgd', 'adam'],
          'learning_rate_init': [.1,.05,.01,.005,.001]}
clf_grid_search_perceptron = GridSearchCV(
    MLPClassifier(random_state=42), params, verbose=1, cv=3)

clf_grid_search_perceptron.fit(X_train, y_train)
print(clf_grid_search_perceptron.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits






MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.005, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   16.5s finished


In [15]:
## Accuracy score for the best configuration calculated with grid_search_cv
y_pred2_perceptron = clf_grid_search_perceptron.predict(X_test)

In [16]:
###### Vamos a generar los resultados
from sklearn.metrics import accuracy_score

result1MLP=accuracy_score(y_test, y_pred1_perceptron)
print("The Accuracy for MLP: " + str(result1MLP))


result2MLP=accuracy_score(y_test, y_pred2_perceptron)
print("The Accuracy for MLP with GridSearch is: " + str(result2MLP))

The Accuracy for MLP: 0.7692307692307693
The Accuracy for MLP with GridSearch is: 0.8205128205128205


# Stochastic Gradient Descent

### Normal Classifier

In [17]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
# para predecir los test con sgd_clf
y_pred1_SGD = sgd_clf.predict(X_test)

### Best configuration for the classifier

In [19]:
params = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
          "penalty": ["none", "l1", "l2"]}

clf_grid_search_sgd = GridSearchCV(sgd_clf, params)

clf_grid_search_sgd.fit(X_train, y_train)
print(clf_grid_search_sgd.best_estimator_)



SGDClassifier(alpha=10, average=False, class_weight=None, early_stopping=False,
              epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
              random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)




In [20]:
## Accuracy score for the best configuration calculated with clf_grid_search_sgd
y_pred2_SGD = clf_grid_search_sgd.predict(X_test)

In [21]:
###### Vamos a generar los resultados
from sklearn.metrics import accuracy_score

result1SGD=accuracy_score(y_test, y_pred1_SGD)
print("The Accuracy for SGD: " + str(result1MLP))


result2SGD=accuracy_score(y_test, y_pred2_SGD)
print("The Accuracy for SGD with GridSearch is: " + str(result2MLP))

The Accuracy for SGD: 0.7692307692307693
The Accuracy for SGD with GridSearch is: 0.8205128205128205


# KMeans classifier

### Normal Classifier

In [22]:
from sklearn.cluster import KMeans

kmeans_clf = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=42, copy_x=True, n_jobs=None, algorithm='auto')
kmeans_clf.fit(X_train, y_train)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [23]:
# para predecir los test con kmeans_clf
y_pred1_KM = kmeans_clf.predict(X_test)

### Best configuration for the classifier

In [24]:
from sklearn.model_selection import GridSearchCV

params = {"n_clusters": range(2, 10)}

clf_grid_search_KM = GridSearchCV( 
    KMeans(random_state=42), params, verbose=1, cv=3 ) 

clf_grid_search_KM.fit(X_train, y_train)

print(clf_grid_search_KM.best_estimator_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    1.2s finished


In [25]:
## Accuracy score for the best configuration calculated with clf_grid_search_KM
y_pred2_KM = clf_grid_search_KM.predict(X_test)

In [26]:
###### Vamos a generar los resultados
from sklearn.metrics import accuracy_score

result1KM=accuracy_score(y_test, y_pred1_KM)
print("The Accuracy for KMeans: " + str(result1MLP))


result1KM=accuracy_score(y_test, y_pred2_KM)
print("The Accuracy for Kmeans with GridSearch is: " + str(result2MLP))

The Accuracy for KMeans: 0.7692307692307693
The Accuracy for Kmeans with GridSearch is: 0.8205128205128205
