In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Cargar datos: Glass

Atributos:
1. Número de identificación: 1 a 214
2. RI: índice de refracción
3. Na: Sodio (unidad de medida: porcentaje en peso en el óxido correspondiente, al igual que los atributos 4-10)
4. Mg: magnesio
5. Al: Aluminio
6. Si: silicio
7. K: Potasio
8. Ca: calcio
9. Ba: bario
10. Fe: Hierro
11. Tipo de vidrio: (atributo de clase)
     1. building_windows_float_processed
     2. building_windows_non_float_processed
     3. vehicle_windows_float_processed
     4. vehicle_windows_non_float_processed (ninguno en esta base de datos)
     5. contenedores
     6. vajilla
     7. faros
    
    
Distribución de clases: (de un total de 214 instancias)

1. 163 Vidrio de ventana (ventanas de edificios y ventanas de vehículos)
     1. 87 flotador procesado
         - 70 ventanas de construcción
         - 17 ventanas de vehículos
     2. 76 procesado sin flotación
         - 76 ventanas de construcción
         - 0 ventanas de vehículos
2. 51 Vidrio no de ventana
     - 13 contenedores
     - 9 vajillas
     - 29 faros

In [2]:
glass = pd.read_csv('glass.csv')
glass.columns = ['id','ri','na','mg','al','si','k','ca','ba','fe','class']
print("Glass shape: ", glass.shape)

Glass shape:  (213, 11)


Cargar datos: Titanic

In [3]:
train_titanic = pd.read_csv('titanic_train.csv', sep=',')
test_t = pd.read_csv('titanic_test.csv', sep=',')
gender_sub = pd.read_csv('gender_submission.csv', sep=',')
gender_sub.PassengerId = gender_sub.PassengerId.astype(float)

train_titanic = train_titanic.replace('Q', 0)
train_titanic = train_titanic.replace('S', 1)
train_titanic = train_titanic.replace('C', 2)
train_titanic = train_titanic.replace('male', 0)
train_titanic = train_titanic.replace('female', 1)
train_titanic = np.array(train_titanic.loc[:,['Sex','Age','Fare','Embarked', 'Survived']])
train_titanic = train_titanic[~np.isnan(train_titanic).any(axis=1)]

print("Titanic Train Shape : ", train_titanic.shape)

test1_t = pd.merge(test_t, gender_sub,  how='left', on='PassengerId')
test1_t = test1_t.replace('Q', 0)
test1_t = test1_t.replace('S', 1)
test1_t = test1_t.replace('C', 2)
test1_t = test1_t.replace('male', 0)
test1_t = test1_t.replace('female', 1)

print("Titanic Test Shape : ", train_titanic.shape)

test_titanic = np.array(test1_t.loc[:,['Sex','Age','Fare','Embarked', 'Survived']])
test_titanic = test_titanic[~np.isnan(test_titanic).any(axis=1)]

titanic = np.concatenate((train_titanic, test_titanic), axis=0)
titanic = pd.DataFrame(data=titanic, columns=['Sex','Age','Fare','Embarked', 'Survived'])

print("Titanic Shape: ", titanic.shape)


Titanic Train Shape :  (712, 5)
Titanic Test Shape :  (712, 5)
Titanic Shape:  (1043, 5)


Normalizar Datos

In [4]:
def Normalizar_Datos(data):
    print(data)
    tam = len(data.columns)
    y = data.iloc[:,-1]
    data = data.iloc[:,:-1]
    data = (data - data.mean(axis=0))/data.std(axis=0)
    data.insert(tam-1, "y", y, True) 
    return data

In [5]:
titanic_data = Normalizar_Datos(titanic)

      Sex   Age      Fare  Embarked  Survived
0     0.0  22.0    7.2500       1.0       0.0
1     1.0  38.0   71.2833       2.0       1.0
2     1.0  26.0    7.9250       1.0       1.0
3     1.0  35.0   53.1000       1.0       1.0
4     0.0  35.0    8.0500       1.0       0.0
...   ...   ...       ...       ...       ...
1038  1.0   3.0   13.7750       1.0       1.0
1039  1.0  37.0   90.0000       0.0       1.0
1040  1.0  28.0    7.7750       1.0       1.0
1041  1.0  39.0  108.9000       2.0       1.0
1042  0.0  38.5    7.2500       1.0       0.0

[1043 rows x 5 columns]


In [6]:
glass_data = Normalizar_Datos(glass)

      id       ri     na    mg    al     si     k    ca    ba    fe  class
0      2  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.00  0.00      1
1      3  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.00  0.00      1
2      4  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.00  0.00      1
3      5  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.00  0.00      1
4      6  1.51596  12.79  3.61  1.62  72.97  0.64  8.07  0.00  0.26      1
..   ...      ...    ...   ...   ...    ...   ...   ...   ...   ...    ...
208  210  1.51623  14.14  0.00  2.88  72.61  0.08  9.18  1.06  0.00      7
209  211  1.51685  14.92  0.00  1.99  73.06  0.00  8.40  1.59  0.00      7
210  212  1.52065  14.36  0.00  2.02  73.42  0.00  8.44  1.64  0.00      7
211  213  1.51651  14.38  0.00  1.94  73.61  0.00  8.48  1.57  0.00      7
212  214  1.51711  14.23  0.00  2.08  73.36  0.00  8.62  1.67  0.00      7

[213 rows x 11 columns]


In [7]:
def Sigmoidal(x):
    return 1.0 / (1.0+np.exp(-x))

Exactitud - Accuracy

In [8]:
def Calcular_Accuracy(X, Y, W, A):
    count = 0
    for i,x in enumerate(X):
        y_predict,_ = Forward(x,W,A)
        y_predict = np.around(y_predict)
        yy = Y[i]
        y = y_predict - yy

        cc = y_predict == yy
        if (y_predict == yy).all():
            count+=1
    return (count/len(Y)) * 100

KFolds - Multiclase

In [117]:
def kFolds(data, k=3, clases=[0, 1]):
    list_clases = []
    list_indices_break = []
    list_folds_clases = []
    
    #ordenar segun ultima columna
    data = data.sort_values(by=["y"]).values

    for i in range(1, len(clases)):
        #obtener el indice de la primera ocucrrencia de unos
        indice_break = np.where(data[:,-1] == clases[i])[0][0]
        list_indices_break.append(indice_break)
        
    #dividar los datos en dos clases
    list_clases = np.split(data, list_indices_break)

    
    #dividir en k folds
    for clase in list_clases:
        clase = np.array_split(clase, k)
        list_folds_clases.append(clase)

    datos_split = list()

    #distribuir una porcion equitativa de cada clase a cada fold
    for i in range(k):
        fold = list_folds_clases[0][i]
        
        #combinar cada fold con una proporcion de cada clase
        for j in range(len(list_folds_clases)):
            fold = np.concatenate((fold, list_folds_clases[j][i]))

        #mezclar los dataFrames
        np.random.shuffle(fold)

        #separar x e y
        x_data = fold[:, :-1]
        y_data = fold[:, -1]

        datos_split.append([x_data, y_data])

    return datos_split

In [118]:
titanic_folds = kFolds(titanic_data, k=3, clases=[0,1])

In [119]:
glass_folds = kFolds(glass_data, k=3, clases=[1,2,3,5,6,7])

# SVM

In [120]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm

# The Dataset comes from:
# https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

def load(path_test, path_train):
  # Load up the data.
  with open(path_test, 'r')  as f: testing  = pd.read_csv(f)
  with open(path_train, 'r') as f: training = pd.read_csv(f)

  # The number of samples between training and testing can vary
  # But the number of features better remain the same!
  n_features = testing.shape[1]

  X_test  = testing.iloc[:, :n_features-1]
  X_train = training.iloc[:, :n_features-1]
  y_test  = testing.iloc[:,n_features-1].values.ravel()
  y_train = training.iloc[:,n_features-1].values.ravel()

  return X_train, X_test, y_train, y_test


def linear_SVM(X_train, y_train, X_test, y_test):

    svc = svm.SVC(kernel='linear', C=1, gamma=0.001)
    svc.fit(X_train, y_train) 
    
    # Calcular el score del SVM
    score = svc.score(X_test, y_test)
    return score

def poly_SVM(X_train, y_train, X_test, y_test):

    svc = svm.SVC(kernel='poly', C=1, degree=3, gamma=0.001)
    svc.fit(X_train, y_train) 
    
    # Calcular el score del SVM
    score = svc.score(X_test, y_test)
    return score

def sigmoid_SVM(X_train, y_train, X_test, y_test):

    svc = svm.SVC(kernel='sigmoid', C=1, gamma=0.001)
    svc.fit(X_train, y_train) 
    
    # Calcular el score del SVM
    score = svc.score(X_test, y_test)
    return score

def kernel_gausiano_SVM(X_train, y_train, X_test, y_test):

    svc = svm.SVC(kernel='rbf', C=1, gamma=0.001)
    svc.fit(X_train, y_train) 
    
    # Calcular el score del SVM
    score = svc.score(X_test, y_test)
    return score




#cargando datos
X_train, X_test, y_train, y_test = load('Datasets/optdigits.tes', 'Datasets/optdigits.tra')

score = linear_SVM(X_train, y_train, X_test, y_test)
print ("Score de linear SVM:\n", score)

score = poly_SVM(X_train, y_train, X_test, y_test)
print ("Score de polynomial SVM:\n", score)

score = sigmoid_SVM(X_train, y_train, X_test, y_test)
print ("Score de sigmoid SVM:\n", score)

score = kernel_gausiano_SVM(X_train, y_train, X_test, y_test)
print ("Score de kernel gausiano SVM:\n", score)

Score de linear SVM:
 0.9610244988864143
Score de polynomial SVM:
 0.9749443207126949
Score de sigmoid SVM:
 0.717706013363029
Score de kernel gausiano SVM:
 0.982739420935412
