In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from scipy import sparse
from skfeature.function.similarity_based.lap_score import lap_score, feature_ranking
from sklearn.metrics import pairwise_distances
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Silence every warning for the rest of the notebook. This keeps cell output
# tidy, but it also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 dropped the old names, so use whichever name this installation offers.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)

<br>

In [None]:
DATA_FILE = 'BBDD_caEnd_articulo_jmCarot.xlsx'

# Feature block: spreadsheet columns DG..FKW; grade: spreadsheet column AT.
texturas = pd.read_excel(DATA_FILE, usecols='DG:FKW')
grado = pd.read_excel(DATA_FILE, usecols='AT')

# Rows dropped by index label from both frames.
# NOTE(review): presumably the rows with missing values — confirm.
na_ind = [47, 91, 94, 97]
texturas, grado = (frame.drop(index=na_ind) for frame in (texturas, grado))

# Binarise the grade: 1 -> 0, while 2 and 3 -> 1.
grado_dico = grado.replace({1: 0, 2: 1, 3: 1})

In [None]:
# Standardise the feature columns; the scaler is fitted on all rows at once.
scaler = StandardScaler()
X_texturas = texturas.values
X_texturas_scaled = scaler.fit_transform(X_texturas)

# Flat 1-D label vector taken from the first column of grado_dico.
Y = grado_dico.values[:, 0]

<br>

### Laplacian Feature Selection

The smaller the Laplacian score, the more important the feature.

In [None]:
def compute_W(X, Y, eps):
    """Build a label-aware epsilon-neighbourhood weight matrix.

    Two distinct samples ``i`` and ``j`` get weight 1 when the distance
    returned by ``pairwise_distances(X)`` is strictly below ``eps`` and
    ``Y[i] == Y[j]``. Every other entry, including the diagonal, is 0.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples handed to ``pairwise_distances``.
    Y : array-like with n_samples entries
        Label of each sample; flattened to 1-D before use.
    eps : float
        Neighbourhood radius (strict upper bound on the distance).

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples), dtype float
        Symmetric 0/1 weight matrix with a zero diagonal.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of samples.
    """
    labels = np.asarray(Y).ravel()
    n = X.shape[0]
    if labels.shape[0] != n:
        raise ValueError(
            "Y must contain one label per row of X "
            "(got {} labels for {} samples)".format(labels.shape[0], n)
        )

    dist_matrix = pairwise_distances(X)

    # Vectorised neighbourhood test. The previous version built per-row index
    # lists and wrapped them in np.array(), which fails on recent NumPy as
    # soon as two rows have a different number of neighbours (ragged array).
    connected = (dist_matrix < eps) & (labels[:, None] == labels[None, :])
    # A sample is never its own neighbour.
    np.fill_diagonal(connected, False)
    return connected.astype(float)

In [None]:
# Distance threshold handed to compute_W: same-label samples closer than eps are linked.
# NOTE(review): the features were standardised above, so 5000 is presumably larger than
# every pairwise distance, which would link all same-label pairs — confirm this is intended.
eps = 5000
W = compute_W(X_texturas_scaled, Y, eps)

In [None]:
# Show the weight matrix as the cell output.
W

In [None]:
# Laplacian score of each standardised feature, computed on the label-aware
# graph W, which is passed to skfeature as a CSR sparse matrix.
W_sparse = sparse.csr_matrix(W)
L = lap_score(X_texturas_scaled, W=W_sparse)

# Feature indices ordered by score.
# NOTE(review): presumably ascending, i.e. best feature first — verify against skfeature.
f = feature_ranking(L)
f

In [None]:
# Laplacian scores plotted in ranking order (position in f on the x axis).
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(np.arange(len(f)), L[f], '-ok')
ax.grid(True)
ax.set_yticks(np.arange(min(L) - 0.055, max(L) + 0.05, step=0.05))
ax.set_title('Laplacian Scores')
plt.show()

<br>

## Cross-validation Laplacian Score
### SVM kernel = linear

In [None]:
# Each entry is (kernel, C, degree, gamma).
modelos = [('linear', 1, 0, 'scale')]

# Candidate cut-offs: 357 values starting 1e-4 above the score of the
# top-ranked feature and growing in steps of 1e-4.
top_score = L[f][0]
laplacian_thresholds = [top_score + (i * 0.0001) for i in range(1, 358)]

# NOTE(review): the ranking (L, f) was computed from all samples and their
# labels before this cross-validation, so the AUCs below are likely optimistic.
hiperparam = {}
for kernel, C, d, g in modelos:
    results = {}
    for threshold in laplacian_thresholds:
        # Keep the features whose Laplacian score is below the threshold.
        X_texturas_selected = X_texturas_scaled[:, f[L[f] < threshold]]

        # Out-of-fold class probabilities from a stratified 10-fold CV.
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=344)
        probas = np.zeros((X_texturas_selected.shape[0], 2))
        for train_index, test_index in kf.split(X_texturas_selected, Y):
            dt = SVC(probability=True, kernel=kernel, C=C, degree=d, gamma=g, random_state=3)
            dt.fit(X_texturas_selected[train_index], Y[train_index])
            probas[test_index, :] = dt.predict_proba(X_texturas_selected[test_index])

        # AUC of the pooled out-of-fold probabilities for class 1.
        fpr, tpr, _ = roc_curve(Y, probas[:, 1])
        results[threshold] = auc(fpr, tpr)
    hiperparam[(kernel, C, d, g)] = results

In [None]:
# Best (AUC, threshold) pair per model; ties on AUC resolve to the larger threshold.
{
    model: max((auc_value, threshold) for threshold, auc_value in scores.items())
    for model, scores in hiperparam.items()
}

In [None]:
# Threshold -> AUC mapping of the single configuration evaluated above
# (the key must match the tuple listed in `modelos`).
results_best_model = hiperparam[('linear', 1, 0, 'scale')]

In [None]:
# Threshold with the highest AUC (ties resolve to the larger threshold).
best_auc, best_threshold = max(
    (auc_value, threshold) for threshold, auc_value in results_best_model.items()
)

fig, ax = plt.subplots(figsize=(15, 7))
# Red vertical line marks the best threshold.
ax.axvline(x=best_threshold, color='r')
ax.plot(list(results_best_model.keys()), list(results_best_model.values()), '-ok')
ax.grid(True)
ax.set_yticks(np.arange(0, 1, step=0.05))
ax.set_xticks(np.arange(min(laplacian_thresholds), max(laplacian_thresholds), step=0.01))
ax.set_title('AUC vs Laplacian Score')
ax.set_xlabel('Laplacian Score threshold')
ax.set_ylabel('AUC')
plt.show()

In [None]:
# Hard-coded Laplacian-score cut-off.
# NOTE(review): presumably copied from the best threshold found in the sweep above;
# it will not follow the data if the inputs change — confirm.
final_threshold = 0.992679181227688
X_texturas_selected = X_texturas_scaled[:, f[L[f] < final_threshold]]

# Out-of-fold class probabilities from a stratified 10-fold CV with a linear SVM.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=344)
probas = np.zeros((X_texturas_selected.shape[0], 2))
for train_index, test_index in kf.split(X_texturas_selected, Y):
    dt = SVC(probability=True, kernel='linear', C=1, random_state=3)
    dt.fit(X_texturas_selected[train_index], Y[train_index])
    probas[test_index, :] = dt.predict_proba(X_texturas_selected[test_index])

In [None]:
# Out-of-fold probabilities (columns 0 and 1) with the true label in column 'Y', saved to Excel.
probs_df = pd.DataFrame(probas).assign(Y=Y)
probs_df.to_excel('Predictions_Laplacian_Score_SVM.xlsx')