In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from skfeature.function.similarity_based.fisher_score import fisher_score, feature_ranking
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Silence all library warnings so cell outputs stay readable.
warnings.filterwarnings("ignore")
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names; use whichever this install provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

<br>

In [None]:
# Row labels removed from both tables before modelling
# (presumably samples with missing values, judging by the name -- confirm).
na_ind = [47, 91, 94, 97]

# Spreadsheet columns DG..FKW are loaded as the feature table and column AT
# as the grade; the same rows are dropped from both so they stay aligned.
texturas = pd.read_excel(
    'BBDD_caEnd_articulo_jmCarot.xlsx', usecols='DG:FKW'
).drop(index=na_ind)
grado = pd.read_excel(
    'BBDD_caEnd_articulo_jmCarot.xlsx', usecols='AT'
).drop(index=na_ind)

# Binary target: grade 1 becomes class 0, grades 2 and 3 become class 1.
grado_dico = grado.replace({1: 0, 2: 1, 3: 1})

In [None]:
# Display (n_rows, n_columns) of the feature table after the row drop.
texturas.shape

In [None]:
# Standardise every feature column with StandardScaler, fitted on all rows.
# NOTE(review): the scaler is fitted before the cross-validation splits used
# later, so held-out folds contribute to the scaling statistics.
scaler = StandardScaler()
X_texturas = texturas.values
X_texturas_scaled = scaler.fit_transform(X_texturas)

# Flatten the single-column target frame into a 1-D label vector.
Y = grado_dico.values[:, 0]

<br>

### Fisher Feature Selection

The larger the Fisher score, the more important the feature.

In [None]:
# Fisher score of each standardised feature with respect to the binary target Y.
L = fisher_score(X_texturas_scaled, Y)
# Feature indices ordered by score; later cells treat L[f][0] as the top score,
# i.e. they assume a descending ranking -- TODO confirm against skfeature.
f = feature_ranking(L)
f

In [None]:
# Fisher scores plotted in ranking order (position 0 = first entry of `f`).
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(np.arange(len(f)), L[f], '-ok')
ax.grid(True)
# Tick every 0.05, with a small margin around the observed score range.
ax.set_yticks(np.arange(L.min() - 0.055, L.max() + 0.05, step=0.05))
ax.set_title('Fisher Scores')
# Axis labels so the figure can be read on its own.
ax.set_xlabel('Feature rank')
ax.set_ylabel('Fisher score')
plt.show()

<br>

## Cross-validation Fisher Score
### SVM kernel = linear

In [None]:
# Number of features whose Fisher score exceeds the loosest threshold of the
# sweep in the next cell, i.e. the top score minus 46 steps of 0.001.
ranked_scores = L[f]
loosest_threshold = ranked_scores[0] - (46 * 0.001)
print(len(f[ranked_scores > loosest_threshold]))

In [None]:
modelos = [('linear', 0.7, 0, 100)]
# (kernel, C, degree, gamma)

# Candidate cut-offs: the top Fisher score minus 1..46 steps of 0.001.
# They do not depend on the model, so they are computed once up front.
ranked_scores = L[f]
fisher_thresholds = [ranked_scores[0] - (step * 0.001) for step in range(1, 47)]

# NOTE(review): the Fisher ranking was computed on all samples, so feature
# selection is not nested inside the folds; the AUCs are likely optimistic.
hiperparam = {}  # model tuple -> {threshold: out-of-fold AUC}
for kernel, C, d, g in modelos:
    results = {}
    for threshold in fisher_thresholds:
        # Keep the features whose Fisher score is above the threshold.
        X_texturas_selected = X_texturas_scaled[:, f[ranked_scores > threshold]]

        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=344)
        # Out-of-fold class probabilities, one row per sample.
        probas = np.zeros((X_texturas_selected.shape[0], 2))
        for train_index, test_index in kf.split(X_texturas_selected, Y):
            X_train, X_test = X_texturas_selected[train_index], X_texturas_selected[test_index]
            y_train = Y[train_index]
            # probability=True calibrates with an internal randomised CV;
            # seeding it makes the AUCs reproducible across kernel restarts.
            dt = SVC(probability=True, kernel=kernel, C=C, degree=d, gamma=g,
                     random_state=344)
            probas[test_index, :] = dt.fit(X_train, y_train).predict_proba(X_test)

        # AUC over the pooled out-of-fold probabilities of the positive class.
        fpr, tpr = roc_curve(Y, probas[:, 1])[:2]
        results[threshold] = auc(fpr, tpr)
    hiperparam[(kernel, C, d, g)] = results

In [None]:
# Best (AUC, threshold) pair for every model configuration.
{
    model: max((score, threshold) for threshold, score in model_results.items())
    for model, model_results in hiperparam.items()
}

In [None]:
# Pick the configuration whose best cross-validated AUC is highest instead of
# repeating a hard-coded key that can drift out of sync with `modelos`.
best_model = max(hiperparam, key=lambda model: max(hiperparam[model].values()))
results_best_model = hiperparam[best_model]

In [None]:
# AUC as a function of the Fisher-score cut-off; the red line marks the
# threshold with the highest AUC (ties resolved towards the larger threshold).
thresholds_tested = list(results_best_model.keys())
auc_values = list(results_best_model.values())
best_threshold = max(zip(auc_values, thresholds_tested))[1]

fig, ax = plt.subplots(figsize=(15, 7))
ax.axvline(x=best_threshold, color='r')
ax.plot(thresholds_tested, auc_values, '-ok')
ax.grid(True)
ax.set_yticks(np.arange(0, 1, step=0.05))
# The tick range comes from the plotted data rather than a loop-leaked variable.
ax.set_xticks(np.arange(min(thresholds_tested), max(thresholds_tested), step=0.01))
# The sweep is over Fisher scores; the previous title named the Laplacian score.
ax.set_title('AUC vs Fisher Score threshold')
ax.set_xlabel('Fisher Score threshold')
ax.set_ylabel('AUC')
plt.show()

In [None]:
# Use the threshold that achieved the highest AUC in the sweep rather than a
# value pasted from an earlier run, which goes stale when data or seeds change.
best_threshold = max(
    (score, threshold) for threshold, score in results_best_model.items()
)[1]
X_texturas_selected = X_texturas_scaled[:, f[L[f] > best_threshold]]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=344)
# Out-of-fold class probabilities, one row per sample.
probas = np.zeros((X_texturas_selected.shape[0], 2))
for train_index, test_index in kf.split(X_texturas_selected, Y):
    X_train, X_test = X_texturas_selected[train_index], X_texturas_selected[test_index]
    y_train = Y[train_index]
    # Seed the internal probability calibration so predictions are reproducible.
    dt = SVC(probability=True, kernel='linear', C=0.7, gamma=100, random_state=344)
    probas[test_index, :] = dt.fit(X_train, y_train).predict_proba(X_test)

In [None]:
# Out-of-fold class probabilities (columns 0 and 1) next to the true label Y,
# written to an Excel workbook.
probs_df = pd.DataFrame(probas).assign(Y=Y)
probs_df.to_excel('Predictions_Fisher_Score_SVM.xlsx')