In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# SVM
from sklearn.svm import SVC

# K-NN
from sklearn.neighbors import KNeighborsClassifier as KNN

### Convertendo as características oBIFs extraídas de histogramas no MATLAB para o dataset em Python


In [2]:
# Build the oBIFs dataset CSV: one output row per image, made of the per-image
# table summed across its columns (one value per table row) followed by the
# class label as the last value.
#
# Images are numbered 1..400 and the label alternates in groups of 50:
# 1-50 -> 1, 51-100 -> 0, 101-150 -> 1, ... , 351-400 -> 0.
# The context manager guarantees the output file is closed even if reading or
# writing one of the images raises.
with open("gender_dataset_obifs.csv", "w") as file:
    for i in range(1, 401):
        df = pd.read_csv('gender_extracted_data_obifs/image_%d.csv' % i, header=None)
        feature_sums = df.sum(axis=1)

        # Even-numbered groups of 50 (0-based) are class 1, odd-numbered groups are class 0.
        class_of_instance = 1 if ((i - 1) // 50) % 2 == 0 else 0

        # savetxt writes a 1-D array as a column, so reshape to a single 2-D row.
        row = np.append(feature_sums, [class_of_instance]).reshape(1, -1)
        np.savetxt(file, row, delimiter=',')

In [2]:
# Read back the oBIFs feature table. The CSV has no header row, so pandas
# assigns integer column names starting at 0.
dataset_obifs = pd.read_csv(
    filepath_or_buffer='gender_dataset_obifs.csv',
    header=None,
)
dataset_obifs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,504,505,506,507,508,509,510,511,512,513
0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,1071.0,1866.0,2452.0,2119.0,866.0,457.0,457.0,431.0,431.0,1.0
1,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,707.0,1572.0,2711.0,2833.0,2271.0,885.0,610.0,433.0,431.0,1.0
2,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,616.0,1934.0,2466.0,2552.0,1448.0,1060.0,731.0,461.0,440.0,1.0
3,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,836.0,1731.0,2738.0,2516.0,1798.0,932.0,822.0,592.0,565.0,1.0
4,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,882.0,2304.0,2739.0,2882.0,941.0,706.0,523.0,440.0,431.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,716.0,1143.0,1230.0,1058.0,1035.0,933.0,1045.0,1198.0,1301.0,0.0
396,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,1029.0,1373.0,1209.0,1227.0,1026.0,1099.0,1429.0,1526.0,1293.0,0.0
397,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,1092.0,1682.0,1608.0,1317.0,1228.0,1300.0,1562.0,1546.0,1365.0,0.0
398,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,431.0,...,1049.0,1313.0,1017.0,934.0,887.0,1048.0,1146.0,1263.0,1065.0,0.0


In [29]:
# Column 513 (the last one) stores the class label; every other column is an
# oBIFs feature. Split the table into a feature matrix and a label vector.
obifs_label_col = 513
obifs_features = dataset_obifs.drop(columns=[obifs_label_col])

X_obifs = obifs_features.values
X_obifs_columns = obifs_features.columns
y_obifs = dataset_obifs[obifs_label_col].values.ravel()

# Distinct label values rendered as strings.
y_obifs_classes = np.unique(y_obifs).astype(str)

y_obifs_classes

array(['0.0', '1.0'], dtype='<U32')

### Carregando dataset de características BSIF

In [30]:
# Load the BSIF feature table. The CSV has no header row, so pandas assigns
# integer column names starting at 0.
dataset_bsif = pd.read_csv(
    filepath_or_buffer='gender_dataset_bsif.csv',
    header=None,
)
dataset_bsif

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,0.001981,0.001687,0.003587,0.000611,0.001520,0.003356,0.001153,0.000339,0.000416,0.000751,...,0.000597,0.000543,0.001759,0.000398,0.000502,0.001235,0.002741,0.000366,0.001312,1
1,0.003424,0.002628,0.004432,0.000954,0.002610,0.005011,0.001678,0.000814,0.001063,0.001176,...,0.001805,0.001108,0.002840,0.000579,0.001013,0.002474,0.003582,0.000905,0.002528,1
2,0.002677,0.003333,0.004821,0.000606,0.002053,0.004839,0.001606,0.000850,0.000846,0.000841,...,0.001411,0.000977,0.002777,0.000669,0.000805,0.002221,0.003460,0.000972,0.002573,1
3,0.002117,0.002542,0.003817,0.000543,0.001755,0.004554,0.001556,0.000687,0.000525,0.000678,...,0.001000,0.000859,0.002374,0.000660,0.000728,0.001728,0.002813,0.000787,0.001719,1
4,0.002858,0.002818,0.004826,0.000954,0.002298,0.005283,0.001786,0.000886,0.000850,0.001162,...,0.001199,0.001000,0.002971,0.000796,0.001067,0.002230,0.003795,0.000819,0.002311,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.001194,0.001687,0.002239,0.000299,0.001443,0.002913,0.000868,0.000412,0.000231,0.000597,...,0.000547,0.000461,0.001705,0.000466,0.000592,0.001095,0.001737,0.000262,0.000791,0
396,0.001248,0.001506,0.002198,0.000204,0.001642,0.003315,0.000846,0.000412,0.000294,0.000534,...,0.000701,0.000443,0.001741,0.000448,0.000588,0.001199,0.001791,0.000425,0.001719,0
397,0.001841,0.001764,0.002646,0.000326,0.001217,0.003582,0.000864,0.000403,0.000357,0.000565,...,0.000850,0.000520,0.001755,0.000416,0.000597,0.001420,0.002135,0.000425,0.001673,0
398,0.000941,0.000918,0.001565,0.000217,0.000886,0.002361,0.000633,0.000353,0.000204,0.000403,...,0.000583,0.000348,0.001393,0.000303,0.000384,0.000801,0.001615,0.000267,0.000995,0


In [32]:
# Column 256 (the last one) stores the class label; every other column is a
# BSIF feature. Split the table into a feature matrix and a label vector.
bsif_label_col = 256
bsif_features = dataset_bsif.drop(columns=[bsif_label_col])

X_bsif = bsif_features.values
X_bsif_columns = bsif_features.columns
y_bsif = dataset_bsif[bsif_label_col].values.ravel()

# Distinct label values rendered as strings.
y_bsif_classes = np.unique(y_bsif).astype(str)

X_bsif

array([[0.00198098, 0.001687  , 0.00358656, ..., 0.0027408 , 0.00036635,
        0.00131161],
       [0.00342374, 0.00262773, 0.00443232, ..., 0.00358204, 0.00090456,
        0.00252823],
       [0.00267749, 0.00333329, 0.00482128, ..., 0.00345993, 0.0009724 ,
        0.00257346],
       ...,
       [0.00184077, 0.00176388, 0.00264583, ..., 0.00213475, 0.00042514,
        0.00167343],
       [0.00094074, 0.00091812, 0.00156488, ..., 0.00161463, 0.00026684,
        0.00099501],
       [0.00108094, 0.00106285, 0.00195836, ..., 0.00164629, 0.00033921,
        0.00107642]])

# Separando dataset com características do oBIFS em dados de treino e teste

In [17]:
# Hold out 20% of the oBIFs samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
obifs_partitions = train_test_split(
    X_obifs,
    y_obifs,
    test_size=0.2,
    random_state=1,
    stratify=y_obifs,
)
X_train_obifs, X_test_obifs, y_train_obifs, y_test_obifs = obifs_partitions

X_test_obifs.shape

(80, 513)

# Normalizando os dados extraídos com o oBIFS

In [21]:
# Standardise the oBIFs features. The scaler is fitted on the training
# partition only and then applied unchanged to both partitions.
ss_obifs = StandardScaler().fit(X_train_obifs)
X_train_obifs_ss, X_test_obifs_ss = (
    ss_obifs.transform(partition) for partition in (X_train_obifs, X_test_obifs)
)

X_test_obifs_ss

array([[-0.0880667 , -0.08683343, -0.08761608, ...,  0.9849463 ,
         0.35610773,  0.58238458],
       [-0.0880667 , -0.08683343, -0.08761608, ..., -0.43167502,
        -0.7397452 , -0.60783435],
       [-0.0880667 , -0.08683343, -0.08761608, ..., -0.17920786,
        -0.58604115, -0.53402233],
       ...,
       [-0.0880667 , -0.08683343, -0.08761608, ..., -0.17359748,
        -0.36687057, -0.88770496],
       [-0.0880667 , -0.08683343, -0.08761608, ..., -0.46253212,
        -0.56611655, -0.47866331],
       [-0.0880667 , -0.08683343, -0.08761608, ...,  1.41133529,
         0.52973637, -0.14035818]])

# Utilizando o SVM para classificação oBIFS

In [24]:
## SVM with grid search (oBIFs features)
# Candidate hyper-parameters on base-2 logarithmic grids:
#   C     in 2^-5, 2^-3, ..., 2^13  (10 values)
#   gamma in 2^3,  2^1,  ..., 2^-13 (9 values)
C_range = 2. ** np.arange(-5, 15, 2)
gamma_range = 2. ** np.arange(3, -15, -2)

# RBF-kernel classifier; probability=True also enables probability estimates.
srv = SVC(probability=True, kernel='rbf')

param_grid = {
    'C': C_range,
    'gamma': gamma_range,
}

# NOTE(review): the training features were standardised on the whole training
# partition before this search, so the cross-validation folds share scaler
# statistics. Wrap the scaler and the SVC in a Pipeline if unbiased CV scores
# are required.
# Run the search (GridSearchCV defaults for cv and scoring, all cores).
grid = GridSearchCV(srv, param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_obifs_ss, y_train_obifs)

# Retrieve the best model and evaluate it on the held-out test partition.
model = grid.best_estimator_
print(grid.best_params_)
pred_svm = model.predict(X_test_obifs_ss)
print(classification_report(y_test_obifs, pred_svm))

Fitting 5 folds for each of 90 candidates, totalling 450 fits
{'C': 8.0, 'gamma': 0.001953125}
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.93        40
         1.0       0.95      0.90      0.92        40

    accuracy                           0.93        80
   macro avg       0.93      0.93      0.92        80
weighted avg       0.93      0.93      0.92        80



# Usando K-NN com os dados extraídos com o oBIFS

In [26]:
## K-NN with grid search (oBIFs features)
# Candidate neighbourhood sizes: the odd values from 1 to 19.
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
}

# Run the search (GridSearchCV defaults for cv and scoring, all cores).
grid = GridSearchCV(KNN(), param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_obifs_ss, y_train_obifs)

# Retrieve the best model and evaluate it on the held-out test partition.
# The predictions are stored under a K-NN-specific name so they neither
# masquerade as, nor overwrite, the SVM predictions.
model = grid.best_estimator_
print(grid.best_params_)
pred_knn = model.predict(X_test_obifs_ss)
print(classification_report(y_test_obifs, pred_knn))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_neighbors': 1}
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95        40
         1.0       0.97      0.93      0.95        40

    accuracy                           0.95        80
   macro avg       0.95      0.95      0.95        80
weighted avg       0.95      0.95      0.95        80



# -------------------------------------------------------------------------------------------------------------------------------------------------

# Separando dataset com características do BSIF em dados de treino e teste

In [33]:
# Hold out 20% of the BSIF samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
bsif_partitions = train_test_split(
    X_bsif,
    y_bsif,
    test_size=0.2,
    random_state=1,
    stratify=y_bsif,
)
X_train_bsif, X_test_bsif, y_train_bsif, y_test_bsif = bsif_partitions

X_test_bsif.shape

(80, 256)

# Normalizando os dados extraídos com o BSIF

In [34]:
# Standardise the BSIF features. The scaler is fitted on the training
# partition only and then applied unchanged to both partitions.
ss_bsif = StandardScaler().fit(X_train_bsif)
X_train_bsif_ss, X_test_bsif_ss = (
    ss_bsif.transform(partition) for partition in (X_train_bsif, X_test_bsif)
)

X_test_bsif_ss

array([[-3.57196274e-01, -9.00185561e-03, -4.92316174e-02, ...,
        -2.71271786e-01,  3.75679006e-01, -1.24742753e-01],
       [ 4.99520213e-01, -1.18477611e-03,  2.86742197e-01, ...,
         4.95193390e-01,  7.75546421e-01,  6.46006274e-04],
       [ 2.87333881e+00,  1.76938373e+00,  2.42475738e+00, ...,
         9.79024533e-01,  7.18422504e-01,  1.48158891e+00],
       ...,
       [-6.17270922e-01, -6.26551136e-01, -3.32264103e-01, ...,
        -1.51511603e-01, -6.23989530e-01, -4.73797946e-01],
       [ 2.67492831e-01,  2.60687387e-01,  4.03613997e-02, ...,
         3.70642799e-01, -5.27503665e-02,  5.59812093e-01],
       [-2.52656465e-01, -9.49897301e-02,  2.20355553e-02, ...,
         6.86809684e-01,  7.75546421e-01, -1.55242721e-01]])

# Utilizando o SVM para classificação BSIF

In [35]:
##SVM com Grid search
C_range = 2. ** np.arange(-5,15,2)
gamma_range = 2. ** np.arange(3,-15,-2)
k = [ 'rbf']

# instancia o classificador, gerando probabilidades
srv = SVC(probability=True, kernel='rbf')
ss = StandardScaler()

param_grid = {
   'C' : C_range,
   'gamma' : gamma_range
}

# faz a busca
grid = GridSearchCV(srv, param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_bsif_ss, y_train_bsif)

# recupera o melhor modelo
model = grid.best_estimator_
print(grid.best_params_)
pred_svm = model.predict(X_test_bsif_ss)
print(classification_report(y_test_bsif, pred_svm))

Fitting 5 folds for each of 90 candidates, totalling 450 fits
{'C': 8.0, 'gamma': 0.001953125}
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        40
           1       0.93      0.95      0.94        40

    accuracy                           0.94        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80



# Usando K-NN com os dados extraídos com o BSIF

In [36]:
## K-NN with grid search (BSIF features)
# Candidate neighbourhood sizes: the odd values from 1 to 19.
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
}

# Run the search (GridSearchCV defaults for cv and scoring, all cores).
grid = GridSearchCV(KNN(), param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_bsif_ss, y_train_bsif)

# Retrieve the best model and evaluate it on the held-out test partition.
# The predictions are stored under a K-NN-specific name so they neither
# masquerade as, nor overwrite, the SVM predictions.
model = grid.best_estimator_
print(grid.best_params_)
pred_knn = model.predict(X_test_bsif_ss)
print(classification_report(y_test_bsif, pred_knn))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_neighbors': 1}
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        40
           1       0.97      0.95      0.96        40

    accuracy                           0.96        80
   macro avg       0.96      0.96      0.96        80
weighted avg       0.96      0.96      0.96        80

