In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# SVM
from sklearn.svm import SVC

# K-NN
from sklearn.neighbors import KNeighborsClassifier as KNN

### Convertendo características do oBIFs/BSIF extraídas de histogramas no MATLAB para o dataset em Python


In [57]:
# Build the labelled BSIF dataset: read the space-delimited feature rows,
# append a class label to every row and write the result as a CSV file.
SAMPLES_PER_BLOCK = 50  # rows come in alternating blocks: class 1, class 0, class 1, ...
N_SAMPLES = 400         # number of rows labelled and written

df = pd.read_csv('gender_bsif.txt', header=None, delimiter=' ')
# Discard the last parsed column.
# NOTE(review): presumably an empty column caused by a trailing space on each
# line of the text file -- confirm against the raw file.
df = df.iloc[:,:-1]
print(df)

# Fail early (before touching the output file) if there are too few rows.
if len(df) < N_SAMPLES:
    raise IndexError(
        f"expected at least {N_SAMPLES} rows in gender_bsif.txt, found {len(df)}"
    )

# Rows 0-49 -> 1, rows 50-99 -> 0, rows 100-149 -> 1, ... (even blocks are class 1).
labels = 1 - ((np.arange(N_SAMPLES) // SAMPLES_PER_BLOCK) % 2)
rows_with_class = np.column_stack([df.iloc[:N_SAMPLES].to_numpy(), labels])

# The context manager closes the file even if writing fails; a single savetxt
# call emits one comma-separated line per sample with the label as last field.
with open("gender_dataset_bsif.csv", "w") as file:
    np.savetxt(file, rows_with_class, delimiter=',')

          0         1         2         3         4         5         6    \
0    0.001981  0.001687  0.003587  0.000611  0.001520  0.003356  0.001153   
1    0.003424  0.002628  0.004432  0.000954  0.002610  0.005011  0.001678   
2    0.002677  0.003333  0.004821  0.000606  0.002053  0.004839  0.001606   
3    0.002117  0.002542  0.003817  0.000543  0.001755  0.004554  0.001556   
4    0.002858  0.002818  0.004826  0.000954  0.002298  0.005283  0.001786   
..        ...       ...       ...       ...       ...       ...       ...   
395  0.001194  0.001687  0.002239  0.000299  0.001443  0.002913  0.000868   
396  0.001248  0.001506  0.002198  0.000204  0.001642  0.003315  0.000846   
397  0.001841  0.001764  0.002646  0.000326  0.001217  0.003582  0.000864   
398  0.000941  0.000918  0.001565  0.000217  0.000886  0.002361  0.000633   
399  0.001081  0.001063  0.001958  0.000244  0.001280  0.002786  0.000791   

          7         8         9    ...       246       247       248  \
0  

In [58]:
# Load the labelled oBIFs table. The CSV has no header row, so pandas assigns
# integer column labels 0..N.
dataset_obifs = pd.read_csv('gender_dataset_obifs.csv', header=None)

# Integer label of the last column (used further down as the target column).
dataset_obifs_len = dataset_obifs.shape[1] - 1

dataset_obifs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,391,392,393,394,395,396,397,398,399,400
0,521.0,1179.0,1051.0,928.0,934.0,652.0,678.0,885.0,1056.0,1096.0,...,543.0,552.0,483.0,591.0,523.0,473.0,411.0,383.0,324.0,1.0
1,90.0,103.0,101.0,54.0,73.0,87.0,123.0,126.0,86.0,196.0,...,40.0,65.0,51.0,48.0,22.0,30.0,51.0,21.0,16.0,1.0
2,5.0,16.0,4.0,6.0,1.0,15.0,18.0,20.0,63.0,33.0,...,5.0,1.0,3.0,17.0,7.0,1.0,6.0,2.0,28.0,1.0
3,42.0,8.0,1.0,11.0,30.0,59.0,59.0,91.0,103.0,48.0,...,0.0,1.0,3.0,5.0,1.0,4.0,50.0,15.0,2.0,1.0
4,18.0,17.0,2.0,18.0,17.0,31.0,6.0,16.0,45.0,59.0,...,7.0,5.0,1.0,5.0,7.0,2.0,10.0,5.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,142.0,333.0,262.0,219.0,230.0,262.0,284.0,366.0,232.0,308.0,...,69.0,99.0,92.0,134.0,97.0,70.0,106.0,64.0,82.0,0.0
396,104.0,207.0,132.0,119.0,127.0,119.0,150.0,162.0,176.0,224.0,...,61.0,88.0,46.0,78.0,24.0,41.0,39.0,22.0,33.0,0.0
397,76.0,205.0,173.0,90.0,100.0,180.0,179.0,229.0,228.0,250.0,...,120.0,97.0,109.0,146.0,59.0,92.0,88.0,54.0,55.0,0.0
398,74.0,112.0,47.0,61.0,38.0,82.0,74.0,62.0,128.0,122.0,...,28.0,78.0,42.0,58.0,30.0,24.0,39.0,21.0,11.0,0.0


In [59]:
# Separate the oBIFs table into the target vector (last column) and the
# feature matrix (every other column).
features_obifs = dataset_obifs.drop(columns=[dataset_obifs_len])

y_obifs = dataset_obifs.loc[:, dataset_obifs_len].values.ravel()
X_obifs = features_obifs.values
X_obifs_columns = features_obifs.columns

# Distinct class labels rendered as strings.
y_obifs_classes = np.unique(y_obifs).astype(str)

y_obifs_classes

array(['0.0', '1.0'], dtype='<U32')

### Carregando dataset de características BSIF

In [60]:
# Load the labelled BSIF table. The CSV has no header row, so pandas assigns
# integer column labels 0..N.
dataset_bsif = pd.read_csv('gender_dataset_bsif.csv', header=None)

# Integer label of the last column (used further down as the target column).
dataset_bsif_len = dataset_bsif.shape[1] - 1

dataset_bsif

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,0.001981,0.001687,0.003587,0.000611,0.001520,0.003356,0.001153,0.000339,0.000416,0.000751,...,0.000597,0.000543,0.001759,0.000398,0.000502,0.001235,0.002741,0.000366,0.001312,1.0
1,0.003424,0.002628,0.004432,0.000954,0.002610,0.005011,0.001678,0.000814,0.001063,0.001176,...,0.001805,0.001108,0.002840,0.000579,0.001013,0.002474,0.003582,0.000905,0.002528,1.0
2,0.002677,0.003333,0.004821,0.000606,0.002053,0.004839,0.001606,0.000850,0.000846,0.000841,...,0.001411,0.000977,0.002777,0.000669,0.000805,0.002221,0.003460,0.000972,0.002573,1.0
3,0.002117,0.002542,0.003817,0.000543,0.001755,0.004554,0.001556,0.000687,0.000525,0.000678,...,0.001000,0.000859,0.002374,0.000660,0.000728,0.001728,0.002813,0.000787,0.001719,1.0
4,0.002858,0.002818,0.004826,0.000954,0.002298,0.005283,0.001786,0.000886,0.000850,0.001162,...,0.001199,0.001000,0.002971,0.000796,0.001067,0.002230,0.003795,0.000819,0.002311,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.001194,0.001687,0.002239,0.000299,0.001443,0.002913,0.000868,0.000412,0.000231,0.000597,...,0.000547,0.000461,0.001705,0.000466,0.000592,0.001095,0.001737,0.000262,0.000791,0.0
396,0.001248,0.001506,0.002198,0.000204,0.001642,0.003315,0.000846,0.000412,0.000294,0.000534,...,0.000701,0.000443,0.001741,0.000448,0.000588,0.001199,0.001791,0.000425,0.001719,0.0
397,0.001841,0.001764,0.002646,0.000326,0.001217,0.003582,0.000864,0.000403,0.000357,0.000565,...,0.000850,0.000520,0.001755,0.000416,0.000597,0.001420,0.002135,0.000425,0.001673,0.0
398,0.000941,0.000918,0.001565,0.000217,0.000886,0.002361,0.000633,0.000353,0.000204,0.000403,...,0.000583,0.000348,0.001393,0.000303,0.000384,0.000801,0.001615,0.000267,0.000995,0.0


In [61]:
# Separate the BSIF table into the target vector (last column) and the
# feature matrix (every other column).
features_bsif = dataset_bsif.drop(columns=[dataset_bsif_len])

y_bsif = dataset_bsif.loc[:, dataset_bsif_len].values.ravel()
X_bsif = features_bsif.values
X_bsif_columns = features_bsif.columns

# Distinct class labels rendered as strings.
y_bsif_classes = np.unique(y_bsif).astype(str)

X_bsif

array([[0.001981, 0.001687, 0.003587, ..., 0.002741, 0.000366, 0.001312],
       [0.003424, 0.002628, 0.004432, ..., 0.003582, 0.000905, 0.002528],
       [0.002677, 0.003333, 0.004821, ..., 0.00346 , 0.000972, 0.002573],
       ...,
       [0.001841, 0.001764, 0.002646, ..., 0.002135, 0.000425, 0.001673],
       [0.000941, 0.000918, 0.001565, ..., 0.001615, 0.000267, 0.000995],
       [0.001081, 0.001063, 0.001958, ..., 0.001646, 0.000339, 0.001076]])

# Separando dataset com características do oBIFs em dados de treino e teste

In [31]:
# Hold out 20% of the oBIFs samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
(
    X_train_obifs,
    X_test_obifs,
    y_train_obifs,
    y_test_obifs,
) = train_test_split(
    X_obifs,
    y_obifs,
    test_size=0.2,
    random_state=1,
    stratify=y_obifs,
)

X_test_obifs.shape

(80, 400)

# Normalizando os dados extraídos com o oBIFs

In [32]:
# Standardize the oBIFs features. The scaler is fitted on the training split
# only and then applied to both splits.
ss_obifs = StandardScaler().fit(X_train_obifs)
X_train_obifs_ss, X_test_obifs_ss = (
    ss_obifs.transform(split) for split in (X_train_obifs, X_test_obifs)
)

X_test_obifs_ss

array([[-0.46910984, -0.45722715, -0.43792238, ..., -0.45348792,
        -0.38273662, -0.41457335],
       [-0.3925544 , -0.39568835, -0.38135155, ..., -0.41262165,
        -0.42472761, -0.38815354],
       [-0.28980894, -0.32435929, -0.32741192, ..., -0.30948296,
        -0.31075207, -0.34011752],
       ...,
       [-0.48724139, -0.5117729 , -0.49975562, ..., -0.48462413,
        -0.50571023, -0.50824358],
       [-0.49328524, -0.51876595, -0.50238682, ..., -0.49630021,
        -0.51170895, -0.50824358],
       [-0.46508061, -0.50757708, -0.4616032 , ..., -0.49630021,
        -0.49971152, -0.50343998]])

# Utilizando o SVM para classificação oBIFs

In [33]:
## SVM with grid search (oBIFs features)
# Power-of-two search grids: C = 2^-5, 2^-3, ..., 2^13 and gamma = 2^3, 2^1, ..., 2^-13.
C_range = 2. ** np.arange(-5,15,2)
gamma_range = 2. ** np.arange(3,-15,-2)

# RBF-kernel classifier with probability estimates enabled.
srv = SVC(probability=True, kernel='rbf')

param_grid = {
   'C' : C_range,
   'gamma' : gamma_range
}

# Exhaustive search over the grid, using all available cores.
grid = GridSearchCV(srv, param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_obifs_ss, y_train_obifs)

# Retrieve the best model found by the search and evaluate it on the
# standardized held-out test split.
model = grid.best_estimator_
print(grid.best_params_)
pred_svm = model.predict(X_test_obifs_ss)
print(classification_report(y_test_obifs, pred_svm))

Fitting 5 folds for each of 90 candidates, totalling 450 fits
{'C': 512.0, 'gamma': 0.001953125}
              precision    recall  f1-score   support

         0.0       0.59      0.65      0.62        40
         1.0       0.61      0.55      0.58        40

    accuracy                           0.60        80
   macro avg       0.60      0.60      0.60        80
weighted avg       0.60      0.60      0.60        80



# Usando K-NN com os dados extraídos com o oBIFs

In [34]:
# K-NN with grid search (oBIFs features): odd neighbourhood sizes from 1 to 19.
param_grid = {
   'n_neighbors' : [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
}

grid = GridSearchCV(KNN(), param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_obifs_ss, y_train_obifs)

# Retrieve the best model found by the search and evaluate it on the
# standardized held-out test split. The predictions get their own name so
# they do not overwrite the SVM predictions stored in `pred_svm`.
model = grid.best_estimator_
print(grid.best_params_)
pred_knn = model.predict(X_test_obifs_ss)
print(classification_report(y_test_obifs, pred_knn))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_neighbors': 9}
              precision    recall  f1-score   support

         0.0       0.49      0.47      0.48        40
         1.0       0.49      0.50      0.49        40

    accuracy                           0.49        80
   macro avg       0.49      0.49      0.49        80
weighted avg       0.49      0.49      0.49        80



# -------------------------------------------------------------------------------------------------------------------------------------------------

# Separando dataset com características do BSIF em dados de treino e teste

In [35]:
# Hold out 20% of the BSIF samples for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
(
    X_train_bsif,
    X_test_bsif,
    y_train_bsif,
    y_test_bsif,
) = train_test_split(
    X_bsif,
    y_bsif,
    test_size=0.2,
    random_state=1,
    stratify=y_bsif,
)

X_test_bsif.shape

(80, 256)

# Normalizando os dados extraídos com o BSIF

In [36]:
# Standardize the BSIF features. The scaler is fitted on the training split
# only and then applied to both splits.
ss_bsif = StandardScaler().fit(X_train_bsif)
X_train_bsif_ss, X_test_bsif_ss = (
    ss_bsif.transform(split) for split in (X_train_bsif, X_test_bsif)
)

X_test_bsif_ss

array([[-3.57196274e-01, -9.00185561e-03, -4.92316174e-02, ...,
        -2.71271786e-01,  3.75679006e-01, -1.24742753e-01],
       [ 4.99520213e-01, -1.18477611e-03,  2.86742197e-01, ...,
         4.95193390e-01,  7.75546421e-01,  6.46006274e-04],
       [ 2.87333881e+00,  1.76938373e+00,  2.42475738e+00, ...,
         9.79024533e-01,  7.18422504e-01,  1.48158891e+00],
       ...,
       [-6.17270922e-01, -6.26551136e-01, -3.32264103e-01, ...,
        -1.51511603e-01, -6.23989530e-01, -4.73797946e-01],
       [ 2.67492831e-01,  2.60687387e-01,  4.03613997e-02, ...,
         3.70642799e-01, -5.27503665e-02,  5.59812093e-01],
       [-2.52656465e-01, -9.49897301e-02,  2.20355553e-02, ...,
         6.86809684e-01,  7.75546421e-01, -1.55242721e-01]])

# Utilizando o SVM para classificação BSIF

In [37]:
## SVM with grid search (BSIF features)
# Power-of-two search grids: C = 2^-5, 2^-3, ..., 2^13 and gamma = 2^3, 2^1, ..., 2^-13.
C_range = 2. ** np.arange(-5,15,2)
gamma_range = 2. ** np.arange(3,-15,-2)

# RBF-kernel classifier with probability estimates enabled.
srv = SVC(probability=True, kernel='rbf')

param_grid = {
   'C' : C_range,
   'gamma' : gamma_range
}

# Exhaustive search over the grid, using all available cores.
grid = GridSearchCV(srv, param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_bsif_ss, y_train_bsif)

# Retrieve the best model found by the search and evaluate it on the
# standardized held-out test split.
model = grid.best_estimator_
print(grid.best_params_)
pred_svm = model.predict(X_test_bsif_ss)
print(classification_report(y_test_bsif, pred_svm))

Fitting 5 folds for each of 90 candidates, totalling 450 fits
{'C': 8.0, 'gamma': 0.001953125}
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        40
           1       0.93      0.95      0.94        40

    accuracy                           0.94        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80



# Usando K-NN com os dados extraídos com o BSIF

In [38]:
# K-NN with grid search (BSIF features): odd neighbourhood sizes from 1 to 19.
param_grid = {
   'n_neighbors' : [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
}

grid = GridSearchCV(KNN(), param_grid, n_jobs=-1, verbose=True)
grid.fit(X_train_bsif_ss, y_train_bsif)

# Retrieve the best model found by the search and evaluate it on the
# standardized held-out test split. The predictions get their own name so
# they do not overwrite the SVM predictions stored in `pred_svm`.
model = grid.best_estimator_
print(grid.best_params_)
pred_knn = model.predict(X_test_bsif_ss)
print(classification_report(y_test_bsif, pred_knn))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_neighbors': 1}
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        40
           1       0.97      0.95      0.96        40

    accuracy                           0.96        80
   macro avg       0.96      0.96      0.96        80
weighted avg       0.96      0.96      0.96        80

