In [2]:
import pandas as pd
import json

In [3]:
# Read the metrics mapping from disk.
with open('metrics.json', 'r') as f:
    data = json.load(f)

# One row per entry of the mapping: reset_index() moves the keys into an
# `index` column, next to the `Value` column holding the entries' values.
metrics_frame = pd.DataFrame.from_dict(data, orient='index', columns=['Value'])
df = metrics_frame.reset_index()

In [4]:
# Display the freshly loaded frame (columns `index` and `Value`) as the cell output.
df

Unnamed: 0,index,Value
0,hog_128_16_kfold_k=1,0.599035
1,hog_128_16_holdout_k=1,0.550000
2,hog_128_16_kfold_k=3,0.560206
3,hog_128_16_holdout_k=3,0.516667
4,hog_128_16_kfold_k=5,0.541361
...,...,...
235,cnn_19_max_256_holdout_k=15,0.637500
236,cnn_19_max_256_kfold_k=17,0.600348
237,cnn_19_max_256_holdout_k=17,0.629167
238,cnn_19_max_256_kfold_k=19,0.605332


In [5]:
# Short feature-set keys (as they appear in the metric keys) -> full display names.
base_name_map = {
    "hog_128_16": "HOG_128_16x16",
    "hog_128_20": "HOG_128_20x20",
    "hog_256_16": "HOG_256_16x16",
    "hog_256_20": "HOG_256_20x20",
    "cnn_16_avg_128": "CNN_VGG16_128_avg",
    "cnn_16_max_128": "CNN_VGG16_128_max",
    "cnn_19_avg_128": "CNN_VGG19_128_avg",
    "cnn_19_max_128": "CNN_VGG19_128_max",
    "cnn_16_avg_256": "CNN_VGG16_256_avg",
    "cnn_16_max_256": "CNN_VGG16_256_max",
    "cnn_19_avg_256": "CNN_VGG19_256_avg",
    "cnn_19_max_256": "CNN_VGG19_256_max",
}

# Evaluation-type tokens in the metric keys -> descriptive labels.
type_name_map = {'kfold': '10-fold CV', 'holdout': '70/30'}

# Split every key (e.g. "hog_128_16_kfold_k=1") into its base, type and k parts.
parsed = df['index'].str.extract(r'^(.*?)_(kfold|holdout)_k=(\d+)$')
parsed.columns = ['Base', 'Type', 'k']

# Keys that do not match the pattern come back as NaN; report them by name
# instead of failing later with an opaque NaN-to-int conversion error.
unparsed_keys = df.loc[parsed.isna().any(axis=1), 'index'].tolist()
if unparsed_keys:
    raise ValueError(f"Metric keys that do not match the expected pattern: {unparsed_keys}")

# A base without an entry in base_name_map would map to NaN, and NaN group
# keys are left out of the pivot below, so such rows would vanish unnoticed.
unknown_bases = sorted(set(parsed['Base']) - set(base_name_map))
if unknown_bases:
    raise ValueError(f"Bases missing from base_name_map: {unknown_bases}")

# Attach the parsed columns. The original `index` column is deliberately kept:
# every derived column is recomputed from it, so this cell can be re-run
# safely, and pivot_table below only reads the columns it is told to use.
df = df.assign(
    Base=parsed['Base'].map(base_name_map),
    Type=parsed['Type'].map(type_name_map),
    k=parsed['k'].astype(int),
)

# Reshape to one row per (Base, Type) and one column per k. pivot_table
# aggregates with the mean by default if a (Base, Type, k) cell had several rows.
pivot_df = (
    df.pivot_table(index=['Base', 'Type'], columns='k', values='Value')
    .reset_index()
    .rename_axis(None, axis=1)  # drop the "k" label from the column axis
    .round(2)                   # keep two decimals in the numeric cells
)

pivot_df

Unnamed: 0,Base,Type,1,3,5,7,9,11,13,15,17,19
0,CNN_VGG16_128_avg,10-fold CV,0.58,0.57,0.56,0.56,0.57,0.58,0.56,0.56,0.56,0.58
1,CNN_VGG16_128_avg,70/30,0.54,0.6,0.6,0.61,0.6,0.56,0.57,0.57,0.57,0.57
2,CNN_VGG16_128_max,10-fold CV,0.54,0.53,0.56,0.57,0.58,0.57,0.55,0.55,0.57,0.57
3,CNN_VGG16_128_max,70/30,0.54,0.56,0.62,0.62,0.6,0.56,0.58,0.58,0.57,0.56
4,CNN_VGG16_256_avg,10-fold CV,0.61,0.65,0.66,0.66,0.65,0.65,0.64,0.64,0.64,0.67
5,CNN_VGG16_256_avg,70/30,0.6,0.62,0.68,0.65,0.65,0.67,0.68,0.7,0.71,0.7
6,CNN_VGG16_256_max,10-fold CV,0.59,0.56,0.57,0.57,0.59,0.61,0.6,0.61,0.6,0.61
7,CNN_VGG16_256_max,70/30,0.58,0.52,0.52,0.58,0.6,0.6,0.57,0.62,0.57,0.58
8,CNN_VGG19_128_avg,10-fold CV,0.54,0.53,0.56,0.55,0.58,0.58,0.57,0.58,0.56,0.58
9,CNN_VGG19_128_avg,70/30,0.57,0.57,0.55,0.57,0.59,0.61,0.59,0.61,0.59,0.57


In [6]:
# Group by 'Base' and average the "70/30" and "10-fold CV" rows, column by column.
# NOTE(review): pivot_df was rounded to 2 decimals when it was built, so the
# ranking below is computed from rounded scores.
grouped_df = pivot_df.drop(columns=['Type']).groupby('Base').mean()

# Collapse the per-k averages into a single mean per Base and sort best-first.
final_mean_df = (
    grouped_df.mean(axis=1)
    .rename('Mean')
    .reset_index()
    .sort_values(by='Mean', ascending=False)
)

# Keep the names of the 6 best bases.
top_6_bases = final_mean_df['Base'].iloc[:6]

final_mean_df


Unnamed: 0,Base,Mean
2,CNN_VGG16_256_avg,0.6565
7,CNN_VGG19_256_max,0.605
6,CNN_VGG19_256_avg,0.595
3,CNN_VGG16_256_max,0.5825
0,CNN_VGG16_128_avg,0.5735
4,CNN_VGG19_128_avg,0.5725
1,CNN_VGG16_128_max,0.569
5,CNN_VGG19_128_max,0.565
9,HOG_128_20x20,0.5495
8,HOG_128_16x16,0.512


In [7]:
# Write pivot_df to metricas.csv, leaving the row index out of the file.
pivot_df.to_csv(path_or_buf='metricas.csv', index=False)

In [8]:
# Inverse of base_name_map: full display name -> short key.
# Derived from base_name_map rather than written out by hand so the two
# mappings cannot drift apart when a base is added or renamed.
base_name_map_inv = {full_name: short_key for short_key, full_name in base_name_map.items()}

# Two short keys sharing one display name would collapse into a single entry.
if len(base_name_map_inv) != len(base_name_map):
    raise ValueError("base_name_map contains duplicate display names and cannot be inverted")

# Load the feature sets; a later cell looks them up in `data` by short key.
with open('features.json', 'r') as f:
    data = json.load(f)


In [9]:
### Standardizing the dataset
from sklearn.preprocessing import StandardScaler

# Translate the top-ranked display names back into the short keys used to index `data`.
bases = [base_name_map_inv[display_name] for display_name in top_6_bases.to_list()]

features = {}
features_std = {}
for base in bases:
    features[base] = data[base]
    # A new StandardScaler is fitted for every base, so each feature set is
    # standardized on its own.
    features_std[base] = StandardScaler().fit_transform(features[base])

In [10]:
### Importing PCA from scikit-learn
from sklearn.decomposition import PCA

# Number of principal components kept for every base.
N_PCA_COMPONENTS = 10
# Fixed seed: with the default svd_solver='auto', scikit-learn may select the
# randomized SVD solver depending on the input shape, and without a seed the
# projected features would then differ from run to run.
PCA_RANDOM_STATE = 42

# Apply PCA to each of the bases present in features_std.
# fit_transform refits the estimator on every call, so after this cell `pca`
# only holds the fit of the last base processed.
pca = PCA(n_components=N_PCA_COMPONENTS, whiten=True, random_state=PCA_RANDOM_STATE)

features_pca_dict = {base + '_pca': pca.fit_transform(features_std[base]) for base in bases}

In [11]:
import numpy as np

# Function to convert numpy values to plain Python ones in a nested structure
def convert_ndarrays(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Parameters
    ----------
    obj : any
        A NumPy array, a NumPy scalar, a dict, a list, a tuple, or any other
        object. Dicts, lists and tuples are traversed recursively.

    Returns
    -------
    any
        * ``np.ndarray`` -> nested Python list (``ndarray.tolist()``)
        * NumPy scalar (``np.generic``, e.g. ``np.float32``) -> Python scalar
        * dict -> new dict with the same keys and converted values
        * list / tuple -> new list / tuple with converted items
        * anything else -> returned unchanged
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # NumPy scalars such as np.float32 / np.int64 are rejected by json.dump;
        # .item() yields the equivalent built-in Python value.
        return obj.item()
    if isinstance(obj, dict):
        return {k: convert_ndarrays(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_ndarrays(i) for i in obj]
    if isinstance(obj, tuple):
        # Arrays nested in tuples must be converted too; keep the container a tuple.
        return tuple(convert_ndarrays(i) for i in obj)
    return obj

# Destination of the PCA-reduced features; also used in the confirmation
# message so the message always names the file that was actually written.
PCA_FEATURES_PATH = 'features_pca.json'

# Convert all ndarrays to lists so the dictionary is JSON-serializable
serializable_pca_features_dict = convert_ndarrays(features_pca_dict)

# Write the dictionary to JSON
with open(PCA_FEATURES_PATH, 'w') as f:
    json.dump(serializable_pca_features_dict, f, indent=4)

print(f"Data saved to {PCA_FEATURES_PATH}")

Data saved to features.json
