In [1]:
import os
import warnings
os.environ['OMP_NUM_THREADS'] = '1'
warnings.filterwarnings("ignore", category=FutureWarning)

import re
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from itertools import product
import json


# Inicializa√ß√µes do NLTK
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Pr√©-processamento de texto
preprocess_methods = {
    'raw': lambda x: x,
    'clean': lambda text: re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', re.sub(r'[^\x00-\x7F]+', '', text.lower()))),
    'stopwords': lambda text: ' '.join([word for word in text.split() if word not in stop_words]),
    'lemmatization': lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]),
    'stopwords_lemmatization': lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]),

    'clean_stopwords': lambda text: ' '.join(
        [word for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split() if word not in stop_words]
    ),

    'clean_lemmatization': lambda text: ' '.join(
        [lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split()]
    ),

    'clean_stopwords_lemmatization': lambda text: ' '.join(
        [lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split() if word not in stop_words]
    )
}

# Gerar embeddings
def get_text_features(texts, model_name):
    model = SentenceTransformer(model_name)
    return model.encode(texts, show_progress_bar=True)

# Fun√ß√£o para salvar a matriz de confus√£o como imagem
def save_confusion_matrix(cm, class_names, output_path, title):
    plt.figure(figsize=(12, 8))
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.colorbar()
    plt.xticks(np.arange(len(class_names)), class_names, rotation=45, ha='right')
    plt.yticks(np.arange(len(class_names)), class_names)

    threshold = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > threshold else "black")

    plt.title(title, fontsize=16)
    plt.xlabel("Predicted", fontsize=12)
    plt.ylabel("True Labels", fontsize=12)

    plt.tight_layout()

    plt.savefig(output_path)
    plt.close()

# Salvar relat√≥rio de classifica√ß√£o em CSV
def save_classification_metrics(y_true, y_pred, class_names, output_path):
    report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report).transpose()
    df_report.to_csv(output_path, index=True)

# Treinar KMeans com inicializa√ß√£o pelos centr√≥ides m√©dios de cada classe
def train_kmeans(train_data, init_centroids):
    kmeans_instance = KMeans(n_clusters=len(init_centroids), n_init=1, init=init_centroids, random_state=42)
    kmeans_instance.fit(train_data)
    return kmeans_instance

# Testar o modelo KMeans
def test_kmeans(kmeans, test_data):
    return kmeans.predict(test_data)

def run_kmeans_cv(X, y, k_folds, seed, output_base_dir, le):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    y_true_general, y_pred_general = [], []
    results = {}
    classes = np.unique(y)

    for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #init_centroids = np.array([X_train[y_train == i].mean(axis=0) for i in np.unique(y_train)])
        init_centroids = np.array(pd.DataFrame(X_train).groupby(y_train).mean())
        kmeans_instance = train_kmeans(X_train, init_centroids)
        y_pred = test_kmeans(kmeans_instance, X_test)
        
        fold_dir = os.path.join(output_base_dir, f'{k_folds}_folds/fold_{fold_idx + 1}')
        os.makedirs(fold_dir, exist_ok=True)    

        cm = confusion_matrix(y_test, y_pred, labels=classes)
        save_confusion_matrix(cm, le.classes_, os.path.join(fold_dir, 'confusion_matrix.png'), f"Fold {fold_idx + 1}")
        save_classification_metrics(y_test, y_pred, le.classes_, os.path.join(fold_dir, 'classification_metrics.csv'))

        results[f'fold_{fold_idx + 1}'] =  accuracy_score(y_test, y_pred)
        y_true_general.extend(y_test)
        y_pred_general.extend(y_pred)

    cm = confusion_matrix(y_true_general, y_pred_general, labels=classes)
    save_confusion_matrix(cm, le.classes_, os.path.join(output_base_dir, f'{k_folds}_folds/confusion_matrix.png'), f"Geral - {k_folds}_folds")
    save_classification_metrics(y_true_general, y_pred_general, le.classes_, os.path.join(output_base_dir, f'{k_folds}_folds/classification_metrics.csv'))

    overall_accuracy = accuracy_score(y_true_general, y_pred_general)
    results['overall_accuracy'] = overall_accuracy
    return results

In [None]:
dataset_csv_path      = 'C:/Users/samue/Downloads/dataset_v2_atributos_25.csv'
data = pd.read_csv(dataset_csv_path, header=0, sep=';')
data['category'] = data['category'].str.replace('_', '-')
data = data[data['category'].notnull()].reset_index(drop=True)
data = data[~data['category'].isin(['domain-names', 'sports-collectibles'])]

# Filtrar data2 para manter apenas as linhas cujas 'slug' est√£o presentes em data
slugs_data = set(data['slug'])

# Principal
if __name__ == "__main__":
    #data = pd.read_csv("C:/Users/samue/Downloads/df_with_description.csv")
    data = pd.read_csv("C:/Users/samue/Downloads/dataset_description.csv")
    #data = pd.read_csv("C:/Users/samue/Downloads/dataset_description_25.csv")
    data = data[~data['category'].isin(['domain-names', 'sports-collectibles'])]
    data = data[data['slug'] != 'panoramic-portraits'].reset_index(drop=True)
    #data = data.groupby('category').head(25).reset_index(drop=True)

    data_filtrado = data[data['slug'].isin(slugs_data)].reset_index(drop=True)
    data = data_filtrado.copy()

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data['category'])

    embedding_models = [
        "paraphrase-multilingual-MiniLM-L12-v2",
        "all-MiniLM-L6-v2"
    ]

    #output_base = "C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys"
    output_base = "C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set"

    k_folds = [5, 10] 
    seed = 42
    results = {}
    for (prep_name, prep_func), model_name in product(preprocess_methods.items(), embedding_models):

        dir_name = f"{prep_name}_{model_name.replace('/', '_')}"
        output_dir = os.path.join(output_base, dir_name)
        os.makedirs(output_dir, exist_ok=True)

        print(f"‚ö†Ô∏èProcessando: {prep_name} | Modelo: {model_name}")
        processed_text = data['description'].fillna('').apply(prep_func)
        embeddings = get_text_features(processed_text, model_name)
        
        np.save(os.path.join(output_dir, 'embeddings.npy'), embeddings)
        print(f'‚úÖMatriz embeddings Salva! >>> {output_dir}')

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(embeddings)
        for fold in k_folds:
            results[str(fold) + '_folds'] = run_kmeans_cv(X_scaled, y, fold, seed, output_dir, label_encoder)
            print(f'| - {fold}_folds >>> accuracy: {results[str(fold) + "_folds"]["overall_accuracy"]}')

        nome_arquivo = f'{output_dir}/metrics.json'
        with open(nome_arquivo, 'w') as arquivo_json:
            json.dump(results, arquivo_json, ensure_ascii=False, indent=4)

        print(f"‚úÖDicion√°rio salvo em {nome_arquivo}")
        print(f"‚úÖProcessamento conclu√≠do!\n")


‚ö†Ô∏èProcessando: raw | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\raw_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5775075987841946
| - 10_folds >>> accuracy: 0.5820668693009119
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\raw_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: raw | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\raw_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.6033434650455927
| - 10_folds >>> accuracy: 0.6033434650455927
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\raw_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.6003039513677811
| - 10_folds >>> accuracy: 0.5927051671732523
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5957446808510638
| - 10_folds >>> accuracy: 0.6003039513677811
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: stopwords | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5927051671732523
| - 10_folds >>> accuracy: 0.601823708206687
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: stopwords | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5866261398176292
| - 10_folds >>> accuracy: 0.5987841945288754
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: lemmatization | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\lemmatization_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5668693009118541
| - 10_folds >>> accuracy: 0.574468085106383
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\lemmatization_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: lemmatization | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\lemmatization_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5942249240121581
| - 10_folds >>> accuracy: 0.5927051671732523
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\lemmatization_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: stopwords_lemmatization | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_lemmatization_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5896656534954408
| - 10_folds >>> accuracy: 0.5820668693009119
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_lemmatization_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: stopwords_lemmatization | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_lemmatization_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5927051671732523
| - 10_folds >>> accuracy: 0.60790273556231
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\stopwords_lemmatization_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_stopwords | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5957446808510638
| - 10_folds >>> accuracy: 0.5957446808510638
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_stopwords | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5775075987841946
| - 10_folds >>> accuracy: 0.5790273556231003
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_lemmatization | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_lemmatization_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5987841945288754
| - 10_folds >>> accuracy: 0.5896656534954408
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_lemmatization_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_lemmatization | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_lemmatization_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.6048632218844985
| - 10_folds >>> accuracy: 0.5987841945288754
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_lemmatization_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_stopwords_lemmatization | Modelo: paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_lemmatization_paraphrase-multilingual-MiniLM-L12-v2
| - 5_folds >>> accuracy: 0.5927051671732523
| - 10_folds >>> accuracy: 0.5881458966565349
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_lemmatization_paraphrase-multilingual-MiniLM-L12-v2/metrics.json
‚úÖProcessamento conclu√≠do!

‚ö†Ô∏èProcessando: clean_stopwords_lemmatization | Modelo: all-MiniLM-L6-v2


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖMatriz embeddings Salva! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_lemmatization_all-MiniLM-L6-v2
| - 5_folds >>> accuracy: 0.5835866261398176
| - 10_folds >>> accuracy: 0.5790273556231003
‚úÖDicion√°rio salvo em C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys_set\clean_stopwords_lemmatization_all-MiniLM-L6-v2/metrics.json
‚úÖProcessamento conclu√≠do!



In [25]:
data

Unnamed: 0,Zname,slug,category,description
0,Bored Ape Yacht Club,boredapeyachtclub,pfps,The Bored Ape Yacht Club is a collection of 10...
1,CryptoPunks,cryptopunks,pfps,"CryptoPunks launched as a fixed set of 10,000 ..."
2,Mutant Ape Yacht Club,mutant-ape-yacht-club,pfps,The MUTANT APE YACHT CLUB is a collection of u...
3,Azuki,azuki,pfps,Take the red bean to join the garden. View the...
4,CLONE X - X TAKASHI MURAKAMI,clonex,pfps,"üß¨ CLONE X üß¨\n\n20,000 next-gen Avatars, by RTF..."
...,...,...,...,...
653,Eric Rubens Editions,eric-rubens-editions,photography,Eric Rubens editions
654,Skateboarding: Culture in Motion,scim,photography,‚ÄúSkateboarding: Culture in Motion‚Äù is a collec...
655,Afromythology by Shawn Theodore,afromythology-by-shawn-theodore,photography,"Bucking traditional photographic formalism, Sh..."
656,NYC Underground Stories by Monaris,nycus,photography,Picture the New York Subway and the millions o...


In [12]:
import os
import json
import pandas as pd

# Caminho base onde est√£o as pastas
base_path = "C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys"

# Listas dos tipos de pr√©-processamento e modelos
preprocess_methods = [
    'clean_stopwords_lemmatization', 'clean_stopwords', 'clean_lemmatization',
    'stopwords_lemmatization', 'clean', 'stopwords', 'lemmatization', 'raw']
    

embedding_models = [
    "paraphrase-multilingual-MiniLM-L12-v2",
    "all-MiniLM-L6-v2"
]

# Lista para armazenar os resultados
results = []

# Iterar pelas pastas dentro do diret√≥rio base
for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)
    metrics_file = os.path.join(folder_path, 'metrics.json')
    
    if os.path.isdir(folder_path) and os.path.exists(metrics_file):
        # Identificar o tipo de pr√©-processamento e o modelo
        preprocess = next((p for p in preprocess_methods if p in folder), 'Desconhecido')
        model = next((m for m in embedding_models if m in folder), 'Desconhecido')
        
        # Ler o arquivo metrics.json
        with open(metrics_file, 'r') as f:
            data = json.load(f)
        
        # Adicionar cada resultado ao DataFrame
        for kfold, values in data.items():
            if 'overall_accuracy' in values:
                results.append({
                    'modelos': model,
                    'preprocessamento': preprocess,
                    'kfolds': kfold,
                    'acuracia': values['overall_accuracy']
                })

# Criar DataFrame
df_results = pd.DataFrame(results)
df_results.sort_values(by=['acuracia', 'modelos', 'preprocessamento', 'kfolds'], ascending=False, inplace=True)
df_results

Unnamed: 0,modelos,preprocessamento,kfolds,acuracia
24,all-MiniLM-L6-v2,stopwords,5_folds,0.594993
31,paraphrase-multilingual-MiniLM-L12-v2,stopwords,10_folds,0.59352
26,all-MiniLM-L6-v2,stopwords_lemmatization,5_folds,0.59352
27,all-MiniLM-L6-v2,stopwords_lemmatization,10_folds,0.592047
20,all-MiniLM-L6-v2,raw,5_folds,0.592047
30,paraphrase-multilingual-MiniLM-L12-v2,stopwords,5_folds,0.589102
25,all-MiniLM-L6-v2,stopwords,10_folds,0.589102
2,all-MiniLM-L6-v2,clean_lemmatization,5_folds,0.586156
29,paraphrase-multilingual-MiniLM-L12-v2,stopwords_lemmatization,10_folds,0.584683
15,paraphrase-multilingual-MiniLM-L12-v2,clean_stopwords,10_folds,0.584683


In [13]:
# Salvar o resultado em um arquivo CSV
output_path = "C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys/results_summary.csv"
df_results.to_csv(output_path, index=False)

print(f"‚úÖ Resultados organizados e salvos com sucesso em: {output_path}")

‚úÖ Resultados organizados e salvos com sucesso em: C:/Users/samue/Downloads/NFT25/results_dynamic_7categorys/results_summary.csv


### C√≥dido Dayan

In [21]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

def create_dirs(base_dir, sub_dirs):
    for sub_dir in sub_dirs:
        os.makedirs(os.path.join(base_dir, sub_dir), exist_ok=True)

def save_confusion_matrix(cm, class_names, output_path, title):
    plt.figure(figsize=(14, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(title, fontsize=12)
    plt.xticks(rotation=45)
    plt.xlabel("Predicted")
    plt.ylabel("True Labels")
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

def save_classification_metrics(y_true, y_pred, class_names, output_path):
    report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report).transpose()
    df_report.to_csv(output_path, index=True)

def train_kmeans(train_data, init_centroids):
    kmeans_instance = KMeans(n_clusters=9, n_init=1, init=init_centroids, random_state=42)
    kmeans_instance.fit(train_data)
    return kmeans_instance

def test_kmeans(kmeans, test_data):
    return kmeans.predict(test_data)

def run_kmeans_cv(X, y, k_folds, seed, output_base_dir, fold_col_suffix):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    classes = np.unique(y)
    y_true_general, y_pred_general = [], []

    for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Treinar o KMeans
        init_centroids = np.array(pd.DataFrame(X_train).groupby(y_train).mean())
        kmeans_instance = train_kmeans(X_train, init_centroids)
        y_pred = test_kmeans(kmeans_instance, X_test)

        y_true_general.extend(y_test)
        y_pred_general.extend(y_pred)

        # Salvar matriz de confus√£o por fold
        cm = confusion_matrix(y_test, y_pred, labels=classes)
        fold_dir = os.path.join(output_base_dir, f'fold_{fold_idx + 1}')
        os.makedirs(fold_dir, exist_ok=True)
        save_confusion_matrix(cm, le.classes_, os.path.join(fold_dir, 'confusion_matrix.png'), f"Fold {fold_idx + 1}")

        # Salvar m√©tricas de classifica√ß√£o
        save_classification_metrics(y_test, y_pred, le.classes_, os.path.join(fold_dir, 'classification_metrics.csv'))

    # Matriz de confus√£o geral
    cm_general = confusion_matrix(y_true_general, y_pred_general, labels=classes)
    save_confusion_matrix(cm_general, le.classes_, os.path.join(output_base_dir, 'general_confusion_matrix.png'), "Geral")
    save_classification_metrics(y_true_general, y_pred_general, le.classes_, os.path.join(output_base_dir, 'general_classification_metrics.csv'))

# Carregar os dados
columns = ['total_volume', 'total_sales', 'total_supply', 'num_owners', 'average_price', 'market_cap', 'qtd_traits', 'qtd_editors', 'category']
df_path_710 = 'C:/Users/samue/Downloads/dataset_v2_atributos.csv'
data_710 = pd.read_csv(df_path_710)

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_710[columns[:-1]])
le = LabelEncoder()
y = le.fit_transform(data_710['category'])

# Definir sa√≠da
output_base_dir = "C:/Users/samue/Downloads/NFT25/dayan"
create_dirs(output_base_dir, ['3_folds', '5_folds', '10_folds'])

# Executar KMeans com 3, 5 e 10 folds
print("Executando KMeans com 3 folds...")
run_kmeans_cv(data_scaled, y, 3, 42, os.path.join(output_base_dir, '3_folds'), '3fold')

print("Executando KMeans com 5 folds...")
run_kmeans_cv(data_scaled, y, 5, 42, os.path.join(output_base_dir, '5_folds'), '5fold')

print("Executando KMeans com 10 folds...")
run_kmeans_cv(data_scaled, y, 10, 42, os.path.join(output_base_dir, '10_folds'), '10fold')

print("Processamento conclu√≠do!")


Executando KMeans com 3 folds...
Executando KMeans com 5 folds...
Executando KMeans com 10 folds...
Processamento conclu√≠do!


### Oversampling simples e smote

In [2]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

In [8]:
# Pr√©-processamento de texto
preprocess_methods = {
    'raw': lambda x: x,
    'clean': lambda text: re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', re.sub(r'[^\x00-\x7F]+', '', text.lower()))),
    'stopwords': lambda text: ' '.join([word for word in text.split() if word not in stop_words]),
    'lemmatization': lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]),
    'stopwords_lemmatization': lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]),

    'clean_stopwords': lambda text: ' '.join(
        [word for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split() if word not in stop_words]
    ),

    'clean_lemmatization': lambda text: ' '.join(
        [lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split()]
    ),

    'clean_stopwords_lemmatization': lambda text: ' '.join(
        [lemmatizer.lemmatize(word) for word in re.sub(r'[^\w\s]', '', 
        re.sub(r'\d+', '', 
        re.sub(r'[^\x00-\x7F]+', '', text.lower()))).split() if word not in stop_words]
    )
}

# Gerar embeddings
def get_text_features(texts, model_name):
    model = SentenceTransformer(model_name)
    return model.encode(texts, show_progress_bar=True)

# Fun√ß√£o para salvar a matriz de confus√£o como imagem
def save_confusion_matrix(cm, class_names, output_path, title):
    plt.figure(figsize=(12, 8))
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.colorbar()
    plt.xticks(np.arange(len(class_names)), class_names, rotation=45, ha='right')
    plt.yticks(np.arange(len(class_names)), class_names)

    threshold = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > threshold else "black")

    plt.title(title, fontsize=16)
    plt.xlabel("Predicted", fontsize=12)
    plt.ylabel("True Labels", fontsize=12)

    plt.tight_layout()

    plt.savefig(output_path)
    plt.close()

# Salvar relat√≥rio de classifica√ß√£o em CSV
def save_classification_metrics(y_true, y_pred, class_names, output_path):
    report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report).transpose()
    df_report.to_csv(output_path, index=True)

# Treinar KMeans com inicializa√ß√£o pelos centr√≥ides m√©dios de cada classe
def train_kmeans(train_data, init_centroids):
    kmeans_instance = KMeans(n_clusters=len(init_centroids), n_init=1, init=init_centroids, random_state=42)
    kmeans_instance.fit(train_data)
    return kmeans_instance

# Testar o modelo KMeans
def test_kmeans(kmeans, test_data):
    return kmeans.predict(test_data)

# Fun√ß√£o para gerar o gr√°fico
def plot_kmeans_clusters(X, kmeans):
    # Predi√ß√£o do KMeans nos dados
    labels = kmeans.labels_

    # Contagem de dados por cluster
    cluster_counts = np.bincount(labels)
    
    # Plotando gr√°fico de barras
    plt.figure(figsize=(10, 6))
    sns.barplot(x=np.arange(len(cluster_counts)), y=cluster_counts, palette='viridis')
    plt.title('N√∫mero de Dados por Cluster', fontsize=16)
    plt.xlabel('Cluster', fontsize=12)
    plt.ylabel('N√∫mero de Dados', fontsize=12)
    plt.show()

# Plotar gr√°fico de dispers√£o
def plot_kmeans_scatter(X, kmeans):
    # Predi√ß√£o do KMeans nos dados
    labels = kmeans.labels_

    plt.figure(figsize=(10, 8))
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', alpha=0.6)
    plt.title('Visualiza√ß√£o dos Clusters KMeans', fontsize=16)
    plt.xlabel('Feature 1', fontsize=12)
    plt.ylabel('Feature 2', fontsize=12)
    plt.colorbar(label='Cluster')
    plt.show()
    
def run_kmeans_cv(X, y, k_folds, seed, output_base_dir, le, smote):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    y_true_general, y_pred_general = [], []
    results = {}
    classes = np.unique(y)

    for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Passo 1: Treinar o KMeans nos dados de treino
        kmeans_initial = KMeans(n_clusters=3, random_state=seed)
        kmeans_initial.fit(X_train)
        cluster_labels = kmeans_initial.predict(X_train)

        # Plotar a quantidade de dados por cluster
        #plot_kmeans_clusters(X_train, kmeans_initial)
        
        # Plotar a visualiza√ß√£o dos clusters
        #plot_kmeans_scatter(X_train, kmeans_initial)

        # Passo 2: Filtrar os melhores clusters
        # Vamos filtrar os clusters com base na densidade ou na quantidade de pontos em cada cluster
        cluster_counts = np.bincount(cluster_labels)
        
        # Encontrar o √≠ndice do cluster com o menor n√∫mero de elementos
        smallest_cluster = np.argmin(cluster_counts)    
        #filtered_clusters = np.where(cluster_counts > 10)[0]  # Manter apenas os clusters com mais de 10 pontos

        # Excluir o cluster com o menor n√∫mero de elementos
        filter_mask = cluster_labels != smallest_cluster
        # Filtrar os dados de treino com base nos clusters selecionados
        #filter_mask = np.isin(cluster_labels, filtered_clusters)
        
        X_train_filtered = X_train[filter_mask]
        y_train_filtered = y_train[filter_mask]

        X_train = X_train_filtered
        y_train = y_train_filtered
        # Aplicar SMOTE apenas no conjunto de treinamento
        if smote:
            X_train, y_train = smote.fit_resample(X_train, y_train)

        # Treinar KMeans com dados balanceados
        init_centroids = np.array(pd.DataFrame(X_train).groupby(y_train).mean())
        kmeans_instance = train_kmeans(X_train, init_centroids)
        y_pred = test_kmeans(kmeans_instance, X_test)

        fold_dir = os.path.join(output_base_dir, f'{k_folds}_folds/fold_{fold_idx + 1}')
        os.makedirs(fold_dir, exist_ok=True)    

        cm = confusion_matrix(y_test, y_pred, labels=classes)
        save_confusion_matrix(cm, le.classes_, os.path.join(fold_dir, 'confusion_matrix.png'), f"Fold {fold_idx + 1}")
        save_classification_metrics(y_test, y_pred, le.classes_, os.path.join(fold_dir, 'classification_metrics.csv'))
    
        results[f'fold_{fold_idx + 1}'] =  accuracy_score(y_test, y_pred)
        y_true_general.extend(y_test)
        y_pred_general.extend(y_pred)

    cm = confusion_matrix(y_true_general, y_pred_general, labels=classes)
    save_confusion_matrix(cm, le.classes_, os.path.join(output_base_dir, f'{k_folds}_folds/confusion_matrix.png'), f"Geral - {k_folds}_folds")
    save_classification_metrics(y_true_general, y_pred_general, le.classes_, os.path.join(output_base_dir, f'{k_folds}_folds/classification_metrics.csv'))

    overall_accuracy = accuracy_score(y_true_general, y_pred_general)
    results['overall_accuracy'] = overall_accuracy
    return results

In [10]:
# Principal
if __name__ == "__main__":
    #data = pd.read_csv("C:/Users/samue/Downloads/df_with_description.csv")
    data = pd.read_csv("C:/Users/samue/Downloads/dataset_description.csv")
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data['category'])

    embedding_models = [
        "paraphrase-multilingual-MiniLM-L12-v2",
        "all-MiniLM-L6-v2"
    ]

    k_folds = [3, 5, 10] 
    seed = 42
    results = {}

    #Gerar Embbedings
    input_base = "C:/Users/samue/Downloads/NFT25/results_dynamic_2"
    #output_base = "C:/Users/samue/Downloads/NFT25/results_dynamic"
    
    #Carregar
    #smote = BorderlineSMOTE(random_state=seed)
    #output_base = "C:/Users/samue/Downloads/NFT25/results_dynamic_BorderlineSMOTE"

    smote = SMOTE(random_state=seed)
    output_base = "C:/Users/samue/Downloads/NFT25/results_dynamic_SMOTE_new2"

    for (prep_name, prep_func), model_name in product(preprocess_methods.items(), embedding_models):

        dir_name = f"{prep_name}_{model_name.replace('/', '_')}"
        input_dir = os.path.join(input_base, dir_name)
        output_dir = os.path.join(output_base, dir_name)
        os.makedirs(output_dir, exist_ok=True)

        print(f"‚ö†Ô∏èProcessando: {prep_name} | Modelo: {model_name}")
        processed_text = data['description'].fillna('').apply(prep_func)
        
        # Gerar Embbedings
        #embeddings = get_text_features(processed_text, model_name)
        #np.save(os.path.join(output_dir, 'embeddings.npy'), embeddings)
        #print(f'‚úÖMatriz embeddings Salva! >>> {output_dir}')

        # Carregar Embbedings
        embeddings_path = os.path.join(input_dir, 'embeddings.npy')        
        embeddings = np.load(embeddings_path)
        print(f'‚úÖMatriz embeddings carregada! >>> {embeddings_path}')
        print(f'‚úÖMatriz embeddings >>> {embeddings.shape}')
        print(f'‚úÖMatriz embeddings >>> {embeddings}')
        break


        '''
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(embeddings)

        for fold in k_folds:
            results[str(fold) + '_folds'] = run_kmeans_cv(X_scaled, y, fold, seed, output_dir, label_encoder, smote)
            print(f'| - {fold}_folds >>> accuracy: {results[str(fold) + "_folds"]["overall_accuracy"]}')

        nome_arquivo = f'{output_dir}/metrics.json'
        with open(nome_arquivo, 'w') as arquivo_json:
            json.dump(results, arquivo_json, ensure_ascii=False, indent=4)

        print(f"‚úÖDicion√°rio salvo em {nome_arquivo}")'''
        print(f"‚úÖProcessamento conclu√≠do!\n")


‚ö†Ô∏èProcessando: raw | Modelo: paraphrase-multilingual-MiniLM-L12-v2
‚úÖMatriz embeddings carregada! >>> C:/Users/samue/Downloads/NFT25/results_dynamic_2\raw_paraphrase-multilingual-MiniLM-L12-v2\embeddings.npy
‚úÖMatriz embeddings >>> (711, 384)
‚úÖMatriz embeddings >>> [[ 0.04778653  0.03305385  0.12186813 ...  0.09785338 -0.30351835
   0.14238296]
 [-0.2104857  -0.26380208 -0.21774286 ... -0.20872392 -0.04061646
   0.11463138]
 [-0.19266498 -0.07589609 -0.19777267 ...  0.08735441  0.17186333
   0.14984657]
 ...
 [ 0.09293503  0.14670089 -0.25660846 ...  0.20481679  0.00595592
  -0.02706998]
 [ 0.14858088 -0.1650946   0.15047547 ... -0.04498821 -0.40291464
  -0.10871606]
 [-0.17094752  0.06419861 -0.2678259  ... -0.04350909  0.10477987
  -0.03905095]]


In [5]:
# Listas dos tipos de pr√©-processamento e modelos
preprocess_methods_ = [
    'clean_stopwords_lemmatization', 'clean_stopwords', 'clean_lemmatization',
    'stopwords_lemmatization', 'clean', 'stopwords', 'lemmatization', 'raw']
    
embedding_models = [
    "paraphrase-multilingual-MiniLM-L12-v2",
    "all-MiniLM-L6-v2"
]

results = []
for folder in os.listdir(output_base):
    folder_path = os.path.join(output_base, folder)
    metrics_file = os.path.join(folder_path, 'metrics.json')
    
    if os.path.isdir(folder_path) and os.path.exists(metrics_file):
        preprocess = next((p for p in preprocess_methods_ if p in folder), 'Desconhecido')
        model = next((m for m in embedding_models if m in folder), 'Desconhecido')
        
        with open(metrics_file, 'r') as f:
            data = json.load(f)
        
        for kfold, values in data.items():
            if 'overall_accuracy' in values:
                results.append({
                    'modelos': model,
                    'preprocessamento': preprocess,
                    'kfolds': kfold,
                    'acuracia': values['overall_accuracy']
                })

# Criar DataFrame
df_results = pd.DataFrame(results)
df_results.sort_values(by=['modelos', 'preprocessamento', 'kfolds'], ascending=False, inplace=True)

# Salvar o resultado em um arquivo CSV
output_path = f"{output_base}/results_summary.csv"
df_results.to_csv(output_path, index=False)

print(f"‚úÖ Resultados organizados e salvos com sucesso em: {output_path}")

‚úÖ Resultados organizados e salvos com sucesso em: C:/Users/samue/Downloads/NFT25/results_dynamic_SMOTE_new/results_summary.csv


In [None]:
import pandas as pd

# Carregar os tr√™s arquivos CSV
normal = pd.read_csv("C:/Users/samue/Downloads/NFT25/results_dynamic/results_summary.csv")
smote = pd.read_csv("C:/Users/samue/Downloads/NFT25/results_dynamic_SMOTE/results_summary.csv")
borderlinesMOTE = pd.read_csv("C:/Users/samue/Downloads/NFT25/results_dynamic_BorderlineSMOTE/results_summary.csv")

# Adicionar as colunas de acur√°cia para cada dataset
normal['accBaseLine'] = normal['acuracia']
smote['accSmote'] = smote['acuracia']
borderlinesMOTE['accBorderlinesMOTE'] = borderlinesMOTE['acuracia']

# Remover a coluna 'acuracia' original, pois j√° a incorporamos nas novas colunas
normal.drop('acuracia', axis=1, inplace=True)
smote.drop('acuracia', axis=1, inplace=True)
borderlinesMOTE.drop('acuracia', axis=1, inplace=True)

# Juntar os dataframes com base nas colunas 'modelos', 'preprocessamento', 'kfolds'
merged_df = pd.merge(normal, smote, on=['modelos', 'preprocessamento', 'kfolds'], how='outer')
merged_df = pd.merge(merged_df, borderlinesMOTE, on=['modelos', 'preprocessamento', 'kfolds'], how='outer')

# Salvar o dataframe no formato CSV com separa√ß√£o por v√≠rgula
output_path = "C:/Users/samue/Downloads/NFT25/merged_results.csv"
merged_df.to_csv(output_path, index=False, decimal=',', sep='\t')

print(f"‚úÖ Dataframe salvo em {output_path}")


### Random Forest ou SVM