In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from torchvision import transforms

def load_images_from_folder(folder):
    """Load every image under ``folder``, using sub-folder names as labels.

    The expected layout is ``folder/<label>/<image file>``. Entries directly
    inside ``folder`` that are not directories, and entries inside a label
    folder that are not regular files, are skipped.

    Args:
        folder: Path of the directory whose immediate sub-directories are the
            class labels.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: the images
        converted to RGB and, at the same index, the name of the
        sub-directory each image was read from. Labels and files are visited
        in sorted order so the result is reproducible across runs.
    """
    images = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for label in sorted(os.listdir(folder)):
        label_folder = os.path.join(folder, label)
        # Stray files next to the label folders (e.g. .DS_Store) are not classes.
        if not os.path.isdir(label_folder):
            continue
        for filename in sorted(os.listdir(label_folder)):
            img_path = os.path.join(label_folder, filename)
            # Nested directories cannot be opened as images.
            if not os.path.isfile(img_path):
                continue
            # convert() hands back a separate image object, so the file opened
            # here can be closed as soon as the conversion is done.
            with Image.open(img_path) as img:
                images.append(img.convert('RGB'))
            labels.append(label)
    return images, labels

def preprocess_images(images, size=(224, 224)):
    """Turn each image into a flat feature vector.

    Every image is resized to ``size``, converted with
    ``transforms.ToTensor()`` and flattened into a 1-D NumPy array.
    No mean/std normalisation is applied.

    Args:
        images: Iterable of images accepted by the torchvision transforms.
        size: Target size handed to ``transforms.Resize``.

    Returns:
        A list with one flattened NumPy array per input image, in input order.
    """
    resize_step = transforms.Resize(size)
    tensor_step = transforms.ToTensor()
    pipeline = transforms.Compose([resize_step, tensor_step])

    flattened = []
    for picture in images:
        tensor = pipeline(picture)
        flattened.append(tensor.numpy().flatten())
    return flattened

def apply_pca_and_tsne(train_data, valid_data, test_data, dataset):
    """Project the three splits with PCA, then embed them jointly with t-SNE.

    PCA is fitted on the training split only and then applied to the
    validation and test splits. t-SNE is fitted once on the stacked PCA output
    of all three splits, and the embedding is sliced back into per-split
    arrays. The PCA component matrix is written to
    ``pca_components_of_{dataset}.npy`` (relative path).

    Args:
        train_data: 2-D array-like (samples x features) used to fit PCA.
        valid_data: 2-D array-like with the same number of features.
        test_data: 2-D array-like with the same number of features.
        dataset: Tag inserted into the name of the saved ``.npy`` file.

    Returns:
        ``(train_pca, valid_pca, test_pca, train_tsne, valid_tsne, test_tsne)``.
        The PCA arrays have ``min(50, n_train_samples, n_features)`` columns;
        the t-SNE arrays have two columns.

    Raises:
        ValueError: If ``train_data`` is not a non-empty 2-D array.
    """
    train_matrix = np.asarray(train_data)
    if train_matrix.ndim != 2 or 0 in train_matrix.shape:
        raise ValueError(
            'train_data must be a non-empty 2-D array of shape (n_samples, n_features)'
        )
    n_samples, n_features = train_matrix.shape

    # PCA cannot extract more components than min(n_samples, n_features), so
    # cap the usual 50 for small splits instead of letting the fit raise.
    n_components = min(50, n_samples, n_features)
    # A fixed seed keeps the projection reproducible when scikit-learn picks
    # its randomized SVD solver for large inputs.
    pca = PCA(n_components=n_components, random_state=0)
    train_data_pca = pca.fit_transform(train_matrix)

    # Persist the principal axes.
    # NOTE(review): only components_ is stored; re-applying the projection
    # later presumably also needs pca.mean_ — confirm whether that is required.
    np.save(f'pca_components_of_{dataset}.npy', pca.components_)

    # Validation and test data are projected with the train-fitted PCA.
    valid_data_pca = pca.transform(valid_data)
    test_data_pca = pca.transform(test_data)

    # t-SNE has no transform(); embed all splits together, then slice them apart.
    all_data_pca = np.vstack([train_data_pca, valid_data_pca, test_data_pca])
    # Perplexity must stay below the number of samples; 30 is the library default.
    perplexity = min(30.0, float(len(all_data_pca) - 1))
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    all_data_tsne = tsne.fit_transform(all_data_pca)

    n_train = len(train_data_pca)
    n_valid = len(valid_data_pca)

    train_data_tsne = all_data_tsne[:n_train]
    valid_data_tsne = all_data_tsne[n_train:n_train + n_valid]
    test_data_tsne = all_data_tsne[n_train + n_valid:]

    return train_data_pca, valid_data_pca, test_data_pca, train_data_tsne, valid_data_tsne, test_data_tsne

def plot_pca_results(train_pca, valid_pca, test_pca, valid_labels, test_labels, dataset):
    """Draw and save PC1-vs-PC2 scatter plots for the three splits.

    One figure is produced per split. The train figure is coloured by the
    constant ``Dataset`` column; the validation and test figures are coloured
    by their ``Label`` column. Each figure is saved as
    ``pca_<split>_{dataset}.png`` (relative path) and then shown.

    Args:
        train_pca: 2-D array of PCA coordinates for the training split.
        valid_pca: 2-D array of PCA coordinates for the validation split.
        test_pca: 2-D array of PCA coordinates for the test split.
        valid_labels: One label per row of ``valid_pca``.
        test_labels: One label per row of ``test_pca``.
        dataset: Tag used in the plot titles and output file names.
    """
    def to_frame(points, split_name):
        # Columns are named PC1..PCk after the array's column count.
        component_names = [f'PC{idx + 1}' for idx in range(points.shape[1])]
        frame = pd.DataFrame(points, columns=component_names)
        frame['Dataset'] = split_name
        return frame

    # Build every frame before plotting anything.
    frame_train = to_frame(train_pca, 'Train')

    frame_valid = to_frame(valid_pca, 'Valid')
    frame_valid['Label'] = valid_labels

    frame_test = to_frame(test_pca, 'Test')
    frame_test['Label'] = test_labels

    # (data, colouring column, title, output file) for each figure, in order.
    panels = (
        (frame_train, 'Dataset', f'PCA Projection - {dataset} (Train)', f'pca_train_{dataset}.png'),
        (frame_valid, 'Label', f'PCA Projection - {dataset} (Valid)', f'pca_valid_{dataset}.png'),
        (frame_test, 'Label', f'PCA Projection - {dataset} (Test)', f'pca_test_{dataset}.png'),
    )
    for frame, colour_by, heading, out_file in panels:
        plt.figure(figsize=(12, 8))
        sns.scatterplot(data=frame, x='PC1', y='PC2', hue=colour_by, palette='Set1')
        plt.title(heading)
        plt.savefig(out_file)
        plt.show()

def plot_tsne_results(train_tsne, valid_tsne, test_tsne, valid_labels, test_labels, dataset):
    """Draw and save 2-D t-SNE scatter plots for the three splits.

    One figure is produced per split. The train figure is coloured by the
    constant ``Dataset`` column; the validation and test figures are coloured
    by their ``Label`` column. Each figure is saved as
    ``tsne_<split>_{dataset}.png`` (relative path) and then shown.

    Args:
        train_tsne: Two-column array of t-SNE coordinates for the training split.
        valid_tsne: Two-column array of t-SNE coordinates for the validation split.
        test_tsne: Two-column array of t-SNE coordinates for the test split.
        valid_labels: One label per row of ``valid_tsne``.
        test_labels: One label per row of ``test_tsne``.
        dataset: Tag used in the plot titles and output file names.
    """
    def to_frame(points, split_name):
        frame = pd.DataFrame(points, columns=['Dim1', 'Dim2'])
        frame['Dataset'] = split_name
        return frame

    # Build every frame before plotting anything.
    frame_train = to_frame(train_tsne, 'Train')

    frame_valid = to_frame(valid_tsne, 'Valid')
    frame_valid['Label'] = valid_labels

    frame_test = to_frame(test_tsne, 'Test')
    frame_test['Label'] = test_labels

    # (data, colouring column, title, output file) for each figure, in order.
    panels = (
        (frame_train, 'Dataset', f'T-SNE Projection - {dataset} (Train)', f'tsne_train_{dataset}.png'),
        (frame_valid, 'Label', f'T-SNE Projection - {dataset} (Valid)', f'tsne_valid_{dataset}.png'),
        (frame_test, 'Label', f'T-SNE Projection - {dataset} (Test)', f'tsne_test_{dataset}.png'),
    )
    for frame, colour_by, heading, out_file in panels:
        plt.figure(figsize=(12, 8))
        sns.scatterplot(data=frame, x='Dim1', y='Dim2', hue=colour_by, palette='Set1')
        plt.title(heading)
        plt.savefig(out_file)
        plt.show()

def main(data_folder, datasetversion):
    """Run the full visualisation pipeline for one dataset directory.

    Images are read from the ``train``, ``valid`` and ``test`` sub-folders of
    ``data_folder``, turned into flat feature vectors, projected with PCA and
    t-SNE, and finally plotted.

    Args:
        data_folder: Directory containing the ``train``/``valid``/``test`` folders.
        datasetversion: Tag forwarded to the projection and plotting steps.
    """
    split_paths = {
        split: os.path.join(data_folder, split)
        for split in ('train', 'valid', 'test')
    }

    # Read all three splits first; the train labels are loaded but not used below.
    train_images, _unused_train_labels = load_images_from_folder(split_paths['train'])
    valid_images, valid_labels = load_images_from_folder(split_paths['valid'])
    test_images, test_labels = load_images_from_folder(split_paths['test'])

    # Flatten every image into a feature vector.
    train_data = preprocess_images(train_images)
    valid_data = preprocess_images(valid_images)
    test_data = preprocess_images(test_images)

    # Dimensionality reduction: PCA per split plus a joint t-SNE embedding.
    (
        train_pca, valid_pca, test_pca,
        train_tsne, valid_tsne, test_tsne,
    ) = apply_pca_and_tsne(train_data, valid_data, test_data, datasetversion)

    # Figures for both projections.
    plot_pca_results(train_pca, valid_pca, test_pca, valid_labels, test_labels, datasetversion)
    plot_tsne_results(train_tsne, valid_tsne, test_tsne, valid_labels, test_labels, datasetversion)

In [None]:
# Run the PCA / t-SNE visualisation pipeline for dataset version 'ver1'.
if __name__ == "__main__":
    data_folder = '/kaggle/input/thyroidcancer-ver1/dataver1/ver1'  # Change this to the path of your data directory
    dataset_version = 'ver1'  # tag passed to main() for plot titles and output file names
    main(data_folder, dataset_version)

In [None]:
# Run the PCA / t-SNE visualisation pipeline for dataset version 'ver2'.
if __name__ == "__main__":
    data_folder = '/kaggle/input/thyroidcancer-ver1/dataver2/dataver2'  # Change this to the path of your data directory
    dataset_version = 'ver2'  # tag passed to main() for plot titles and output file names
    main(data_folder, dataset_version)

In [None]:
# Run the PCA / t-SNE visualisation pipeline for dataset version 'ver3'.
if __name__ == "__main__":
    data_folder = '/kaggle/input/thyroidcancer-ver1/dataver3/dataver3'  # Change this to the path of your data directory
    dataset_version = 'ver3'  # tag passed to main() for plot titles and output file names
    main(data_folder, dataset_version)

In [None]:
# Switch to /kaggle/working and recursively zip its contents into
# visualize_dataset.zip.
# NOTE(review): presumably this is where the plots and .npy files written by
# the cells above end up (they use relative paths) — confirm.
%cd /kaggle/working
!zip -r visualize_dataset.zip .