## Load the decoded data

In [64]:
import numpy as np
from utils.data import load_pickle
from sklearn.metrics import accuracy_score


# Word vocabulary. A word's position in this list is the integer class id
# that the rest of the notebook compares against y_true / y_pred.
labels = ['丝瓜', '你', '关门', '凳子', '厕所', '口渴', '吃', '喝', '嘴巴', '外卖', '头疼', '家人', '小刀', '帮助', '平静', '心情', '怎样', '感觉', '愿意', '我', '手机', '找', '把', '护士', '拿', '换药', '放在', '是', '有', '朋友', '橙汁', '毛巾', '汤圆', '漂亮', '热水', '猪肉', '玩', '电脑', '看', '碗', '穿', '篮球', '米饭', '给', '脸盆', '菠萝', '葱花', '蒜泥', '衣服', '豆腐', '软糖', '醋', '钢琴', '问题', '需要', '青菜', '面条', '音乐', '预约', '香肠', '鱼块']

# The same words arranged into six hand-written groups; the group order and
# the order inside each group define the plotting order used later.
# NOTE(review): the groups look like food/drink, actions, objects, people,
# states and function words -- inferred from the words, please confirm.
labels_catg = [
    ['菠萝', '豆腐', '米饭', '面条', '醋', '青菜', '蒜泥', '香肠', '橙汁', '猪肉', '鱼块', '汤圆', '丝瓜', '葱花', '软糖', '外卖', '热水'],
    ['帮助', '把', '放在', '关门', '拿', '找', '吃', '穿', '喝', '给', '玩', '看', '感觉', '预约', '换药'],
    ['脸盆', '衣服', '毛巾', '电脑', '凳子', '小刀', '手机', '钢琴', '碗', '篮球', '厕所', '音乐'],
    ['朋友', '家人', '护士', '你', '我', '嘴巴', '漂亮'],
    ['心情', '平静', '口渴', '头疼'],
    ['是', '有', '怎样', '问题', '需要', '愿意'],
]

# Class ids (positions in `labels`) laid out exactly like `labels_catg`.
labels_cat_idx = [list(map(labels.index, group)) for group in labels_catg]

######################################
# Subject whose decoding summary is analysed below.
subj_eval = '002'  # '001' ~ '012'
######################################

# Summary directory of the chosen subject (the trailing slash is part of the
# value) and the display names of the three data splits.
path = f'../../summaries/duin/{subj_eval}/'
task = ['Train', 'Valid', 'Test']

# Load the decoded representations
decoded = load_pickle(fname=path + 'decoded.pkl')

# Each list below is ordered [train, valid, test], i.e. aligned with `task`.
y_pred = [decoded[key] for key in ('y_pred_train', 'y_pred_valid', 'y_pred_test')]
embed = [decoded[key] for key in ('embed_train', 'embed_valid', 'embed_test')]
y_true = [decoded[key] for key in ('y_true_train', 'y_true_valid', 'y_true_test')]


# Quick sanity report: array shapes and the number of samples per split
print(f'Embedding shape: {embed[0].shape}')
print(f'Label shape: {y_true[0].shape}')
print(f'Train set: {embed[0].shape[0]} samples')
print(f'Valid set: {embed[1].shape[0]} samples')
print(f'Test set: {embed[2].shape[0]} samples')

# Classification accuracy of the stored predictions, one value per split
acc = [accuracy_score(truth, pred) for truth, pred in zip(y_true, y_pred)]
print(f'Train set accuracy: {acc[0]:.4f}')
print(f'Valid set accuracy: {acc[1]:.4f}')
print(f'Test set accuracy: {acc[2]:.4f}')

Embedding shape: (2895, 30, 160)
Label shape: (2895,)
Train set: 2895 samples
Valid set: 365 samples
Test set: 365 samples
Train set accuracy: 0.9914
Valid set accuracy: 0.8000
Test set accuracy: 0.7918


## Embedding visualization

In [None]:
# T-SNE visualization
import matplotlib
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# t-SNE: project each split (train / valid / test) to 2-D independently.
# Every sample's embedding is flattened into a single feature vector first.
tsne = TSNE(n_components=2, random_state=0)
X = [tsne.fit_transform(e.reshape(e.shape[0], -1)) for e in embed]

# Plot settings: one colour per word, sampled evenly from a rainbow colormap
cmap = plt.get_cmap('gist_rainbow')
colors_all = cmap(np.linspace(0, 1, len(labels)))

matplotlib.rcParams['font.family'] = 'Arial'  # Whole font family
# NOTE(review): Windows-specific font path, used for the Chinese legend text
my_font = FontProperties(fname=r"c:\windows\fonts\SimHei.ttf")  # Chinese font

# Offset of the class-number annotation, one entry per category:
# up, down, right, left, upper-left, upper-right
offset = [[0, 3], [0, -4], [3, 0], [-3, 0], [-2, 2], [2, 2]]

for xi, (points, targets) in enumerate(zip(X, y_true)):
    cn = 0  # running class number across all categories (displayed 1-based)
    plt.figure(figsize=(12, 8))
    xs, ys = points[:, 0], points[:, 1]
    for (dx, dy), cat_idx in zip(offset, labels_cat_idx):
        for class_idx in cat_idx:
            # Build the sample mask once and reuse it for scatter and centroid
            mask = targets == class_idx
            class_x, class_y = xs[mask], ys[mask]
            # Always call scatter so the legend keeps an entry for every word
            plt.scatter(class_x, class_y,
                        label=f'{cn+1}-{labels[class_idx]}', color=colors_all[cn], s=10)
            # Put the class number (bold) near the centroid of the class.
            # A class with no samples in this split has no centroid: np.mean
            # of an empty slice is NaN (plus a RuntimeWarning), so skip it.
            if np.any(mask):
                plt.text(np.mean(class_x) + dx, np.mean(class_y) + dy, f'{cn+1}',
                         ha='center', va='center', fontsize=10, color=colors_all[cn],
                         alpha=0.7, fontweight='bold')
            cn += 1

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=2, prop=my_font)
    plt.title(f'T-SNE Visualization of {task[xi]} Set', fontsize=30)
    plt.xlabel('Component 1', fontsize=20)
    plt.ylabel('Component 2', fontsize=20)
    plt.tight_layout()

    plt.savefig(f'{path}/tsne_colorAll_{task[xi]}.png', dpi=300)
    # plt.show()


### Embedding visualization: Train, Valid, and Test sets in one joint t-SNE map

In [None]:
# T-SNE visualization
import matplotlib
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Stack the flattened embeddings of all three splits so that a single t-SNE
# fit places train, valid and test samples in the same 2-D space.
x_embed = np.concatenate([e.reshape(e.shape[0], -1) for e in embed], axis=0)

# [start, stop) row ranges of each split inside the stacked array
n_train, n_valid, n_test = (e.shape[0] for e in embed)
train_idx = [0, n_train]
valid_idx = [n_train, n_train + n_valid]
test_idx = [n_train + n_valid, n_train + n_valid + n_test]

# t-SNE on the stacked data
tsne = TSNE(n_components=2, random_state=0)
X = tsne.fit_transform(x_embed)

# Plot settings: one colour per word
cmap = plt.get_cmap('gist_rainbow')
colors_all = cmap(np.linspace(0, 1, len(labels)))

matplotlib.rcParams['font.family'] = 'Arial'  # Whole font family
my_font = FontProperties(fname=r"c:\windows\fonts\SimHei.ttf")  # Chinese font

# Offset of the class-number annotation, one entry per category:
# up, down, right, left, upper-left, upper-right
offset = [[0, 4], [0, -5], [4, 0], [-3, 0], [-3, 3], [3, 3]]

# Per-split coordinate columns, sliced once instead of inside the loops
train_x, train_y = X[train_idx[0]:train_idx[1], 0], X[train_idx[0]:train_idx[1], 1]
valid_x, valid_y = X[valid_idx[0]:valid_idx[1], 0], X[valid_idx[0]:valid_idx[1], 1]
test_x, test_y = X[test_idx[0]:test_idx[1], 0], X[test_idx[0]:test_idx[1], 1]

cn = 0  # running class number across all categories (displayed 1-based)
plt.figure(figsize=(12, 8))
for (dx, dy), cat_idx in zip(offset, labels_cat_idx):
    for class_idx in cat_idx:
        class_color = colors_all[cn]

        # Train set: only the class centroid, drawn as a large faint disc
        in_train = y_true[0] == class_idx
        x_mean = np.mean(train_x[in_train])
        y_mean = np.mean(train_y[in_train])
        plt.scatter(x_mean, y_mean, color=class_color, s=500, alpha=0.1, marker='o')
        # Class number in bold next to the centroid
        plt.text(x_mean + dx, y_mean + dy, f'{cn+1}', ha='center', va='center',
                 fontsize=10, color=class_color, fontweight='bold')

        # Valid set: individual samples drawn as crosses
        in_valid = y_true[1] == class_idx
        plt.scatter(valid_x[in_valid], valid_y[in_valid],
                    color=class_color, s=30, marker='x', alpha=0.7)

        # Test set: individual samples drawn as stars; carries the legend entry
        in_test = y_true[2] == class_idx
        plt.scatter(test_x[in_test], test_y[in_test],
                    label=f'{cn+1}-{labels[class_idx]}', color=class_color,
                    s=50, marker='*', alpha=0.7)

        cn += 1

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=2, prop=my_font)
plt.title('T-SNE Visualization of All Data', fontsize=30)
plt.xlabel('Component 1', fontsize=20)
plt.ylabel('Component 2', fontsize=20)
plt.tight_layout()

plt.savefig(f'{path}/tsne_colorAll.png', dpi=300)
plt.show()


## Confusion Matrix

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.metrics import confusion_matrix

# Row-normalised confusion matrix per split, with rows/columns arranged in
# category order (all words of group 0, then group 1, ...).
labels_catg_list = [word for group in labels_catg for word in group]
labels_catg_idx_list = [idx for group in labels_cat_idx for idx in group]
conf_mat = [
    confusion_matrix(truth, pred, normalize='true', labels=labels_catg_idx_list)
    for truth, pred in zip(y_true, y_pred)
]

# Plot settings
matplotlib.rcParams['font.family'] = 'Arial'  # Whole font family
my_font = FontProperties(fname=r"c:\windows\fonts\SimHei.ttf", size=15)  # Chinese font
l_width = 3

# [first, last + 1] position of every category along the matrix diagonal
group_sizes = [len(group) for group in labels_catg]
rect_coord = [
    [sum(group_sizes[:k]), sum(group_sizes[:k + 1])]
    for k in range(len(group_sizes))
]

# One colour per category, shared by the tick labels and the outline boxes
colors = ['red', 'green', 'blue', 'purple', 'orange', 'grey']
tick_colors = [colors[g] for g, group in enumerate(labels_cat_idx) for _ in group]

for xi, matrix in enumerate(conf_mat):
    fig, ax = plt.subplots(figsize=(16, 16))
    cm = ax.imshow(matrix, cmap='coolwarm_r', interpolation="nearest", vmin=0.0, vmax=1.0)  # coolwarm_r, GnBu

    # Faint line along the main diagonal
    diagonal = [-0.5, len(labels) - 0.5]
    ax.plot(diagonal, diagonal, color='black', alpha=0.2)

    # Tick length / thickness, and one tick per word on both axes
    ax.tick_params(axis='both', which='major', length=5, width=l_width)
    ax.set_xticks(np.arange(len(labels)))
    ax.set_yticks(np.arange(len(labels)))
    ax.set_xticklabels(labels_catg_list, rotation=90, fontproperties=my_font)
    ax.set_yticklabels(labels_catg_list, fontproperties=my_font)
    # Colour every tick label by the category its word belongs to
    for x_tick, y_tick, tick_color in zip(ax.get_xticklabels(), ax.get_yticklabels(), tick_colors):
        x_tick.set_color(tick_color)
        y_tick.set_color(tick_color)

    # Thickness of the axes frame
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_linewidth(l_width)

    # Outline each category's diagonal sub-matrix with a box in its colour
    for (start, _), group, edge_color in zip(rect_coord, labels_catg, colors):
        ax.add_patch(matplotlib.patches.Rectangle(
            (start - 0.5, start - 0.5), len(group), len(group),
            linewidth=l_width, edgecolor=edge_color, facecolor='none'
        ))

    plt.xlabel('Predicted', fontsize=40)
    plt.ylabel('True label', fontsize=40)
    plt.title(f'61 words classification ({task[xi]})', fontsize=60)

    # Colour bar in its own narrow axes to the right of the matrix
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="4%", pad=0.2)
    cb = fig.colorbar(cm, cax=cax)
    cb.ax.tick_params(labelsize=30)

    plt.tight_layout()
    plt.savefig(f'{path}/confusion_{task[xi]}.png', dpi=300)
    # plt.show()
    
