# Import libraries

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import time
from sklearn.metrics import accuracy_score, f1_score
from keras.datasets import mnist, cifar10
from sklearn.model_selection import train_test_split
import torch
from sklearn.preprocessing import OneHotEncoder
import random
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder

In [13]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus every CUDA device).

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU rather than only the current one, and
    # is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
# Fix all RNG seeds before any data or weights are created.
seed_everything(22)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the softmax regression model

In [14]:
def softmax_stable(Z):
    """Softmax along dim 1, computed with the max-subtraction trick.

    Args:
        Z: Tensor of scores with at least two dimensions.

    Returns:
        Tensor shaped like ``Z`` whose entries along dim 1 sum to 1
        (up to rounding).
    """
    # Shifting by the maximum along dim 1 does not change the result but
    # keeps exp() from overflowing.
    row_max = Z.max(dim=1, keepdim=True).values
    exp_shifted = (Z - row_max).exp()
    return exp_shifted / exp_shifted.sum(dim=1, keepdim=True)

def softmax_loss(X, y, W):
    """Mean cross-entropy of the linear softmax model ``X @ W``.

    Args:
        X: Feature matrix, one sample per row.
        y: Target matrix with one row per sample and one column per class
            (one-hot in this notebook).
        W: Weight matrix mapping features to class scores.

    Returns:
        0-d tensor holding the loss averaged over the samples.
    """
    # log_softmax evaluates log(softmax(z)) through log-sum-exp, so a
    # probability that underflows to 0 no longer yields log(0) = -inf, which
    # would turn into NaN when multiplied by the zero entries of y.
    log_probs = torch.log_softmax(torch.mm(X, W), dim=1)
    return -torch.mean(torch.sum(y * log_probs, dim=1))

def softmax_grad(X, y, W):
    """Gradient of the mean softmax cross-entropy with respect to ``W``.

    Args:
        X: Feature matrix, one sample per row.
        y: Target matrix aligned with the rows of ``X``.
        W: Weight matrix mapping features to class scores.

    Returns:
        Tensor shaped like ``W``: ``X.T @ (softmax(X @ W) - y)`` divided by
        the number of rows in ``X``.
    """
    residual = softmax_stable(torch.mm(X, W)) - y
    # Subtracting y can promote the residual to y's dtype; cast it back to
    # X's dtype (instead of assuming float32) so torch.mm receives operands
    # of the same type.
    return torch.mm(X.T, residual.to(X.dtype)) / X.shape[0]

def softmax_fit(X, y, W, lr=0.05, epochs=500, tol=1e-5, batch_size=32):
    """Train softmax-regression weights with mini-batch gradient descent.

    Every epoch reshuffles the samples, applies one weight update per
    mini-batch, then records and prints the loss over the full data set.

    Args:
        X: Feature matrix, one sample per row.
        y: Target matrix aligned with the rows of ``X``.
        W: Initial weights; the tensor passed in is not modified in place.
        lr: Learning rate applied to every mini-batch gradient.
        epochs: Number of passes over the data.
        tol: Currently unused. Kept so existing calls keep working; the
            weight-change stopping rule it belonged to is disabled, so
            training always runs for ``epochs`` passes.
        batch_size: Samples per mini-batch (the last batch may be smaller).

    Returns:
        Tuple ``(W, loss_hist)``: the trained weights and a list with
        ``epochs + 1`` losses (0-d NumPy arrays), the first one being the
        loss of the initial weights.
    """
    N = X.shape[0]
    batches = int(np.ceil(N / batch_size))
    loss_hist = [softmax_loss(X, y, W).cpu().numpy()]

    for ep in range(1, epochs + 1):
        # Fresh sample order for every epoch.
        mix_ids = torch.randperm(N)

        for i in range(batches):
            # Slicing clamps at the end of the permutation, so the final
            # batch simply comes out shorter.
            batch_ids = mix_ids[batch_size * i : batch_size * (i + 1)]
            W = W - lr * softmax_grad(X[batch_ids], y[batch_ids], W)

        loss_hist.append(softmax_loss(X, y, W).cpu().numpy())
        print('Epoch:', ep, 'loss:', loss_hist[-1])

    return W, loss_hist

def pred(W, X):
    """Predict the highest-scoring class index for every row of ``X``.

    Args:
        W: Weight matrix mapping features to class scores.
        X: Feature matrix, one sample per row.

    Returns:
        1-D integer tensor with one class index per row of ``X``.
    """
    # Softmax preserves the ordering of the scores within a row, so the
    # arg-max of the raw scores is the predicted class. Skipping the
    # normalisation saves work and avoids ties that appear when rounding
    # collapses nearly equal probabilities.
    return torch.argmax(torch.mm(X, W), dim=1)

# Train on the vnnews dataset

In [21]:
# Paths to the pickled vnnews train/test splits
train_x_path = 'data/vnnews/train_x.sav'
train_y_path = 'data/vnnews/train_y.sav'
test_x_path = 'data/vnnews/test_x.sav'
test_y_path = 'data/vnnews/test_y.sav'


def _load_pickled_array(path):
    """Unpickle the object stored at ``path`` and wrap it in a NumPy array.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes instead of being left open.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return np.array(pickle.load(fh))


# Load the data
X_train = _load_pickled_array(train_x_path)
y_train = _load_pickled_array(train_y_path)
X_test = _load_pickled_array(test_x_path)
y_test = _load_pickled_array(test_y_path)

In [22]:
# Two-stage text featuriser:
#   1. 'vectorize'         - bag-of-words counts over a shared vocabulary,
#                            ignoring terms whose document frequency
#                            exceeds 0.8.
#   2. 'feature extracter' - TF-IDF weighting of those counts.
feature_extractor = Pipeline(steps=[
    ('vectorize', CountVectorizer(encoding='utf-16', max_df=0.8)),
    ('feature extracter', TfidfTransformer()),
])

# Fit on the training texts only, then reuse the fitted pipeline for the
# test texts.
X_train = feature_extractor.fit_transform(X_train, y_train)
X_test = feature_extractor.transform(X_test)
print('Kích thước vector input: ', X_train.shape)
print('Kích thước vector test', X_test.shape)

Kích thước vector input:  (14375, 79170)
Kích thước vector test (12076, 79170)


In [23]:
# Cast the sparse feature matrices to float32 and expand them to dense
# arrays.
X_train, X_test = (
    sparse_features.astype('float32').toarray()
    for sparse_features in (X_train, X_test)
)

In [24]:
# Show the first label before and after encoding.
print(y_train[0])
le = OneHotEncoder()
# The labels are reshaped into a single column before encoding; the encoded
# result is then expanded with toarray().
y_train = le.fit_transform(np.reshape(y_train, (-1, 1))).toarray()
print(y_train[0])
print('Train samples:', X_train.shape)
print('Test samples:', X_test.shape)

Am nhac
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.]
Train samples: (14375, 79170)
Test samples: (12076, 79170)


In [25]:
# Wrap the NumPy arrays as tensors and move them to the selected device.
# y_test is deliberately left out of the conversion and stays a NumPy array.
X_train, y_train, X_test = (
    torch.from_numpy(array).to(device)
    for array in (X_train, y_train, X_test)
)

In [26]:
# Standard-normal initial weights, one row per feature and one column per
# class; drawn first, then moved to the device.
W_init = torch.randn(X_train.size(1), y_train.size(1)).to(device)

In [27]:
# Train for 200 epochs with mini-batches of 32 and report the wall-clock
# time taken.
lr = 0.5
start = time.time()
W, loss_hist = softmax_fit(
    X_train,
    y_train,
    W_init,
    lr=lr,
    epochs=200,
    batch_size=32,
)
print('Train completed in {}s'.format(time.time() - start))

Epoch: 1 loss: 3.014058851588809
Epoch: 2 loss: 2.472543764261707
Epoch: 3 loss: 2.1073250327932205
Epoch: 4 loss: 1.850442763061785
Epoch: 5 loss: 1.6591570571464345
Epoch: 6 loss: 1.509541540275511
Epoch: 7 loss: 1.3883831995866711
Epoch: 8 loss: 1.2880420384019335
Epoch: 9 loss: 1.2031268249595666
Epoch: 10 loss: 1.13028530666336
Epoch: 11 loss: 1.0670587510514806
Epoch: 12 loss: 1.011566279568772
Epoch: 13 loss: 0.9625028589305465
Epoch: 14 loss: 0.9186715742162581
Epoch: 15 loss: 0.8792935268345723
Epoch: 16 loss: 0.8436653865134781
Epoch: 17 loss: 0.811286627400483
Epoch: 18 loss: 0.7816374291888964
Epoch: 19 loss: 0.7544422819634194
Epoch: 20 loss: 0.7293775309431999
Epoch: 21 loss: 0.7062047111649128
Epoch: 22 loss: 0.684708939362258
Epoch: 23 loss: 0.664662182510874
Epoch: 24 loss: 0.6459641579084061
Epoch: 25 loss: 0.6284471022878135
Epoch: 26 loss: 0.6120195392137155
Epoch: 27 loss: 0.5965745399464863
Epoch: 28 loss: 0.5820051494672419
Epoch: 29 loss: 0.5682592904308424
Epoc

KeyboardInterrupt: 