# Train model

## Import thư viện

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers
from transformers import BertModel, BertTokenizer
import joblib

## Load data

In [25]:
# Read the review CSV with no header row declared, so the columns are
# labelled 0 and 1, then keep positional rows 1..199 (row 0 is dropped —
# presumably it holds the file's header line; confirm against the CSV).
file = 'formated__dataset.csv'
df = pd.read_csv(file, sep=',', header=None).iloc[1:200]

print(df.shape)
df.head()

(199, 2)


Unnamed: 0,0,1
1,Cực kì hài lòng đẹp,1
2,Cực kì hài lòng ok ok,1
3,Cực kì hài lòng đẹp đấy,1
4,Cực kì hài lòng,1
5,Cực kì hài lòng Hat chia rất đều đep date sd c...,1


## Load pre-trained model

In [26]:
# NOTE(review): the decoded sample printed by this cell comes back without
# Vietnamese tone marks, i.e. the uncased English vocabulary strips them.
# A multilingual/cased checkpoint may suit this data better — confirm, and
# keep any inference code on the same checkpoint as the saved classifier.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode(text):
    """Return the token ids for one text, with special tokens, truncated to 512 ids."""
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )


# Encode every text in column 0.
tokenized = df[0].apply(_encode)

# Show the ids of the row with index label 10, then decode them back to text.
sample_ids = tokenized[10]
print('encode', sample_ids)
print('decode', tokenizer.decode(sample_ids))

encode [101, 6865, 18699, 2226, 7632, 25311, 6865, 1102, 5575, 9587, 11937, 6045, 2000, 1047, 19991, 17595, 2624, 19538, 29328, 1047, 19991, 16215, 19098, 3070, 15990, 20684, 11265, 2078, 2089, 2084, 2290, 25223, 14684, 2050, 21770, 16371, 2050, 4705, 14163, 2050, 16480, 6187, 27699, 1102, 2378, 2232, 29328, 16215, 2072, 7929, 102]
decode [CLS] hang nhu hinh hang đung mo ta hat to khong lan san minh dung khong thuong xuyen nen may thang roi chua het nua kg mua cho ca gia đinh dung thi ok [SEP]


## Extract BERT features and train the classifier

In [27]:
# Class labels from column 1, cast to float. The builtin `float` is used
# because the `np.float` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where `np.float` raises AttributeError.
labels = df[1].to_numpy().astype(float)
print('labels shape:', labels.shape)

# Length of the longest token sequence; every row is padded to this width.
max_len = max((len(ids) for ids in tokenized.values), default=0)
print('max len:', max_len)

# Right-pad shorter sequences with token id 0 so all rows have equal length.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
print('padded:', padded[1])
print('len padded:', padded.shape)

# Attention mask: 0 wherever the id is the padding value 0, 1 everywhere else.
attention_mask = np.where(padded == 0, 0, 1)
print('attention mask:', attention_mask[1])

# Convert the inputs to tensors for the model.
padded = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask)

# Forward pass only: no_grad() disables gradient tracking, so the pretrained
# model is not trained here — it serves as a fixed feature extractor.
with torch.no_grad():
    last_hidden_states = model(padded, attention_mask=attention_mask)

# Take the first model output and keep the vector at sequence position 0 of
# every row (the leading special token added by add_special_tokens=True).
features = last_hidden_states[0][:, 0, :].numpy()
print('features:', features)

# A fixed random_state makes the train/test split, and therefore the reported
# score, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)

# The only model that is actually fitted: logistic regression on the features.
cl = LogisticRegression()
cl.fit(X_train, y_train)



labels shape: (199,)
max len: 278
padded: [  101 12731  2278 11382 15030  2146  7929  7929   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Save and evaluate model

In [28]:
# Persist the fitted classifier to disk with joblib.
joblib.dump(value=cl, filename='save_model.pkl')

# Score the classifier on the held-out split and report it.
sc = cl.score(X=X_test, y=y_test)
print('score:', sc)


score: 0.82
