In [3]:
from ml_utils import * 
import yaml
import torch
pd.options.mode.copy_on_write = True
import sys
import os

import warnings
warnings.filterwarnings('ignore')


In [4]:
# Multi-output -> multi-label conversion
aspects = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE',
           'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']

sentiments = ['NEG', 'NEU', 'POS']


def motml(y):
    """Convert multi-output labels into a one-hot multi-label frame.

    Parameters
    ----------
    y : pandas.DataFrame
        One column per entry of ``aspects``. A cell equal to ``k`` (1-based)
        selects ``sentiments[k - 1]``; any other value (e.g. 0) leaves every
        sentiment of that aspect at 0.

    Returns
    -------
    pandas.DataFrame
        Float frame with ``len(y)`` rows, a fresh 0..n-1 index and one
        ``'<ASPECT>#<SENTIMENT>'`` column per aspect/sentiment pair.
    """
    name_cols = [f'{a}#{s}' for a in aspects for s in sentiments]
    n_sent = len(sentiments)
    onehot = np.zeros((len(y), len(name_cols)))

    for i, a in enumerate(aspects):
        # Plain NumPy values make the mask purely positional, so the result
        # does not depend on the index carried by ``y``.
        codes = np.asarray(y[a])
        for j in range(1, n_sent + 1):
            onehot[codes == j, i * n_sent + j - 1] = 1
    return pd.DataFrame(onehot, columns=name_cols)

# Multi-output predictions -> DataFrame
def motdf(y):
    """Return ``y`` as a DataFrame whose columns are the aspect names.

    A DataFrame is handed back unchanged (same object); anything else is
    wrapped in a new DataFrame with ``aspects`` as its columns.
    """
    already_frame = isinstance(y, pd.DataFrame)
    return y if already_frame else pd.DataFrame(y, columns=aspects)


def init_data(df):
    """Split a labelled review frame into comment text and encoded labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'comment'`` column; every other column is treated as
        a label column holding ``'Negative'``, ``'Neutral'``, ``'Positive'``
        or NaN.

    Returns
    -------
    X : pandas.Series
        The ``'comment'`` column.
    y : pandas.DataFrame
        The remaining columns encoded as uint8:
        NaN -> 0, Negative -> 1, Neutral -> 2, Positive -> 3.

    Notes
    -----
    ``df`` is left untouched, so the calling cell can be re-run safely
    (popping the column in place would raise ``KeyError`` on a second run).
    """
    X = df['comment']
    y = (
        df.drop(columns='comment')
          .replace({np.nan: 0,
                    'Negative': 1,
                    'Neutral': 2,
                    'Positive': 3})
          .astype(np.uint8)
    )

    print('X.shape:', X.shape, 'y.shape:', y.shape)
    return X, y

In [5]:
# Read the experiment configuration; only the parse needs the open file handle.
with open('ml_model.yaml') as f:
    cfg = yaml.safe_load(f)

# Record which torch device is available alongside the loaded settings.
cfg['device'] = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"### Loading config {cfg}")

### Loading config {'aspect': ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC'], 'labels': ['negative', 'neutral', 'positive'], 'train_file': '../Data/Pre_train_model/ML/Train.csv', 'dev_file': '../Data/Pre_train_model/ML/Dev.csv', 'test_file': '../Data/Pre_train_model/ML/Test.csv', 'device': device(type='cpu')}


In [6]:
def txt2df(filepath, aspect):
    """Load a labelled CSV and expand its ``label`` column into aspect columns.

    Each value of ``label`` is passed to ``label_encoder(label, aspect)`` and
    the results are assigned to the columns named in ``aspect``
    (presumably one value per aspect -- TODO confirm against ml_utils).
    The ``index``, ``n_star``, ``date_time`` and ``label`` columns are then
    dropped and the remaining frame is returned.
    """
    raw = pd.read_csv(filepath)
    encoded_rows = [label_encoder(label, aspect) for label in raw['label']]
    raw[aspect] = encoded_rows
    return raw.drop(columns=['index','n_star','date_time','label'])

In [7]:
# Load the three splits named in the config, then separate text from labels.
train, dev, test = (
    txt2df(cfg[key], cfg['aspect'])
    for key in ('train_file', 'dev_file', 'test_file')
)

(Xtrain, ytrain), (Xdev, ydev), (Xtest, ytest) = (
    init_data(split) for split in (train, dev, test)
)

X.shape: (7786,) y.shape: (7786, 10)
X.shape: (1112,) y.shape: (1112, 10)
X.shape: (2224,) y.shape: (2224, 10)


In [8]:
# Peek at the first rows of the encoded training labels (one column per aspect).
ytrain.head(3)

Unnamed: 0,SCREEN,CAMERA,FEATURES,BATTERY,PERFORMANCE,STORAGE,DESIGN,PRICE,GENERAL,SER&ACC
0,0,3,3,3,0,0,0,3,3,3
1,0,0,0,1,0,0,0,0,3,0
2,0,0,1,0,0,0,0,0,0,0


In [9]:
# Ground-truth labels in multi-label (aspect#sentiment) form, used for evaluation
ytrain_ml, ydev_ml, ytest_ml = (
    motml(labels) for labels in (ytrain, ydev, ytest)
)

In [10]:
# Peek at the one-hot form: one column per aspect#sentiment pair.
ytrain_ml.head(3)

Unnamed: 0,SCREEN#NEG,SCREEN#NEU,SCREEN#POS,CAMERA#NEG,CAMERA#NEU,CAMERA#POS,FEATURES#NEG,FEATURES#NEU,FEATURES#POS,BATTERY#NEG,...,DESIGN#POS,PRICE#NEG,PRICE#NEU,PRICE#POS,GENERAL#NEG,GENERAL#NEU,GENERAL#POS,SER&ACC#NEG,SER&ACC#NEU,SER&ACC#POS
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from ml_text_preprocessed import preprocess_fn

# Run every comment of each split through the project's preprocessing function
xtrain, xdev, xtest = (
    comments.apply(preprocess_fn) for comments in (Xtrain, Xdev, Xtest)
)

In [12]:
# Show the preprocessed training comments (pandas truncates the middle rows).
xtrain

0       mới mua máy này tại thegioididong thốt nốt cảm...
1       pin kém còn lại miễn_chê mua 832019 tình_trạng...
2       sao lúc gọi điện_thoại màn_hình bị chấm nhỏ nh...
3       mọi người cập_nhật phần_mềm lại nó sẽ bớt tốn_...
4       mới mua xài được 1 tháng thấy pin rất trâu xài...
                              ...                        
7781    8 g cái đi đánh là mạng giật_giật không chịu n...
7782    mua được giảm 500 k mà lỗi lòi ra hết treo màn...
7783    máy xài 3 tháng rồi rất ok pin trâu khỏi nói x...
7784    rất tiếc hàng realme không có ốp lưng ngoài nê...
7785    mình rất thất_vọng khi mua máy này bắt wifi cự...
Name: comment, Length: 7786, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features; min_df / max_df bound the document frequency
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=4, max_df=0.8)

# Fit the vocabulary on the training split only, then reuse it for dev and test
xtrain_tfidf = vectorizer.fit_transform(xtrain)
xdev_tfidf, xtest_tfidf = (
    vectorizer.transform(split) for split in (xdev, xtest)
)


In [14]:
# (number of training comments, number of TF-IDF features kept)
xtrain_tfidf.shape

(7786, 11903)

In [15]:
from sklearn.multioutput import MultiOutputClassifier as MOC
from sklearn.metrics import f1_score, classification_report

# Aspect names, in the column order of the label frames
categories = ytest.columns.tolist()

def quick_f1(y_true, y_pred):
    """Micro-averaged F1 of multi-output predictions, rounded to 4 decimals.

    ``y_true`` must already be in multi-label (one-hot) form; ``y_pred`` is
    the raw multi-output prediction and is converted here via ``motdf`` and
    ``motml`` before scoring.
    """
    pred_frame = motdf(y_pred)
    pred_ml = motml(pred_frame)
    score = f1_score(y_true, pred_ml, average='micro', zero_division=0)
    return round(score, 4)

def evaluate(model, X, y, average='micro'):
    """Return a text classification report for ``model`` on ``(X, y)``.

    Both the true labels ``y`` and ``model.predict(X)`` are converted to
    multi-label (one-hot) form before being compared.

    NOTE(review): ``average`` is accepted but never used by the body.
    """
    truth_ml = motml(y)
    pred_ml = motml(motdf(model.predict(X)))
    return classification_report(truth_ml, pred_ml, zero_division=0)

In [16]:
from ml_utils import aspect_detection_eval, sentiment_classification_eval, combination_eval

In [17]:
from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression

# Baseline: one LinearSVC per aspect column; fit() hands back the fitted wrapper
clf0 = MOC(LinearSVC(random_state=5)).fit(xtrain_tfidf, ytrain)
ypred = clf0.predict(xtest_tfidf)

In [18]:
# Sanity-check the baseline on one hand-written review. The text goes through
# preprocess_fn first, like every other input handed to the vectorizer, and the
# prediction is kept in a variable instead of being thrown away.
sample_review = 'Mình mới mua dc 4 ngày, sao để dt qua đêm , sáng dậy lúc nào cũng bị tụt 4 5 % pin'
sample_pred = clf0.predict(vectorizer.transform([preprocess_fn(sample_review)]))
# Column order of the prediction above
categories

['SCREEN',
 'CAMERA',
 'FEATURES',
 'BATTERY',
 'PERFORMANCE',
 'STORAGE',
 'DESIGN',
 'PRICE',
 'GENERAL',
 'SER&ACC']

In [19]:
# Give the baseline predictions the aspect names as columns, then print the
# aspect-detection report for the test split.
ypred= pd.DataFrame(ypred, columns= categories)
aspect_detection_eval(ytest, ypred)

## Aspect Detection Evaluate ##
              precision    recall  f1-score   support

                 0.8972    0.9627    0.9288     15518
     BATTERY     0.9564    0.8442    0.8968      1014
      CAMERA     0.9501    0.7449    0.8351       588
      DESIGN     0.9249    0.5879    0.7189       398
    FEATURES     0.8991    0.7145    0.7962       711
     GENERAL     0.8346    0.7893    0.8113      1381
 PERFORMANCE     0.8816    0.7944    0.8357      1172
       PRICE     0.9067    0.7346    0.8117       569
      SCREEN     0.8639    0.5428    0.6667       269
     SER&ACC     0.9209    0.6476    0.7604       593
     STORAGE     0.8571    0.2222    0.3529        27

    accuracy                         0.8970     22240
   macro avg     0.8993    0.6895    0.7650     22240
weighted avg     0.8976    0.8970    0.8933     22240



In [20]:
# Sentiment-classification report for the baseline on the test split.
sentiment_classification_eval(ytest, ypred)

## Sentiment Classification Evaluate ##
              precision    recall  f1-score   support

        None     0.8972    0.9627    0.9288     15518
    negative     0.7425    0.5937    0.6598      2210
     neutral     0.6914    0.3537    0.4680       817
    positive     0.8129    0.7491    0.7797      3695

    accuracy                         0.8682     22240
   macro avg     0.7860    0.6648    0.7091     22240
weighted avg     0.8603    0.8682    0.8604     22240



In [21]:
# Joint (aspect, sentiment) report for the baseline on the test split.
combination_eval(ytest, ypred)

## Combination Evaluate (Aspect Detection + Sentiment Classification) ##
                      precision    recall  f1-score   support

        BATTERY,None     0.8811    0.9678    0.9224      1210
    BATTERY,negative     0.7708    0.7310    0.7503       368
     BATTERY,neutral     0.4545    0.1630    0.2400        92
    BATTERY,positive     0.8499    0.7870    0.8172       554
         CAMERA,None     0.9149    0.9859    0.9491      1636
     CAMERA,negative     0.7838    0.5088    0.6170       171
      CAMERA,neutral     0.4571    0.2254    0.3019        71
     CAMERA,positive     0.7905    0.7197    0.7534       346
         DESIGN,None     0.9168    0.9896    0.9518      1826
     DESIGN,negative     0.7778    0.2188    0.3415        96
      DESIGN,neutral     1.0000    0.0357    0.0690        28
     DESIGN,positive     0.8489    0.6971    0.7655       274
       FEATURES,None     0.8776    0.9623    0.9180      1513
   FEATURES,negative     0.7723    0.7168    0.7435       

Using Trained Model to Predict

In [23]:
import pandas as pd
df_ecom = pd.read_csv('../Data/Preprocessed_data/EcomReviews.csv')
print(len(df_ecom))

# Discard rows whose review text is missing and renumber the remaining rows
df_ecom = df_ecom.dropna(subset=['Review']).reset_index(drop=True)
print(len(df_ecom))

# Preprocess the text, then vectorise it with the already-fitted TF-IDF vocabulary
df_ecom['ml_review'] = df_ecom['Review'].apply(preprocess_fn)
X_int = vectorizer.transform(df_ecom['ml_review'])

# Predict one sentiment code per aspect and translate the codes for readers
Y_out = pd.DataFrame(clf0.predict(X_int), columns=categories).replace(
    {0: 'None',
     1: 'negative',
     2: 'neutral',
     3: 'positive'})

# Reviews and predicted labels side by side (both frames use a fresh 0..n-1 index)
df_ecom_pred = pd.concat([df_ecom, Y_out], axis=1)

# Save
# df_ecom_pred.to_excel('../Data/EcomReviews_Labeled.xlsx', index=False)

6614
3044


In [24]:
import pandas as pd
df_social = pd.read_csv('../Data/Preprocessed_data/SocialMediaReviews.csv')
print(len(df_social))

# Discard rows whose review text is missing and renumber the remaining rows
df_social = df_social.dropna(subset=['Review']).reset_index(drop=True)
print(len(df_social))

# Preprocess the text, then vectorise it with the already-fitted TF-IDF vocabulary
df_social['ml_review'] = df_social['Review'].apply(preprocess_fn)
X_int = vectorizer.transform(df_social['ml_review'])

# Predict one sentiment code per aspect and translate the codes for readers
Y_out = pd.DataFrame(clf0.predict(X_int), columns=categories).replace(
    {0: 'None',
     1: 'negative',
     2: 'neutral',
     3: 'positive'})

# Reviews and predicted labels side by side (both frames use a fresh 0..n-1 index)
df_social_pred = pd.concat([df_social, Y_out], axis=1)

# Save
# df_social_pred.to_excel('../Data/MachineLearning/SocialMediaReviews_Labeled.xlsx', index=False)

60822
60520


Fine-Tuning

In [25]:
import optuna
from optuna.samplers import TPESampler

In [26]:
def callback(study, trial):
    """Optuna callback: keep the model of the best trial on the study.

    When the trial that just finished is the study's current best trial, its
    ``'model'`` user attribute is copied to the study under ``'best_model'``.
    Otherwise nothing happens.
    """
    if trial.number != study.best_trial.number:
        return
    best_model = trial.user_attrs['model']
    study.set_user_attr(key='best_model', value=best_model)

In [27]:
import warnings
warnings.filterwarnings("ignore")
def linearsvc_objective(trial):
    """Optuna objective: train a multi-output LinearSVC and score it on dev.

    Hyper-parameters are sampled in the order ``C``, ``class_weight``,
    ``loss``. The fitted classifier is stored on the trial under the
    ``"model"`` user attribute so a callback can retrieve it.

    Returns the micro-F1 (see ``quick_f1``) of the dev-split predictions.
    """
    # Sampling order is part of the behaviour: it drives the seeded sampler.
    c_value = trial.suggest_float('C', 1e-9, 1e2, log=True)
    weighting = trial.suggest_categorical('class_weight', ['balanced', None])
    loss_name = trial.suggest_categorical('loss', ['hinge', 'squared_hinge'])

    base_svc = LinearSVC(C=c_value,
                         class_weight=weighting,
                         loss=loss_name,
                         max_iter=2000,
                         random_state=5)
    clf = MOC(base_svc)
    clf.fit(xtrain_tfidf, ytrain)
    trial.set_user_attr(key="model", value=clf)

    dev_pred = clf.predict(xdev_tfidf)
    return quick_f1(ydev_ml, dev_pred)

# Seeded TPE sampler, so the sequence of suggested hyper-parameters is repeatable.
sampler = TPESampler(seed=22)
# Maximise the dev-split micro-F1 returned by linearsvc_objective.
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
# `callback` copies the best trial's fitted model onto the study after each trial.
linearsvc_study.optimize(linearsvc_objective, n_trials=50, callbacks=[callback])


# Fitted multi-output classifier of the best trial (stored by `callback`).
clf2 = linearsvc_study.user_attrs['best_model']

# Parameters of the first per-aspect estimator, next to the best sampled parameters.
print(clf2.estimators_[0].get_params())
print(linearsvc_study.best_params)

[I 2024-05-14 22:35:34,310] A new study created in memory with name: no-name-58a624e3-f546-4a94-9cab-6b8668b7bf9a
[I 2024-05-14 22:35:34,702] Trial 0 finished with value: 0.2385 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-05-14 22:35:35,311] Trial 1 finished with value: 0.2385 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-05-14 22:35:35,551] Trial 2 finished with value: 0.2385 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-05-14 22:35:35,816] Trial 3 finished with value: 0.2385 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-05-14 22:35:36,794] Trial 4 finished with value: 0.7036 and parameters: {'C': 0.2804917948703948, 'class_weight': 'balanc

{'C': 0.4493072714375323, 'class_weight': 'balanced', 'dual': 'warn', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 5, 'tol': 0.0001, 'verbose': 0}
{'C': 0.4493072714375323, 'class_weight': 'balanced', 'loss': 'squared_hinge'}


In [28]:
# Test-split predictions of the tuned model, labelled with the aspect names
ypred2 = pd.DataFrame(clf2.predict(xtest_tfidf), columns=categories)
aspect_detection_eval(ytest, ypred2)

## Aspect Detection Evaluate ##
              precision    recall  f1-score   support

                 0.9115    0.9532    0.9319     15518
     BATTERY     0.9585    0.8649    0.9093      1014
      CAMERA     0.9315    0.8095    0.8662       588
      DESIGN     0.9152    0.6508    0.7606       398
    FEATURES     0.8633    0.7637    0.8104       711
     GENERAL     0.8250    0.8161    0.8205      1381
 PERFORMANCE     0.8718    0.8123    0.8410      1172
       PRICE     0.8926    0.7891    0.8377       569
      SCREEN     0.7955    0.6506    0.7157       269
     SER&ACC     0.8678    0.7083    0.7799       593
     STORAGE     0.8889    0.2963    0.4444        27

    accuracy                         0.9028     22240
   macro avg     0.8838    0.7377    0.7925     22240
weighted avg     0.9022    0.9028    0.9008     22240



In [29]:
# Sentiment-classification report for the tuned model on the test split.
sentiment_classification_eval(ytest, ypred2)

## Sentiment Classification Evaluate ##
              precision    recall  f1-score   support

        None     0.9115    0.9532    0.9319     15518
    negative     0.7074    0.6520    0.6786      2210
     neutral     0.5823    0.4027    0.4761       817
    positive     0.8176    0.7545    0.7848      3695

    accuracy                         0.8701     22240
   macro avg     0.7547    0.6906    0.7179     22240
weighted avg     0.8635    0.8701    0.8655     22240



In [30]:
# Joint (aspect, sentiment) report for the tuned model on the test split.
combination_eval(ytest, ypred2)

## Combination Evaluate (Aspect Detection + Sentiment Classification) ##
                      precision    recall  f1-score   support

        BATTERY,None     0.8953    0.9686    0.9305      1210
    BATTERY,negative     0.7699    0.7364    0.7528       368
     BATTERY,neutral     0.3968    0.2717    0.3226        92
    BATTERY,positive     0.8600    0.7762    0.8159       554
         CAMERA,None     0.9346    0.9786    0.9561      1636
     CAMERA,negative     0.7448    0.6316    0.6835       171
      CAMERA,neutral     0.4182    0.3239    0.3651        71
     CAMERA,positive     0.8199    0.7370    0.7763       346
         DESIGN,None     0.9284    0.9869    0.9567      1826
     DESIGN,negative     0.6579    0.2604    0.3731        96
      DESIGN,neutral     0.4000    0.0714    0.1212        28
     DESIGN,positive     0.8375    0.7336    0.7821       274
       FEATURES,None     0.8947    0.9432    0.9183      1513
   FEATURES,negative     0.7663    0.7429    0.7544       