In [1]:
from utils import * 
import yaml
import torch
pd.options.mode.copy_on_write = True
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Multioutput to multilabel function
aspects =['SCREEN','CAMERA','FEATURES','BATTERY','PERFORMANCE',
        'STORAGE','DESIGN','PRICE','GENERAL','SER&ACC']

sentiments = ['NEG','NEU','POS']

def motml(y):
    name_cols = [f'{a}#{s}' for a in aspects for s in sentiments]
    nrows, ncols = len(y),len(name_cols)
    ml = pd.DataFrame(np.zeros((nrows,ncols)),columns=name_cols)

    for i,a in enumerate(aspects):
        for j in range(1,4):
            indices = y[a] ==j #?
            ml.iloc[indices,i*3+j-1] = 1
    return ml

In [3]:
# Mutioutput to dataframe
def motdf(y):
    if isinstance(y, pd.DataFrame):
        return y
    return pd.DataFrame(y, columns=aspects)

def init_data(df):
    X = df.pop('comment')
    y = df.replace({np.nan: 0,
                    'Negative': 1,
                    'Neutral': 2,
                    'Positive': 3}).astype(np.uint8)

    print('X.shape:', X.shape, 'y.shape:', y.shape)
    return X, y

In [7]:
with open('ml_model.yaml') as f:
  cfg = yaml.load(f, Loader=yaml.SafeLoader)
  cfg['device']= torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"### Loading config {cfg}")

### Loading config {'aspect': ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC'], 'labels': ['negative', 'neutral', 'positive'], 'train_file': '../../Data/UIT-ViSFD/Train.csv', 'dev_file': '../../Data/UIT-ViSFD/Dev.csv', 'test_file': '../../Data/UIT-ViSFD/Test.csv', 'device': device(type='cpu')}


In [5]:
def txt2df(filepath, aspect):
    df = pd.read_csv(filepath)
    df[aspect] = [label_encoder(label, aspect) for label in df['label']]
    df_new=df.drop(columns=['index','n_star','date_time','label'])
    return df_new

In [8]:
train = txt2df(cfg['train_file'], cfg['aspect'])
dev = txt2df(cfg['dev_file'], cfg['aspect'])
test = txt2df(cfg['test_file'], cfg['aspect'])

Xtrain, ytrain = init_data(train)
Xdev,   ydev   = init_data(dev)
Xtest,  ytest  = init_data(test)

X.shape: (7786,) y.shape: (7786, 10)
X.shape: (1112,) y.shape: (1112, 10)
X.shape: (2224,) y.shape: (2224, 10)


In [19]:
ytrain

Unnamed: 0,SCREEN,CAMERA,FEATURES,BATTERY,PERFORMANCE,STORAGE,DESIGN,PRICE,GENERAL,SER&ACC
0,0,3,3,3,0,0,0,3,3,3
1,0,0,0,1,0,0,0,0,3,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,1,2,0,0,0,0,2,0
4,0,0,0,3,3,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
7781,0,0,1,1,1,0,0,0,0,0
7782,0,0,1,0,1,0,0,3,0,0
7783,0,0,0,3,3,0,0,0,3,0
7784,0,0,0,0,0,0,0,1,3,1


In [13]:
# y in real data for evaluation
ytrain_ml = motml(ytrain)
ydev_ml   = motml(ydev)
ytest_ml  = motml(ytest)

In [12]:
from text_preprocessed_nvh import preprocess_fn
xtrain = Xtrain.apply(preprocess_fn)
xdev   = Xdev.apply(preprocess_fn)
xtest  = Xtest.apply(preprocess_fn)

In [14]:
def get_num_words_per_sample(sample_texts):
    num_words = [len(s.split()) for s in sample_texts]
    return np.median(num_words)
num_samples = len(xtrain)
num_aspects = len(aspects)
num_classes = num_aspects * 3
num_words_per_sample = get_num_words_per_sample(xtrain)
sw_ratio = num_samples / num_words_per_sample


print("Xtrain key metrics")
print("Number of samples:", num_samples)
print("Number of aspects:", num_aspects)
print("Number of classes:", num_classes)
print("Number of words per sample:", num_words_per_sample)
print("Number of samples/number of words per sample ratio", sw_ratio)

Xtrain key metrics
Number of samples: 7786
Number of aspects: 10
Number of classes: 30
Number of words per sample: 26.0
Number of samples/number of words per sample ratio 299.46153846153845


In [15]:
xtrain
train_vocabulary = set()
for line in xtrain:
    words = line.strip().split()
    train_vocabulary.update(words)

# Tạo từ vựng của tập test
test_vocabulary = set()
for line in xtest:
    words = line.strip().split()
    test_vocabulary.update(words)

# Tạo từ vựng của tập val
val_vocabulary = set()
for line in xdev:
    words = line.strip().split()
    val_vocabulary.update(words)

In [16]:
print("Số lượng từ vựng tập train: ", len(train_vocabulary))
print("Số lượng từ vựng tập val: ", len(val_vocabulary))
print("Số lượng từ vựng tập test: ", len(test_vocabulary))

print("Số lượng từ vựng nằm ngoài tập train của tập val: ", len(val_vocabulary - train_vocabulary))
print("Số lượng từ vựng nằm ngoài tập train của tập test: ", len(test_vocabulary - train_vocabulary))

Số lượng từ vựng tập train:  11298
Số lượng từ vựng tập val:  3529
Số lượng từ vựng tập test:  5302
Số lượng từ vựng nằm ngoài tập train của tập val:  960
Số lượng từ vựng nằm ngoài tập train của tập test:  1776


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=4, max_df=0.85)

# x data using basic clean up class and basic features extrator
xtrain_tfidf = vectorizer.fit_transform(xtrain)
xdev_tfidf   = vectorizer.transform(xdev)
xtest_tfidf  = vectorizer.transform(xtest)


In [18]:
xtrain_tfidf.shape

(7786, 16307)

In [21]:
from sklearn.multioutput import MultiOutputClassifier as MOC
from sklearn.metrics import f1_score, classification_report
from eval import label_map, replacements, target_names, aspect_detection_eval, sentiment_classification_eval, combination_eval

categories= list(ytest.columns)

def quick_f1(y_true, y_pred):
    y_pred = motml(motdf(y_pred))
    return round(f1_score(y_true, y_pred, average='micro', zero_division=0), 4)

def evaluate(model, X, y, average='micro'):
    yb_true  = motml(y)

    yb_pred  = motdf(model.predict(X))
    yb_pred  = motml(yb_pred)

    return classification_report(yb_true, yb_pred, zero_division=0)

In [22]:
from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression
clf0 = MOC(LinearSVC(random_state=5))
clf0.fit(xtrain_tfidf, ytrain)
ypred= clf0.predict(xtest_tfidf)

In [23]:

clf0.predict(vectorizer.transform(['Mình mới mua dc 4 ngày, sao để dt qua đêm , sáng dậy lúc nào cũng bị tụt 4 5 % pin']))
categories

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=uint8)

In [25]:
ypred= pd.DataFrame(ypred, columns= categories)
aspect_detection_eval(ytest, ypred)

## Aspect Detection Evaluate ##
              precision    recall  f1-score   support

                 0.8961    0.9627    0.9282     15518
     BATTERY     0.9562    0.8393    0.8939      1014
      CAMERA     0.9528    0.7551    0.8425       588
      DESIGN     0.9286    0.5553    0.6950       398
    FEATURES     0.8905    0.7089    0.7894       711
     GENERAL     0.8293    0.7951    0.8118      1381
 PERFORMANCE     0.8830    0.7918    0.8349      1172
       PRICE     0.9156    0.7241    0.8086       569
      SCREEN     0.8698    0.5465    0.6712       269
     SER&ACC     0.9291    0.6408    0.7585       593
     STORAGE     0.8333    0.1852    0.3030        27

    accuracy                         0.8961     22240
   macro avg     0.8986    0.6822    0.7579     22240
weighted avg     0.8969    0.8961    0.8922     22240



In [26]:
sentiment_classification_eval(ytest, ypred)

## Sentiment Classification Evaluate ##
              precision    recall  f1-score   support

        None     0.8961    0.9627    0.9282     15518
    negative     0.7514    0.5937    0.6633      2210
     neutral     0.7153    0.3599    0.4788       817
    positive     0.8077    0.7459    0.7756      3695

    accuracy                         0.8679     22240
   macro avg     0.7927    0.6655    0.7115     22240
weighted avg     0.8604    0.8679    0.8600     22240



In [27]:
combination_eval(ytest, ypred)

## Combination Evaluate (Aspect Detection + Sentiment Classification) ##
                      precision    recall  f1-score   support

        BATTERY,None     0.8778    0.9678    0.9206      1210
    BATTERY,negative     0.7935    0.7310    0.7610       368
     BATTERY,neutral     0.4839    0.1630    0.2439        92
    BATTERY,positive     0.8423    0.7906    0.8156       554
         CAMERA,None     0.9181    0.9866    0.9511      1636
     CAMERA,negative     0.8455    0.5439    0.6619       171
      CAMERA,neutral     0.5385    0.2958    0.3818        71
     CAMERA,positive     0.7950    0.7283    0.7602       346
         DESIGN,None     0.9109    0.9907    0.9491      1826
     DESIGN,negative     0.7037    0.1979    0.3089        96
      DESIGN,neutral     1.0000    0.0357    0.0690        28
     DESIGN,positive     0.8524    0.6533    0.7397       274
       FEATURES,None     0.8752    0.9590    0.9152      1513
   FEATURES,negative     0.7658    0.7124    0.7381       

Using Trained Model to Predict

In [31]:
import pandas as pd 
df_ecom = pd.read_csv('../../Data/Preprocessed_data/EcomReviews.csv')
print(len(df_ecom))

# Drop duplicated
df_ecom.dropna(subset=['Review'],inplace=True)
print(len(df_ecom))

# Preprocess data
df_ecom['ml_review'] = df_ecom['Review'].apply(preprocess_fn)

# Transform TF IDF
X_int = vectorizer.transform(df_ecom['ml_review'])

# Predict 
Y_out= clf0.predict(X_int)
Y_out= pd.DataFrame(Y_out, columns= categories)

# Tranform for User
Y_out.replace({0: 'None',
              1: 'negative',
              2: 'neutral',
              3: 'positive'},inplace=True)
df_ecom.reset_index(drop=True, inplace=True)
Y_out.reset_index(drop=True, inplace=True)
df_ecom_pred = pd.concat([df_ecom,Y_out],axis=1)

# Save
# df_ecom_pred.to_excel('..\..\Data\EcomReviews_Labeled.xlsx',index=False)

1189
678


In [71]:
import pandas as pd 
df_social = pd.read_csv('../../Data/Preprocessed_data/SocialMediaReviews.csv')
print(len(df_social))

# Drop duplicated
df_social.dropna(subset=['Review'],inplace=True)
print(len(df_social))

# Preprocess data
df_social['ml_review'] = df_social['Review'].apply(preprocess_fn)

# Transform TF IDF
X_int = vectorizer.transform(df_social['ml_review'])

# Predict 
Y_out= clf0.predict(X_int)
Y_out= pd.DataFrame(Y_out, columns= categories)

# Tranform for User
Y_out.replace({0: 'None',
              1: 'negative',
              2: 'neutral',
              3: 'positive'},inplace=True)
df_social.reset_index(drop=True, inplace=True)
Y_out.reset_index(drop=True, inplace=True)
df_social_pred = pd.concat([df_social,Y_out],axis=1)

# Save
# df_social_pred.to_excel('..\..\Data\MachineLearning\SocialMediaReviews_Labeled.xlsx',index=False)

60822
60520


In [72]:
df_social_pred.to_excel('..\..\Data\MachineLearning\SocialMediaReviews_Labeled.xlsx',index=False)

In [73]:
df_social_pred

Unnamed: 0,SMReviewID,Review,ProductID,PlatformID,TypeReview,DateReview,Title,ReviewerName,ViewCount,CommentCount,...,SCREEN,CAMERA,FEATURES,BATTERY,PERFORMANCE,STORAGE,DESIGN,PRICE,GENERAL,SER&ACC
0,SM00000001,1. Giới thiệu\nMình là một ifan chính hiệu từ ...,P001,PL05,Post,2023-10-24,Nâng cấp lên iPhone 15 Pro Max - Bản facelift ...,linhpham89,,129.0,...,,positive,,,positive,,positive,,positive,positive
1,SM00000002,Về mặt lý thuyết thì đây là kèo rất cân vì cả ...,P001,PL05,Post,2023-10-22,So sánh camera iPhone 15 Pro Max và Google Pix...,Hoàng Hải.,,233.0,...,,positive,,,positive,,,,,
2,SM00000003,Đây là cách thiết lập máy của mình sau khi nhậ...,P001,PL05,Post,2023-10-25,Cách mình thiết lập để chụp hình và quay phim ...,Hoàng Hải.,,41.0,...,,positive,,,,,,,,
3,SM00000004,Tinh tế đã có sớm\niPhone 15 Pro Max\nNatural ...,P001,PL05,Post,2023-09-22,Mở hộp iPhone 15 Pro Max Natural Titan: Mềm mạ...,Lê Huyền Vân,,136.0,...,,positive,,,positive,,positive,,,
4,SM00000005,iPhone 15 Pro Max\nlà chiếc\niPhone\nkích thướ...,P001,PL05,Post,2023-10-23,Lại phải quay về iPhone 15 Pro sau 2 tuần dùng...,Trần Hoàng Long.,,185.0,...,positive,positive,,positive,,,positive,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60515,SM00060818,Đã có google pay thay thế nên giờ không là vấn đề,P002,PL04,Comment,2023-06-03,Galaxy S23 Ultra Hàn ĐẮT HƠN chính hãng... vẫn...,Tony Phùng Studio,19936.0,94.0,...,,,,,,,,,,
60516,SM00060819,Máy bản Hàn sản xuất tại Hàn Quốc bạn ơi,P002,PL04,Comment,2023-06-03,Galaxy S23 Ultra Hàn ĐẮT HƠN chính hãng... vẫn...,Tony Phùng Studio,19936.0,94.0,...,,,,,,,,,,
60517,SM00060820,cước gửi từ bên Hàn về 700 tiền việt cho 1 máy,P002,PL04,Comment,2023-06-03,Galaxy S23 Ultra Hàn ĐẮT HƠN chính hãng... vẫn...,Tony Phùng Studio,19936.0,94.0,...,,,,,,,,positive,,
60518,SM00060821,mik nghĩ là nhx người mua con ss hàn này chắc ...,P002,PL04,Comment,2023-06-03,Galaxy S23 Ultra Hàn ĐẮT HƠN chính hãng... vẫn...,Tony Phùng Studio,19936.0,94.0,...,,,,,,,,,,


Fine Turning

In [171]:
import optuna
from optuna.samplers import TPESampler

In [172]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key='best_model', value=trial.user_attrs['model'])

In [177]:
import warnings
warnings.filterwarnings("ignore")
def linearsvc_objective(trial):
    params = dict(
        C=trial.suggest_float('C', 1e-9, 1e2, log=True),
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        loss=trial.suggest_categorical('loss', ['hinge', 'squared_hinge']),
        max_iter=2000,
        random_state=5
    )

    clf = MOC(LinearSVC(**params))
    clf.fit(xtrain_tfidf, ytrain)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(xdev_tfidf)
    return quick_f1(ydev_ml, y_pred)

sampler = TPESampler(seed=22)
linearsvc_study = optuna.create_study(sampler=sampler, direction='maximize')
linearsvc_study.optimize(linearsvc_objective, n_trials=50, callbacks=[callback])


clf2 = linearsvc_study.user_attrs['best_model']

print(clf2.estimators_[0].get_params())
print(linearsvc_study.best_params)

[I 2024-03-09 18:46:12,309] A new study created in memory with name: no-name-3fe0e134-ab1a-421a-a826-0c95a3f99d91


[I 2024-03-09 18:46:12,866] Trial 0 finished with value: 0.2385 and parameters: {'C': 1.9636582699290402e-07, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-03-09 18:46:13,878] Trial 1 finished with value: 0.2385 and parameters: {'C': 5.339536586472381e-06, 'class_weight': None, 'loss': 'squared_hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-03-09 18:46:14,244] Trial 2 finished with value: 0.2385 and parameters: {'C': 1.3055563380836963e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-03-09 18:46:14,606] Trial 3 finished with value: 0.2385 and parameters: {'C': 1.1682869614143264e-09, 'class_weight': None, 'loss': 'hinge'}. Best is trial 0 with value: 0.2385.
[I 2024-03-09 18:46:17,178] Trial 4 finished with value: 0.7001 and parameters: {'C': 0.2804917948703948, 'class_weight': 'balanced', 'loss': 'hinge'}. Best is trial 4 with value: 0.7001.
[I 2024-03-09 18:46:18,328] Trial 5 finished with value

{'C': 0.5940218221401105, 'class_weight': 'balanced', 'dual': 'warn', 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': 5, 'tol': 0.0001, 'verbose': 0}
{'C': 0.5940218221401105, 'class_weight': 'balanced', 'loss': 'squared_hinge'}


In [178]:
ypred2=clf2.predict(xtest_tfidf)
ypred2= pd.DataFrame(ypred2, columns= categories)
aspect_detection_eval(ytest, ypred2)

## Aspect Detection Evaluate ##
              precision    recall  f1-score   support

                 0.9100    0.9536    0.9313     15518
     BATTERY     0.9535    0.8688    0.9092      1014
      CAMERA     0.9412    0.8163    0.8743       588
      DESIGN     0.9022    0.6256    0.7389       398
    FEATURES     0.8709    0.7496    0.8057       711
     GENERAL     0.8204    0.8269    0.8237      1381
 PERFORMANCE     0.8652    0.8157    0.8397      1172
       PRICE     0.8986    0.7627    0.8251       569
      SCREEN     0.8000    0.6245    0.7015       269
     SER&ACC     0.8908    0.6880    0.7764       593
     STORAGE     0.8750    0.2593    0.4000        27

    accuracy                         0.9018     22240
   macro avg     0.8843    0.7265    0.7842     22240
weighted avg     0.9013    0.9018    0.8995     22240



In [179]:
sentiment_classification_eval(ytest, ypred2)

## Sentiment Classification Evaluate ##
              precision    recall  f1-score   support

        None     0.9100    0.9536    0.9313     15518
    negative     0.7200    0.6480    0.6821      2210
     neutral     0.6298    0.3831    0.4764       817
    positive     0.8073    0.7629    0.7845      3695

    accuracy                         0.8706     22240
   macro avg     0.7667    0.6869    0.7186     22240
weighted avg     0.8637    0.8706    0.8654     22240



In [180]:
combination_eval(ytest, ypred2)

## Combination Evaluate (Aspect Detection + Sentiment Classification) ##
                      precision    recall  f1-score   support

        BATTERY,None     0.8977    0.9645    0.9299      1210
    BATTERY,negative     0.7886    0.7500    0.7688       368
     BATTERY,neutral     0.4348    0.2174    0.2899        92
    BATTERY,positive     0.8447    0.8051    0.8244       554
         CAMERA,None     0.9370    0.9817    0.9588      1636
     CAMERA,negative     0.7744    0.6023    0.6776       171
      CAMERA,neutral     0.5000    0.3521    0.4132        71
     CAMERA,positive     0.8043    0.7601    0.7816       346
         DESIGN,None     0.9235    0.9852    0.9534      1826
     DESIGN,negative     0.6571    0.2396    0.3511        96
      DESIGN,neutral     1.0000    0.0357    0.0690        28
     DESIGN,positive     0.8250    0.7226    0.7704       274
       FEATURES,None     0.8896    0.9478    0.9178      1513
   FEATURES,negative     0.7634    0.7451    0.7541       

Non linear svm

In [181]:
from sklearn.svm import SVC

def svc_objective(trial):
    params = dict(
        class_weight=trial.suggest_categorical('class_weight', ['balanced', None]),
        kernel=trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        gamma=trial.suggest_categorical('gamma', ['auto', 'scale']),
        max_iter=3000,
        random_state=5
    )

    clf = MOC(SVC(**params))
    clf.fit(xtrain_tfidf, ytrain)
    trial.set_user_attr(key="model", value=clf)

    y_pred = clf.predict(xdev_tfidf)
    return quick_f1(ydev_ml, y_pred)

sampler = TPESampler(seed=22)
svc_study = optuna.create_study(direction='maximize')
svc_study.optimize(svc_objective, n_trials=10, callbacks=[callback])


clf3 = svc_study.user_attrs['best_model']

print(clf3.estimators_[0].get_params())
print(svc_study.best_params)

[I 2024-03-09 18:52:11,297] A new study created in memory with name: no-name-4b37b6fb-e9a8-416c-8684-b4cbf4af658d
[I 2024-03-09 18:55:23,799] Trial 0 finished with value: 0.2385 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: 0.2385.
[I 2024-03-09 19:01:55,227] Trial 1 finished with value: 0.2736 and parameters: {'class_weight': 'balanced', 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 1 with value: 0.2736.
[I 2024-03-09 19:08:07,066] Trial 2 finished with value: 0.2505 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 1 with value: 0.2736.
[I 2024-03-09 19:14:14,684] Trial 3 finished with value: 0.2505 and parameters: {'class_weight': None, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 1 with value: 0.2736.
[I 2024-03-09 19:20:21,809] Trial 4 finished with value: 0.6958 and parameters: {'class_weight': 'balanced', 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 4 with value: 0.69

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 3000, 'probability': False, 'random_state': 5, 'shrinking': True, 'tol': 0.001, 'verbose': False}
{'class_weight': None, 'kernel': 'sigmoid', 'gamma': 'scale'}


In [182]:
ypred3=clf3.predict(xtest_tfidf)
ypred3= pd.DataFrame(ypred3, columns= categories)

In [183]:
aspect_detection_eval(ytest, ypred3)

## Aspect Detection Evaluate ##
              precision    recall  f1-score   support

                 0.8884    0.9686    0.9267     15518
     BATTERY     0.9593    0.8609    0.9075      1014
      CAMERA     0.9607    0.7483    0.8413       588
      DESIGN     0.9324    0.5201    0.6677       398
    FEATURES     0.9216    0.6610    0.7699       711
     GENERAL     0.8328    0.7828    0.8070      1381
 PERFORMANCE     0.9031    0.7790    0.8365      1172
       PRICE     0.9169    0.6977    0.7924       569
      SCREEN     0.8974    0.3903    0.5440       269
     SER&ACC     0.9611    0.5835    0.7261       593
     STORAGE     0.6667    0.0741    0.1333        27

    accuracy                         0.8932     22240
   macro avg     0.8946    0.6424    0.7230     22240
weighted avg     0.8952    0.8932    0.8874     22240



In [184]:
sentiment_classification_eval(ytest, ypred3)

## Sentiment Classification Evaluate ##
              precision    recall  f1-score   support

        None     0.8884    0.9686    0.9267     15518
    negative     0.7712    0.5552    0.6456      2210
     neutral     0.7615    0.3244    0.4549       817
    positive     0.8173    0.7483    0.7813      3695

    accuracy                         0.8672     22240
   macro avg     0.8096    0.6491    0.7021     22240
weighted avg     0.8603    0.8672    0.8573     22240



In [185]:
combination_eval(ytest, ypred3)

## Combination Evaluate (Aspect Detection + Sentiment Classification) ##
                      precision    recall  f1-score   support

        BATTERY,None     0.8927    0.9694    0.9295      1210
    BATTERY,negative     0.8097    0.7283    0.7668       368
     BATTERY,neutral     0.7000    0.0761    0.1373        92
    BATTERY,positive     0.8155    0.8375    0.8264       554
         CAMERA,None     0.9162    0.9890    0.9512      1636
     CAMERA,negative     0.7826    0.5263    0.6294       171
      CAMERA,neutral     0.5833    0.1972    0.2947        71
     CAMERA,positive     0.8119    0.7486    0.7789       346
         DESIGN,None     0.9046    0.9918    0.9462      1826
     DESIGN,negative     0.7647    0.1354    0.2301        96
      DESIGN,neutral     1.0000    0.0000    0.0000        28
     DESIGN,positive     0.8585    0.6423    0.7349       274
       FEATURES,None     0.8594    0.9736    0.9129      1513
   FEATURES,negative     0.7990    0.6928    0.7421       

In [187]:
ypred3

Unnamed: 0,SCREEN,CAMERA,FEATURES,BATTERY,PERFORMANCE,STORAGE,DESIGN,PRICE,GENERAL,SER&ACC
0,3,0,3,3,3,0,0,0,3,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,3,3
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,3,0,0,3,2,3,0
...,...,...,...,...,...,...,...,...,...,...
2219,0,0,1,0,3,0,3,0,0,0
2220,0,0,0,0,0,0,0,0,0,0
2221,0,0,0,0,3,0,0,0,3,0
2222,0,3,0,0,3,0,0,0,0,3


In [190]:
ytest

Unnamed: 0,SCREEN,CAMERA,FEATURES,BATTERY,PERFORMANCE,STORAGE,DESIGN,PRICE,GENERAL,SER&ACC
0,3,0,3,0,3,0,0,0,3,0
1,0,0,1,0,3,0,0,0,0,3
2,0,0,0,0,0,0,2,0,0,3
3,0,0,0,0,1,0,1,0,0,0
4,3,0,0,1,0,0,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2219,0,0,3,0,3,0,3,0,0,0
2220,0,0,0,0,1,0,0,0,0,0
2221,0,0,0,0,3,0,0,0,3,0
2222,0,3,0,3,3,0,3,0,0,3


In [191]:
import pandas as pd 
df_social = pd.read_csv('../Data/Preprocessed_data/SocialMediaReviews.csv')
df_social_test = df_social['Review']