# Experiment using TF-IDF approach

Author: Lu ZhiPing

In [1]:
from sentiment.dataset.load_dataset import LoadDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np
import lightgbm as lgb


def get_matrix_vocab(dataset, column_name):
    vectorizer = TfidfVectorizer(
        max_features=2000
    )

    def yield_from_mongo(ds):
        for item in ds:
            yield item[column_name]
    gen = yield_from_mongo(ds=dataset)
    matrix = vectorizer.fit_transform(gen)
    vocab = vectorizer.vocabulary_
    return matrix, vocab

dataset = LoadDataset(
        database_name="PLP",
        collection_name="AStarCOVID"
)

NOTICE: sentiment log file will be at /Users/johnnylu/tweet_sentiment/sentiment/logs/sentiment.log


2022-10-13 11:22:25,063 : INFO : Initialized Mongo Connection to db:PLP, collection: AStarCOVID


In [2]:
dataset


        Database: Database(MongoClient(host=['192.168.50.72:27017'], document_class=dict, tz_aware=False, connect=True), 'PLP'),
        Collection: Collection(Database(MongoClient(host=['192.168.50.72:27017'], document_class=dict, tz_aware=False, connect=True), 'PLP'), 'AStarCOVID')
        Length : 161390
        Sample: {'Text': 'HDB closes Bukit Merah branch office after second employee tests '
         'positive for Covid-19 https://t.co/hhbICSfy5o',
 '_id': ObjectId('634637137380598a236355ae'),
 'anger_intensity': 0.44,
 'country_region': 'Singapore',
 'date_stamp': '2020-04-02 00:00:00',
 'emotion_category': 'no specific emotion',
 'fear_intensity': 0.49,
 'joy_intensity': 0.281,
 'keyword_used': 'covid',
 'sadness_intensity': 0.4370000000000001,
 'sentiment_category': 'neutral',
 't1': 1,
 't10': 0,
 't2': 0,
 't3': 0,
 't4': 0,
 't5': 0,
 't6': 0,
 't7': 0,
 't8': 0,
 't9': 0,
 'tweet_ID': 1245550415581716481,
 'user_ID': 37874853,
 'valence_intensity': 0.48}
        

In [3]:
matrix, vocab = get_matrix_vocab(dataset, "Text")

In [5]:
matrix.shape

(10000, 2000)

In [9]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

labels = [item["sentiment_category"] for item in dataset]
labels = le.fit_transform(labels)
print(Counter(labels))

Counter({0: 4622, 2: 3162, 1: 1852, 3: 291, 4: 73})


In [17]:
X_train, X_test, y_train, y_test = train_test_split(
        matrix, labels, test_size=0.2, random_state=42
)

lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 5,
    'metric': 'multi_logloss',
    'num_iterations': 500,
    "early_stopping_rounds": 25
}
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
gbm = lgb.train(
    lgbm_params,
    lgb_train,
    valid_sets=lgb_eval,
    verbose_eval=100
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32642
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 1124
[LightGBM] [Info] Start training from score -0.782525
[LightGBM] [Info] Start training from score -1.669984
[LightGBM] [Info] Start training from score -1.148854
[LightGBM] [Info] Start training from score -3.519137
[LightGBM] [Info] Start training from score -4.844062
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[51]	valid_0's multi_logloss: 0.933149


In [24]:
y_pred = gbm.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

ACC = accuracy_score(y_test, y_pred)
F1 = f1_score(y_test, y_pred, average="micro")
precision = precision_score(y_test, y_pred, average="micro")
recall = recall_score(y_test, y_pred, average="micro")
CN = confusion_matrix(y_test, y_pred)

with open("experiment_result.txt", "w") as file:
    file.write(f"Accuracy: {ACC}\n")
    file.write(f"F1: {F1}\n")
    file.write(f"Precision: {precision}\n")
    file.write(f"Recall: {recall}\n")
    file.write(f"Confusion Matrix: {CN}")

In [21]:
import numpy as np

np.argmax(y_pred, axis=1)

array([0, 0, 3, ..., 0, 1, 2])

In [29]:
from sklearn.metrics import classification_report
from pprint import pprint

report = classification_report(y_test, y_pred)
type(report)

  _warn_prf(average, modifier, msg_start, len(result))


str

In [30]:
print(report)

              precision    recall  f1-score   support

           0       0.65      0.85      0.74       964
           1       0.41      0.14      0.20       346
           2       0.63      0.62      0.62       626
           3       0.54      0.24      0.33        54
           4       0.00      0.00      0.00        10

    accuracy                           0.63      2000
   macro avg       0.45      0.37      0.38      2000
weighted avg       0.60      0.63      0.59      2000



In [31]:
report

'              precision    recall  f1-score   support\n\n           0       0.65      0.85      0.74       964\n           1       0.41      0.14      0.20       346\n           2       0.63      0.62      0.62       626\n           3       0.54      0.24      0.33        54\n           4       0.00      0.00      0.00        10\n\n    accuracy                           0.63      2000\n   macro avg       0.45      0.37      0.38      2000\nweighted avg       0.60      0.63      0.59      2000\n'

In [34]:
with open("experiment_result.txt", "w") as file:
    file.write(f"Accuracy: {ACC}\n")
    file.write(f"F1: {F1}\n")
    file.write(f"Precision: {precision}\n")
    file.write(f"Recall: {recall}\n")
    file.write(f"Confusion Matrix: \n{CN}\n")
    file.write(f"Classification Report: \n{report}")

In [35]:
vocab

{'hdb': 791,
 'closes': 339,
 'office': 1216,
 'after': 88,
 'second': 1524,
 'employee': 563,
 'tests': 1751,
 'positive': 1333,
 'for': 682,
 'covid': 412,
 '19': 15,
 'https': 848,
 'co': 348,
 'quarantine': 1389,
 'stress': 1679,
 'every': 596,
 'vaccine': 1863,
 'and': 129,
 'treatment': 1817,
 'in': 880,
 'development': 477,
 'so': 1602,
 'far': 636,
 'via': 1871,
 'this': 1772,
 'was': 1901,
 'the': 1757,
 'read': 1408,
 'interesting': 913,
 'sars': 1504,
 'cov': 409,
 'shares': 1557,
 'whole': 1940,
 'with': 1947,
 'coronavirus': 394,
 'from': 703,
 'province': 1383,
 'of': 1211,
 'outbreak': 1254,
 '22': 24,
 '2020': 21,
 'top': 1801,
 'official': 1217,
 'to': 1788,
 'china': 320,
 'health': 794,
 'organization': 1245,
 'use': 1858,
 'wuhan': 1980,
 'virus': 1880,
 'used': 1859,
 'love': 1055,
 'corona': 393,
 'chicken': 316,
 'guy': 772,
 'had': 774,
 'he': 792,
 'changed': 305,
 'name': 1163,
 'few': 650,
 'years': 1985,
 'ago': 94,
 'who': 1939,
 'update': 1849,
 'april': 1

In [37]:
type(vocab)

dict

In [39]:
type(vocab["platforms"])

numpy.int64

In [40]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

with open("vocabulary.json", "w") as file:
    json.dump(vocab, file, cls=NpEncoder)