# Facebook Posts Sentiment Analysis Using Word2Vec and LightGBM

Import all necessary packages

In [73]:
import string
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Naphat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Let's define some helper functions

In [204]:
def process_text(path, include_all=False) -> "tuple[list[list[str]], list[int] | None, list[str]]":
    """Read a tab-separated posts file and tokenize every comment.

    The first line of the file is treated as a header. Every following
    non-blank line holds a post id in column 0 and the comment text in
    column 1. When the header contains a ``Feedback`` column, columns 2, 3
    and 4 are read as class flags and mapped to the integer labels 0, 1 and 2
    respectively (the first flag equal to ``'1'`` wins).

    Preprocessing per comment: lowercase, NLTK word tokenization, then drop
    English stop words, tokens found in ``string.punctuation`` and
    single-character tokens.

    Args:
        path: Path of the tab-separated file to read.
        include_all: If True, comments with no tokens left after
            preprocessing are kept and represented by the placeholder
            ``['this']``; if False such comments are dropped.

    Returns:
        A ``(texts, labels, post_id)`` tuple. ``texts`` is a list of token
        lists, ``post_id`` the matching post ids, and ``labels`` the matching
        integer labels, or ``None`` when the header has no ``Feedback`` column.

    Raises:
        ValueError: If a kept row of a labeled file has none of its three
            class flags set to ``'1'``.
    """
    # Hoisted out of the token loop: the stop-word list is loop-invariant, and
    # a set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    texts = []
    post_id = []
    with open(path) as f:
        header = f.readline().rstrip('\n').split('\t')
        labels = [] if 'Feedback' in header else None

        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no post
            if not line.strip():
                continue

            data = line.rstrip('\n').split('\t')
            tokens = nltk.tokenize.word_tokenize(data[1].lower())

            # NOTE: `token not in string.punctuation` is a substring test, so it
            # only removes tokens that appear verbatim inside that constant.
            processed_text = [
                token for token in tokens
                if token not in stop_words
                and token not in string.punctuation
                and len(token) > 1
            ]

            # Drop comments with nothing left unless the caller wants every row
            if not processed_text and not include_all:
                continue

            # 'this' is a placeholder so an empty comment still has one token
            texts.append(processed_text if processed_text else ['this'])
            post_id.append(data[0])

            if labels is not None:
                if data[2] == '1':
                    labels.append(0)
                elif data[3] == '1':
                    labels.append(1)
                elif data[4] == '1':
                    labels.append(2)
                else:
                    raise ValueError('No label specified!')

    return texts, labels, post_id


def words_to_comment_vectors(proc_texts: list, word2vec_model, weighted_avg=False):
    """Turn each tokenized comment into a single embedding vector.

    Every token is looked up in ``word2vec_model.wv`` and the token vectors of
    a comment are combined into one vector: a plain mean by default, or a
    TF-IDF-weighted average when ``weighted_avg`` is True.

    Args:
        proc_texts: List of comments, each a non-empty list of tokens. Every
            token must be present in ``word2vec_model.wv``.
        word2vec_model: Object whose ``wv`` attribute maps a token to its
            embedding vector.
        weighted_avg: If True, weight each token occurrence by its TF-IDF
            value in that comment. The TF-IDF model is fitted on
            ``proc_texts`` itself. Weights are normalized to sum to 1 per
            comment, and a repeated token contributes once per occurrence.

    Returns:
        A NumPy array with one row per comment.
    """
    if weighted_avg:
        # The comments are already tokenized, so pass the token lists straight
        # through instead of joining and re-tokenizing them. This keeps the
        # TF-IDF vocabulary identical to the tokens being looked up below.
        vectorizer = TfidfVectorizer(
            analyzer = lambda tokens: tokens,
            use_idf = True  # TF-IDF rather than plain TF
        )

        # Keep the matrix sparse; only the non-zero entries of a row are needed
        tfidf = vectorizer.fit_transform(proc_texts).tocsr()
        vocabulary = vectorizer.vocabulary_

    vectors = []
    for i, tokens in enumerate(proc_texts):
        embeddings = np.array([word2vec_model.wv[word] for word in tokens])

        if weighted_avg:
            # Non-zero TF-IDF entries of row i, keyed by vocabulary column index
            start, end = tfidf.indptr[i], tfidf.indptr[i + 1]
            row_weights = dict(zip(tfidf.indices[start:end].tolist(),
                                   tfidf.data[start:end].tolist()))

            weight = np.array([row_weights[int(vocabulary[word])] for word in tokens])
            weight = weight / weight.sum()
            vectors.append(np.sum(embeddings * weight.reshape(-1, 1), axis=0))
        else:
            vectors.append(np.mean(embeddings, axis=0))

    return np.array(vectors)


def format_result(y_pred_prob, post_id):
    """Convert class probabilities into the one-hot submission table.

    Args:
        y_pred_prob: Array of shape (n_posts, 3) with one probability per
            class, in the column order Appreciation, Complaint, Feedback.
        post_id: Sequence of post ids, one per row of ``y_pred_prob``.

    Returns:
        A DataFrame with a ``postId`` column followed by the three
        ``*_pred`` indicator columns. Exactly one indicator is 1 in each row.

    Raises:
        ValueError: If ``y_pred_prob`` and ``post_id`` have different lengths.
    """
    y_pred_prob = np.asarray(y_pred_prob)

    # Validate first so a length mismatch produces this message rather than a
    # less specific error from DataFrame construction.
    if y_pred_prob.shape[0] != len(post_id):
        raise ValueError('The shapes of y_pred_prob and post_id do not match')

    # argmax picks a single winner per row; comparing against the row maximum
    # would flag several classes whenever probabilities tie.
    winners = y_pred_prob.argmax(axis=1)
    y_pred_matrix = np.zeros(y_pred_prob.shape, dtype=int)
    y_pred_matrix[np.arange(len(winners)), winners] = 1

    df_post_id = pd.DataFrame({'postId': post_id})
    df_pred = pd.DataFrame(y_pred_matrix, columns=['Appreciation_pred', 'Complaint_pred', 'Feedback_pred'])

    return df_post_id.join(df_pred)

Get labeled and unlabeled data and perform some exploration

In [185]:
# Labeled posts: tokenized comments and integer class labels (post ids are discarded here)
proc_texts, labels, _ = process_text('FB_posts_labeled.txt')

In [186]:
# Unlabeled posts: include_all=True keeps comments that end up empty after preprocessing
# (as the placeholder ['this']), so every post id still receives a prediction
proc_text_unlabeled, _, post_id_test = process_text('FB_posts_unlabeled.txt', include_all=True)

In [187]:
# Number of labeled comments that still have tokens after preprocessing
len(proc_texts)

7960

In [188]:
# Number of unlabeled comments (none are dropped because include_all=True was used)
len(proc_text_unlabeled)

2039

In [189]:
# Peek at the first 50 tokenized labeled comments
proc_texts[:50]

[['great'],
 ['yum', 'yum'],
 ['yummm'],
 ['sweet'],
 ['nice'],
 ['nice'],
 ['winner'],
 ['awesome'],
 ['yay'],
 ['gmo'],
 ['good'],
 ['thanks'],
 ['great'],
 ['thanks'],
 ['thanks'],
 ['like'],
 ['like'],
 ['lame'],
 ['echange'],
 ['like'],
 ['boo'],
 ['nice'],
 ['dislike'],
 ['thanks'],
 ['like'],
 ['like'],
 ['weak'],
 ['boo'],
 ['dislike'],
 ['liars'],
 ['ugh'],
 ['thanks'],
 ['yum'],
 ['yummy'],
 ['mmmmmm'],
 ['yum', 'yum'],
 ['love'],
 ['3v'],
 ['friendly'],
 ['like'],
 ['yea', 'southwest'],
 ['like'],
 ['congrats'],
 ['target'],
 ['like'],
 ['boycott'],
 ['nice'],
 ["love.the.candles.i'm.a.candle.nut", 'love.them'],
 ['great',
  'beauty',
  'deal',
  'week',
  'cvs',
  'buy',
  '20',
  'pert',
  'plus',
  'let',
  "'s",
  'say',
  'price',
  '4.99',
  '4.99+4.99=9.98',
  '9.98-4.99=4.99',
  '4.99-1.50-1.50=1.99',
  '1.99-1.50=0.49',
  'money',
  'maker'],
 ['http', '//184.170.248.140/~kohls/', 'contact', 'real']]

Train a Word2Vec model 

In [190]:
# Train the Word2Vec embedding on all comments, labeled and unlabeled together
model = Word2Vec(sentences = proc_texts + proc_text_unlabeled,  # input should be a list of lists of tokens, like our output from preprocessing
                 vector_size = 128,  # dimension of embedding (this parameter may be named size if you are using an older version of Gensim)
                 window = 2,  # size of context window
                 min_count = 1,  # minimum token frequency; 1 keeps every token, so no word is dropped for being rare
                 sg = 1,  # skip-gram, set to 0 if you want CBOW
                 workers = 4)  # parallel training

In [191]:
# Embedding vector learned for the token 'nice'
model.wv['nice']

array([-0.05998771,  0.03020683, -0.02164491,  0.13270344, -0.02250056,
       -0.24770862, -0.13834651,  0.07529167,  0.02148428, -0.26444113,
        0.37745035, -0.20123658, -0.12367904,  0.18358901,  0.3133722 ,
        0.07842196,  0.2808734 , -0.17672142,  0.3118569 ,  0.29917246,
        0.3401665 ,  0.3653232 , -0.00482662,  0.10006938, -0.42488697,
        0.1306782 , -0.25179565, -0.23342106, -0.05785725, -0.13708434,
        0.19405954, -0.10312083, -0.2809174 ,  0.26817358,  0.12582755,
        0.25577465,  0.16056074, -0.18008949,  0.588236  , -0.00090086,
       -0.41429454,  0.3431835 , -0.1012907 , -0.17796099,  0.0069787 ,
       -0.07736837,  0.0797224 , -0.14349657, -0.3857846 ,  0.20748998,
       -0.08198371,  0.10756581, -0.17060216, -0.07641791,  0.08196544,
        0.02937231,  0.3508993 , -0.02958911, -0.12427397, -0.2592816 ,
        0.10441296, -0.10056689,  0.01867408, -0.40605795,  0.06571198,
        0.46204072, -0.2601128 ,  0.09171007,  0.18541333,  0.31

In [48]:
# Ten nearest neighbours of 'nice' in the embedding space.
# NOTE(review): this cell's execution count (48) predates the training cell (190),
# so the displayed output may come from an earlier model - re-run after training.
model.wv.most_similar(positive = ['nice'], topn=10)

[('sad', 0.9835895895957947),
 ('thought', 0.9786968231201172),
 ('sorry', 0.9782928824424744),
 ('understand', 0.9767194986343384),
 ('upset', 0.9758037328720093),
 ('managers', 0.9751867055892944),
 ('idea', 0.9748938679695129),
 ('im', 0.9748210310935974),
 ('appreciate', 0.9726012945175171),
 ('knew', 0.9714584946632385)]

Use the model to transform each comment into a vector

In [205]:
# One TF-IDF-weighted average embedding per labeled comment
comment_vects = words_to_comment_vectors(proc_texts, model, weighted_avg=True)



In [206]:
# Inspect the resulting array of comment vectors (one row per comment)
comment_vects

array([[-0.20429446, -0.16239633,  0.16595899, ..., -0.44573224,
         0.12689543,  0.40104273],
       [-0.03522908,  0.06352112, -0.05770382, ..., -0.09318248,
         0.14540175,  0.09063055],
       [ 0.00221994, -0.00232594, -0.00434061, ...,  0.00286895,
         0.0038337 , -0.00623277],
       ...,
       [-0.07692992,  0.10988255, -0.0948421 , ..., -0.25526255,
         0.21775983,  0.2974752 ],
       [-0.09168824,  0.10265933, -0.10190153, ..., -0.22839808,
         0.23657685,  0.25559031],
       [-0.06489915,  0.09159425, -0.08760808, ..., -0.22908652,
         0.26196046,  0.25881368]])

Next, I will use the vector representation of the comments to train a LightGBM model

In [208]:
# Hold out 10% of the labeled comment vectors for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    comment_vects,
    np.array(labels),
    test_size=0.10,
    random_state=42,
)

# LightGBM settings for the three-class problem
param = dict(
    num_leaves=20,
    objective='multiclass',
    max_depth=35,
    metric='multi_logloss',
    num_class=3,
)

# Fit the booster on the training portion only
train_data = lgb.Dataset(X_train, label=y_train)
clf = lgb.train(param, train_data, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 7164, number of used features: 128
[LightGBM] [Info] Start training from score -1.359847
[LightGBM] [Info] Start training from score -0.623596
[LightGBM] [Info] Start training from score -1.573654


Make a prediction using the test set

In [209]:
# Predict class probabilities once, then take the most probable class per row
y_pred_prob = clf.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

In [210]:
# Hold-out evaluation: confusion matrix followed by overall accuracy
holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_accuracy = accuracy_score(y_test, y_pred)
print(holdout_confusion)
print('accuracy =', holdout_accuracy)

[[140  61  22]
 [ 41 342  31]
 [ 20  82  57]]
accuracy = 0.6771356783919598


The accuracy on the held-out test set is about 0.68. I will keep the current hyperparameters for now and retrain the model on the entire labeled dataset.

In [211]:
# Refit on every labeled comment, with the same settings as the hold-out run
param = dict(
    num_leaves=20,
    objective='multiclass',
    max_depth=35,
    metric='multi_logloss',
    num_class=3,
)

train_data = lgb.Dataset(comment_vects, label=np.array(labels))
clf = lgb.train(param, train_data, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 7960, number of used features: 128
[LightGBM] [Info] Start training from score -1.350753
[LightGBM] [Info] Start training from score -0.626569
[LightGBM] [Info] Start training from score -1.577297


Next, use the model to predict labels for the unlabeled dataset

In [212]:
# Vectorize the unlabeled comments with the same Word2Vec model.
# NOTE(review): with weighted_avg=True the helper fits a fresh TF-IDF model on the
# corpus it is given, so these weights come from the unlabeled comments, not the
# labeled ones used for training - confirm this is intended.
comment_vects_unl = words_to_comment_vectors(proc_text_unlabeled, model, weighted_avg=True)



In [213]:
# Class probabilities for the unlabeled comments (reuses, and so overwrites, the hold-out y_pred_prob)
y_pred_prob = clf.predict(comment_vects_unl)

In [214]:
# Write the submission file, creating the output directory first so the cell
# also works on a fresh checkout where it does not exist yet
output_path = Path('output/040423-4.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
format_result(y_pred_prob, post_id_test).to_csv(output_path, index=False)

Submitting this result gives an average F1 score of about 0.67, which is fairly low. In the next part, I'll try a transformer model (BERT) on the same task.