In [1]:
import re
import pandas as pd
import numpy as np
import datetime
import time

import unicodedata
import emoji

import matplotlib.pyplot as plt


import warnings
pd.set_option('display.max_colwidth', None)

In [2]:
# Download datafile from sharepoint
data_file_path = "C:\\Users\\xtanl\\OneDrive - Singapore Management University\\Capstone\\inputs\\preprocessed_230604.xlsx"
data = pd.read_excel(data_file_path)

In [3]:
data.head(1)

Unnamed: 0,name,content,breach,non_compliant,hashtags,mentions,emojis,emojis_text,breach_flagwords,breach_hashes,cleaned_text
0,Nicholas Goh Organisation,"Follow us at NGO’s BKK FastTrack 2022! In celebrating with the qualifiers, we had a lot of fun. Our team really bonded while traveling, eating, and shopping together! A true companion is one that travels together and stays together! It was 100% more enjoyable to spend time traveling together than going alone! Once again, congratulations to all the qualifiers!",No further action required,0,[],[],,,False,False,"['Follow', 'us', 'NGO’s', 'BKK', 'FastTrack', 'In', 'celebrating', 'qualifiers', 'lot', 'fun', 'Our', 'team', 'really', 'bonded', 'traveling', 'eating', 'shopping', 'together', 'A', 'true', 'companion', 'one', 'travels', 'together', 'stays', 'together', 'It', 'enjoyable', 'spend', 'time', 'traveling', 'together', 'going', 'alone', 'Once', 'congratulations', 'qualifiers']"


### Pre-processing

In [4]:
#pd.DataFrame(data_df.iloc[37]).transpose() #23 - chinese, 37 emoji

In [5]:
data_df = data.copy()

In [6]:
import string
from nltk.corpus import stopwords

def text_cleaning(text):
    # remove tags
    text = re.sub("@\S+", "", str(text))
    # remove websites
    text = re.sub("https*\S+", "", str(text))
    # remove hashtags
    text = re.sub("#\S+", "", str(text))
    # remove apostrophes eg. abc's
    text = re.sub("\'\w+", "", str(text))
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), "", str(text))
    # remove numbers
    text = re.sub(r'\w*\d+\w*', "", str(text))
    # lowercase -- to remove stopwords
    text = text.lower()
    # remove stopwords
    default_stopwords = stopwords.words('english')
    text = " ".join(word for word in text.split() if word not in default_stopwords)
    
    # remove spaces more than " "
    #text = re.sub('\s{2,}', " ", text)

    return [word for word in text.split()]

In [7]:
default_stopwords = stopwords.words('english')
#pd.Series(default_stopwords)[pd.Series(default_stopwords).str.contains('our')]

In [8]:
def remove_chinese(cleaned):
    """
    cleaned: data_df['cleaned_text']
    """
    
    no_chinese = []
    CHINESE_REGEX = r'[\u4e00-\u9fff]+' #r'[^\x00-\x7F]+'

    for post in cleaned:
        new_post = []
        for term in post:
            term.replace(CHINESE_REGEX, '')
            if re.findall(CHINESE_REGEX, term) == []: # find chinese chars
                new_post.append(term)
        no_chinese.append(new_post)
        
    return no_chinese

In [9]:
import re
def remove_emojis(cleaned):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    
    no_emoji = []
    
    for post in cleaned:
        new_post = []
        for term in post:
            term = re.sub(emoj, '', term)
            new_post.append(term)
            
        no_emoji.append(new_post)
        
    return no_emoji

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer

def normalized_corpus(corpus):
    
    lemma = WordNetLemmatizer()
    # Normalize a list of words     
    normalized = " ".join(lemma.lemmatize(word) for word in corpus)
    # remove stopwords
    default_stopwords = stopwords.words('english')
    text = " ".join(word for word in normalized.split() if word not in default_stopwords)
    
    return [word for word in text.split()]

In [11]:
# Re-Clean text
data_df['cleaned_text'] = data_df.content.apply(lambda x: text_cleaning(x))
# Remove chinese
data_df['remove_chinese'] = remove_chinese(data_df['cleaned_text'])
# Remove emojis
data_df['remove_emojis'] = remove_emojis(data_df['remove_chinese'])

In [12]:
# Lemmatize text
data_df['normalised_text'] = data_df['remove_emojis'].apply(lambda x: normalized_corpus(x))
# Convert to a list of list of lemmatized words
doc_clean = [text for text in data_df['normalised_text']] 

### Document Term Matrix

In [13]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [26]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
NUM_TOPICS = 3
ldamodel = Lda(doc_term_matrix, num_topics=NUM_TOPICS, id2word = dictionary, passes=50)

In [27]:
NUM_WORDS = 10

for topic in range(len(ldamodel.print_topics(num_words=NUM_WORDS))):
    print(ldamodel.print_topics(num_words=NUM_WORDS)[topic], "\n")

(0, '0.020*"insurance" + 0.012*"financial" + 0.010*"plan" + 0.010*"life" + 0.008*"love" + 0.008*"income" + 0.008*"family" + 0.007*"coverage" + 0.007*"money" + 0.007*"way"') 

(1, '0.009*"thing" + 0.008*"happy" + 0.007*"yes" + 0.007*"great" + 0.007*"gst" + 0.007*"million" + 0.007*"dollar" + 0.005*"table" + 0.005*"achievement" + 0.005*"never"') 

(2, '0.012*"thank" + 0.012*"year" + 0.011*"birthday" + 0.011*"day" + 0.009*"special" + 0.009*"wish" + 0.008*"ngo" + 0.008*"u" + 0.007*"made" + 0.007*"one"') 



In [28]:
#print(ldamodel.print_topics(num_words=10)[0]) #num_topics=10, 

In [38]:
# Get feature vector
train_vecs = []
for i in range(len(data_df)):
    top_topics = ldamodel.get_document_topics(train_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(NUM_TOPICS)]
    topic_vec.extend([len(data_df.iloc[i].cleaned_text)]) # length review
    train_vecs.append(topic_vec)

### Bigrams

In [29]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

In [30]:
def get_corpus(content):
    words = [text for text in content] 
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[post] for post in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [31]:
train_corpus, train_id2word, bigram_train = get_corpus(data_df['normalised_text'])

### Classification Model

In [39]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import linear_model

from sklearn.metrics import f1_score

In [40]:
X = np.array(train_vecs)
y = np.array(data_df.non_compliant)

kf = KFold(5, shuffle=True, random_state=42)
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))
    
    # Logistic Regression SGD
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log_loss',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='binary'))
    
    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced'
    ).fit(X_train_scale, y_train)
    
    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
print(f'Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}')
print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')

Logistic Regression Val f1: 0.247 +- 0.106
Logisitic Regression SGD Val f1: 0.527 +- 0.203
SVM Huber Val f1: 0.227 +- 0.187
