In [None]:
import re
import pandas as pd
import numpy as np
import datetime
import time

import unicodedata
import emoji

import string
from itertools import permutations, combinations
from nltk import WordPunctTokenizer, BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder

import matplotlib.pyplot as plt

import helpers as helper
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the pre-processed dataset from a local Excel workbook.
# NOTE(review): nothing in this cell downloads anything -- the workbook is read
# from an absolute, user-specific OneDrive folder, so the file presumably has to
# be synced from SharePoint/OneDrive first and the cell only runs on a machine
# with this exact folder layout. TODO: move the location into a configurable
# data directory.
data_file_path = "C:\\Users\\xtanl\\OneDrive - Singapore Management University\\Capstone\\inputs\\preprocessed_230604.xlsx"
data = pd.read_excel(data_file_path)

## Rule Based Classification

### Pre-processing

In [None]:
# Work on a copy so the frame read from disk (`data`) is left untouched by the
# columns added in the cells below.
data_df = data.copy()

In [None]:
# Step 1 -- run the shared text cleaner over every value of the `content` column
data_df['cleaned_text'] = data_df['content'].apply(helper.text_cleaning)
# Step 2 -- strip emojis from the cleaned text
data_df['remove_emojis'] = helper.remove_emojis(data_df['cleaned_text'])
# Step 3 -- strip Chinese text from the emoji-free text
data_df['remove_chinese'] = helper.remove_chinese(data_df['remove_emojis'])

In [None]:
# Preview a single row to eyeball the columns produced by the cleaning steps
data_df.head(1)

### Get list of terms and synthesize new words

In [None]:
def permutate(single_string):
    """
    Build every ordering of the terms found in a string.

    INPUT
    single_string: a string of different terms, separated by spaces, round
                   brackets or pipe characters
    OUTPUT
    returns a set holding one space-joined string per permutation of the terms;
    a string that contains no term at all yields an empty set

    NOTE: the number of results grows factorially with the number of terms.
    """
    # split string by space, brackets or pipe. A delimiter at the very start or
    # end of the string (e.g. the closing bracket in "Plus (Foreigner)") makes
    # re.split emit an empty string; drop those so they are not permuted as if
    # they were terms (which would produce stray leading/trailing/double spaces)
    string_list = [term for term in re.split(r"[()| ]+", single_string) if term]
    if not string_list:
        return set()

    # permutations() defaults to full-length orderings
    return {" ".join(items) for items in permutations(string_list)}

In [None]:
# Example: every ordering of the terms in a product name that contains brackets
permutate("PruShield Premium Plus (Foreigner)")

In [None]:
def get_combinations(single_string):
    """
    Build every variant of a string of terms that leaves exactly one term out.

    INPUT
    single_string: a string of different terms
    OUTPUT
    returns a set of space-joined strings, each made of all but one of the
    original terms (original order kept); the set is empty when the string
    holds fewer than two terms, because no shorter phrase exists then
    """
    # remove punctuation (the characters are deleted, not replaced by a space,
    # so terms separated only by punctuation are merged)
    single_string = re.sub('[%s]' % re.escape(string.punctuation), "", single_string)
    # split string by space or brackets; leading/trailing delimiters make
    # re.split emit empty strings, which are dropped so they do not count as terms
    string_list = [term for term in re.split(r"[()| ]+", single_string) if term]
    if len(string_list) < 2:
        return set()

    return {" ".join(items) for items in combinations(string_list, len(string_list) - 1)}

In [None]:
# Example: the phrases obtained by leaving one term out of a product name
get_combinations("PruShield Premium Plus (Foreigner)")

In [None]:
def synthesize_words(word_list, top_n=500, include_quadgrams=False):
    """
    Expand a list of phrases into a larger set of candidate phrases.

    All phrases are lower-cased and joined into one token stream, so the
    n-grams found in it can also span two neighbouring phrases. Every n-gram
    is added as a phrase together with all re-orderings of its words.

    INPUT
    word_list: iterable of words / phrases
    top_n: number of n-grams (ranked by chi-square) kept per n-gram size
    include_quadgrams: also add 4-word phrases and their re-orderings; off by
                       default, which reproduces the bigram + trigram output
    OUTPUT
    returns a set made of the lower-cased input phrases, the selected n-gram
    phrases and their permutations; an empty input yields an empty set
    """
    # Lowercase the phrases and join them into a single string (done once,
    # after the whole list is known)
    lowercase_list = [words.lower() for words in word_list]
    if not lowercase_list:
        # nothing to tokenize -- avoid running the collocation finders on no data
        return set()
    joined_string = ' '.join(lowercase_list)

    tokens = WordPunctTokenizer().tokenize(joined_string)

    # One (finder, association measures) pair per n-gram size to synthesize
    ngram_sources = [
        (BigramCollocationFinder, BigramAssocMeasures),
        (TrigramCollocationFinder, TrigramAssocMeasures),
    ]
    if include_quadgrams:
        ngram_sources.append((QuadgramCollocationFinder, QuadgramAssocMeasures))

    synthesized = set(lowercase_list)
    for finder_cls, measures in ngram_sources:
        finder = finder_cls.from_words(tokens)
        # keep the top_n n-grams with the highest chi-square score
        for ngram in finder.nbest(measures.chi_sq, top_n):
            phrase = ' '.join(ngram)
            synthesized.add(phrase)
            # further permutate the phrase -- switching the order of its words
            synthesized.update(permutate(phrase))

    # unique set of phrases
    return synthesized

In [None]:
# Seed phrases that are expanded into the list of prohibited terms
list_prob_words = [
    "financial advisor",
    "financial adviser",
    "risk advisor",
    "risk adviser",
    "medical adviser",
    "insurance agent",
]

In [None]:
# Prohibited terms: expand the seed phrases into a set that also contains the
# n-gram phrases found across them and the re-orderings of those phrases
list_words = synthesize_words(list_prob_words)

In [None]:
# Display the synthesized set of prohibited terms for a manual review
list_words

### Extract hyperlinks

In [None]:
def extract_hyperlinks(data_df, column):
    """
    Find every hyperlink in each row of a text column.

    INPUT
    data_df: DataFrame holding the text to scan
    column: name of the column of data_df that contains the text
    OUTPUT
    returns a list with one entry per row, in row order; each entry is the list
    of hyperlinks found in that row's text (empty when there is none, or when
    the value is not a string, e.g. a missing value)
    """
    # Roughly: (a) URLs introduced by an optional https/ftp/http scheme and ":" or
    # by a bare .com/.org/.uk domain followed by "/", or (b) bare domains ending
    # in .com/.uk/.ac that are not attached to an "@". Only lower-case letters
    # are matched, so the text is expected to be lower-cased already.
    URL_REGEX = r"""((?:(?:https|ftp|http)?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|org|uk)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|uk|ac)\b/?(?!@)))"""
    url_pattern = re.compile(URL_REGEX)

    # extract hyperlinks -- iterate over the column itself instead of building a
    # full row with .iloc for every record; non-string cells have no links
    return [
        url_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

### Extract email addresses

In [None]:
def extract_emails(data_df, column):
    """
    Find every e-mail address in each row of a text column.

    INPUT
    data_df: DataFrame holding the text to scan
    column: name of the column of data_df that contains the text
    OUTPUT
    returns a list with one entry per row, in row order; each entry is the list
    of e-mail addresses found in that row's text (empty when there is none, or
    when the value is not a string, e.g. a missing value)
    """
    # local part, "@", domain label, ".", rest of the domain
    EMAIL_REGEX = r'[\w.+-]+@[\w-]+\.[\w.-]+'
    email_pattern = re.compile(EMAIL_REGEX)

    # extract emails -- iterate over the column itself instead of building a
    # full row with .iloc for every record; non-string cells have no addresses
    return [
        email_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

### Extract Approval Codes

In [None]:
def extract_codes(data_df, column):
    """
    Find every approval code in each row of a text column.

    INPUT
    data_df: DataFrame holding the text to scan
    column: name of the column of data_df that contains the text
    OUTPUT
    returns a list with one entry per row, in row order; each entry is the list
    of approval codes found in that row's text (empty when there is none, or
    when the value is not a string, e.g. a missing value)
    """
    # one lower-case letter followed by exactly five digits; the pattern has no
    # word boundaries, so it also matches inside a longer alphanumeric run
    APPV_REGEX = r'[a-z][\d]{5}'
    code_pattern = re.compile(APPV_REGEX)

    # extract approval codes -- search with the approval-code pattern defined
    # above (not the e-mail pattern, which does not exist in this scope)
    return [
        code_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

In [None]:
# Run each extractor over the fully cleaned text; every new column holds one
# list of matches per row
data_df['emails'] = extract_emails(data_df, 'remove_chinese')
data_df['hyperlinks'] = extract_hyperlinks(data_df, 'remove_chinese')
# Approval codes need their own extractor -- calling extract_emails here would
# only duplicate the 'emails' column
data_df['approval_code'] = extract_codes(data_df, 'remove_chinese')

### Indicator columns

In [None]:
# Binary flags: 0 when the row's list of matches is empty, 1 otherwise
data_df['has_emails'] = np.where(data_df["emails"].str.len().eq(0), 0, 1)
data_df['has_approvalcode'] = np.where(data_df["approval_code"].str.len().eq(0), 0, 1)
data_df['has_hyperlinks'] = np.where(data_df["hyperlinks"].str.len().eq(0), 0, 1)

In [None]:
# Preview a single row to check the extracted lists and their indicator flags
data_df.head(1)