In [11]:
import re
import pandas as pd
import numpy as np
import datetime
import time

import unicodedata
import emoji

import matplotlib.pyplot as plt

import helpers as helper
# Never truncate long text cells when a DataFrame is rendered.
pd.set_option(
    'display.max_colwidth',
    None,
)

In [12]:
# Read the preprocessed Excel workbook (download it from SharePoint first) from a
# local OneDrive folder into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell only
# runs on a machine with this exact folder layout — consider a configurable data
# directory instead.
data_file_path = "C:\\Users\\xtanl\\OneDrive - Singapore Management University\\Capstone\\inputs\\preprocessed_230604.xlsx"
data = pd.read_excel(data_file_path)

## Rule-Based Classification

### Pre-processing

In [13]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched.
data_df = data.copy(deep=True)

In [14]:
# Re-run the shared text cleaning helper on every value of the content column
data_df['cleaned_text'] = data_df['content'].apply(helper.text_cleaning)
# Strip emojis from the cleaned text
data_df['remove_emojis'] = helper.remove_emojis(data_df['cleaned_text'])
# Strip Chinese text from the emoji-free text
data_df['remove_chinese'] = helper.remove_chinese(data_df['remove_emojis'])

In [None]:
# Preview the first row of data_df.
data_df.head(1)

### Extract hyperlinks

In [30]:
def extract_hyperlinks(data_df, column):
    """Find every URL-like substring in each row of a text column.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the text to scan.
    column : str
        Name of the column in ``data_df`` whose values are scanned.

    Returns
    -------
    list of list of str
        One inner list per row, in row order, containing the substrings
        matched by ``URL_REGEX``. Rows whose value is not a string (e.g. NaN
        or None from an empty cell) yield an empty list instead of raising
        ``TypeError``.
    """
    # Three kinds of match: links with an (optional) https/ftp/http scheme
    # followed by ':', "<host>.com|org|uk/<path>" links, and bare domains
    # ending in .com/.uk/.ac that are not immediately preceded or followed
    # by '@'.
    # Note: no re.IGNORECASE flag is set, and the scheme and bare-domain
    # branches only list lowercase letters.
    URL_REGEX = r"""((?:(?:https|ftp|http)?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|org|uk)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|uk|ac)\b/?(?!@)))"""
    url_pattern = re.compile(URL_REGEX)

    # Iterate the column directly: a row-wise ``iloc`` lookup builds a whole
    # row Series on every iteration. The pattern has a single capturing group
    # spanning the entire match, so ``findall`` returns plain strings.
    return [
        url_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

In [31]:
# Store, per row, the list of hyperlinks found in the fully cleaned text.
data_df['hyperlinks'] = extract_hyperlinks(data_df, column='remove_chinese')

### Extract email addresses

In [32]:
def extract_emails(data_df, column):
    """Find every e-mail-like substring in each row of a text column.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the text to scan.
    column : str
        Name of the column in ``data_df`` whose values are scanned.

    Returns
    -------
    list of list of str
        One inner list per row, in row order, containing the substrings
        matched by ``EMAIL_REGEX``. Rows whose value is not a string (e.g. NaN
        or None from an empty cell) yield an empty list instead of raising
        ``TypeError``.
    """
    # local-part @ domain-label . rest; the trailing class also accepts '.'
    # and '-', so a sentence-ending period directly after an address is
    # included in the match.
    EMAIL_REGEX = r'[\w.+-]+@[\w-]+\.[\w.-]+'
    email_pattern = re.compile(EMAIL_REGEX)

    # Iterate the column directly: a row-wise ``iloc`` lookup builds a whole
    # row Series on every iteration.
    return [
        email_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

In [33]:
# Store, per row, the list of e-mail addresses found in the fully cleaned text.
data_df['emails'] = extract_emails(data_df, column='remove_chinese')

### Extract Approval Codes

In [35]:
def extract_codes(data_df, column):
    """Find every approval-code-like substring in each row of a text column.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the text to scan.
    column : str
        Name of the column in ``data_df`` whose values are scanned.

    Returns
    -------
    list of list of str
        One inner list per row, in row order, containing the substrings
        matched by ``APPV_REGEX``. Rows whose value is not a string (e.g. NaN
        or None from an empty cell) yield an empty list.
    """
    # One lowercase letter immediately followed by five digits, e.g. "a12345".
    # NOTE(review): the pattern has no word boundaries, so a longer run such
    # as "xa123456" still yields "a12345" — confirm this is intended.
    APPV_REGEX = r'[a-z][\d]{5}'
    code_pattern = re.compile(APPV_REGEX)

    # Match with the approval-code pattern defined above. EMAIL_REGEX is not
    # defined in this function's scope, so referencing it here raises
    # NameError on the first row.
    return [
        code_pattern.findall(text) if isinstance(text, str) else []
        for text in data_df[column]
    ]

In [36]:
# Populate approval codes with the approval-code extractor; calling
# extract_emails here would only duplicate the contents of the 'emails' column.
data_df['approval_codes'] = extract_codes(data_df, 'remove_chinese')

### Indicator columns

In [47]:
# Indicator: 0 when the row's extracted e-mail list is empty, 1 otherwise.
data_df['has_emails'] = np.where(data_df["emails"].str.len().eq(0), 0, 1)

In [48]:
# Indicator: 0 when the row's extracted approval-code list is empty, 1 otherwise.
data_df['has_approvalcode'] = np.where(data_df["approval_codes"].str.len().eq(0), 0, 1)

In [49]:
# Indicator: 0 when the row's extracted hyperlink list is empty, 1 otherwise.
data_df['has_hyperlinks'] = np.where(data_df["hyperlinks"].str.len().eq(0), 0, 1)