In [3]:
import pandas as pd
import numpy as np
import re
import string

In [4]:
# Compiled regex matching one or more consecutive emoji / pictographic code
# points; remove_emojis() substitutes each matched run with a single space.
# Every range below starts at or above U+200D, so ASCII and accented Latin
# letters are never matched.
# NOTE(review): U+24C2-U+1F251 and U+10000-U+10FFFF are far wider than the
# other entries and already contain most of them; they also span every
# non-emoji block lying between those code points (this would include CJK
# text) - confirm that breadth is intended before reusing on other corpora.
emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                u"\U00002702-\U000027B0"  # dingbats
                u"\U000024C2-\U0001F251"  # very broad range, see NOTE above
                u"\U0001f926-\U0001f937"  # subset of the supplementary range below
                u'\U00010000-\U0010ffff'  # every supplementary-plane code point
                u"\u200d"                 # zero-width joiner
                u"\u2640-\u2642"          # gender signs
                u"\u2600-\u2B55"          # misc symbols (inside U+24C2-U+1F251)
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"                 # variation selector-16
    # re.UNICODE is already the default for str patterns in Python 3.
    "]+", flags=re.UNICODE)

# Lookup table: lower-cased abbreviation / chat-slang token -> expanded form.
# normalize_abbreviation() consults it once per whitespace-separated token.
# Every key must appear exactly once: in a dict literal a repeated key is not
# an error, the later value silently replaces the earlier one.
abbrev_dict = {
    'ko': 'không', 'k': 'không', 'khong': 'không', 'kg': 'không', 'hok': 'không',
    'dc': 'được', 'đc': 'được', 'đk': 'được', 'r': 'rồi', 'vs': 'với',
    'j': 'gì', 'bt': 'bình thường', 'mk': 'mình', 'mik': 'mình',
    'cx': 'cũng', 'bn': 'bạn', 'b': 'bạn', 'tks': 'cảm ơn', 'thanks': 'cảm ơn',
    'nchung': 'nói chung', 'nhìu': 'nhiều', 'nhieu': 'nhiều',
    'cty': 'công ty', 'nt': 'nhắn tin', 'sp': 'sản phẩm',
    # 'sz' maps to 'cỡ' (not the untranslated 'size'): this is the value that
    # was effective at runtime when the key was listed twice.
    'i': 'giống', 'sz': 'cỡ', 'sdt': 'số điện thoại',
    'ok': 'tốt', 'oke': 'tốt', 'okey': 'tốt', 'okie': 'tốt',
    'okeyy': 'tốt', 'okiee': 'tốt', 'thik': 'thích',
    'thix': 'thích', 'ib': 'nhắn tin', 'ibx': 'nhắn tin',
    'inbox': 'nhắn tin', 'inb': 'nhắn tin', 'inbx': 'nhắn tin',
    'mn': 'mọi người', 'mng': 'mọi người',
    'sr': 'xin lỗi', 'sorry': 'xin lỗi', 'éo': 'không', 'kh': 'không',
    'rep': 'trả lời', 'ship': 'giao hàng', 'h': 'giờ',
    'lm': 'làm', 'rùi': 'rồi', 'tl': 'trả lời',
}

In [5]:
def normalize_abbreviation(text):
    """Expand known abbreviations in ``text`` token by token.

    The text is split on whitespace; each token is lower-cased for the lookup
    in ``abbrev_dict``. A token with an entry is replaced by its expansion, a
    token without one is kept exactly as written. Tokens are re-joined with
    single spaces, so surrounding and repeated whitespace is not preserved.
    """
    expanded = []
    for token in text.split():
        expanded.append(abbrev_dict.get(token.lower(), token))
    return ' '.join(expanded)

In [6]:
def lowercase_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_emojis(text):
    """Replace every run of characters matched by ``emoji_pattern`` with one space."""
    return emoji_pattern.sub(" ", text)

def remove_consecutive_chars(text):
    """Collapse each run of one repeated character into a single occurrence.

    Handles elongated words and repeated punctuation ("okkkk!!!" -> "ok!").
    Digits are deliberately excluded so that numbers keep their value:
    collapsing "100k" to "10k" or "44" to "4" would corrupt prices and sizes.
    Newline characters are never collapsed.

    Args:
        text: input string.

    Returns:
        The string with repeated non-digit, non-newline characters squeezed.
    """
    # [^\d\n]: any character except a digit or a newline; \1+ = its repeats.
    return re.sub(r'([^\d\n])\1+', r'\1', text)

def handle_punctuation_spacing(text):
    """Put single spaces around punctuation that touches a word character.

    Two passes are applied:
      1. word, punctuation, word -> "word <p> word" (any whitespace already
         around the punctuation is normalised to one space on each side);
      2. word, punctuation       -> "word <p>" (space inserted before only).

    Args:
        text: input string.

    Returns:
        The string with punctuation separated from adjacent words.
    """
    # re.escape is required: interpolated raw, the backslash contained in
    # string.punctuation escapes the following "]" inside the character class
    # and is therefore never matched itself.
    punct_class = "[" + re.escape(string.punctuation) + "]"
    # The trailing word character is only looked at, not consumed, so it can
    # start the next match; otherwise "a,b,c" would come out as "a , b ,c".
    text = re.sub(r"(\w)\s*(" + punct_class + r")\s*(?=\w)", r"\1 \2 ", text)
    text = re.sub(r"(\w)\s*(" + punct_class + r")", r"\1 \2", text)
    return text

def remove_multiple_punctuations(text):
    """Collapse a run of two or more punctuation characters to its first one.

    The characters in the run do not have to be identical: ".,..." becomes
    "." and "!?" becomes "!". A single punctuation character is left alone.

    Args:
        text: input string.

    Returns:
        The string with every punctuation run reduced to one character.
    """
    # Escape before building the class so that every character of
    # string.punctuation (including the backslash) is matched literally.
    punct_class = "[" + re.escape(string.punctuation) + "]"
    return re.sub("(" + punct_class + ")" + punct_class + "+", r"\1", text)

def strip_outer_chars(text):
    """Remove leading and trailing whitespace and ASCII punctuation.

    First strips surrounding whitespace, then strips any mix of
    ``string.punctuation`` and ``string.whitespace`` characters from both
    ends. Characters in the interior of the text are untouched.

    Args:
        text: input string.

    Returns:
        The trimmed string; empty if the input held only such characters.
    """
    # str.strip(chars) trims both ends in a single pass, replacing a loop that
    # re-sliced the string once per removed character.
    return text.strip().strip(string.punctuation + string.whitespace)

def normalize_whitespace(text):
    """Replace every run of whitespace characters with a single space.

    Any whitespace counts (spaces, tabs, newlines), and a run of length one is
    replaced too. Leading or trailing whitespace becomes one space; it is not
    removed.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def clean_text(text):
    """Run the full review-cleaning pipeline on one string.

    The steps are applied in this fixed order, each receiving the previous
    step's output: lower-casing, emoji removal, repeated-character collapsing,
    punctuation spacing, punctuation-run collapsing, outer trimming,
    whitespace normalisation and finally abbreviation expansion.
    """
    pipeline = (
        lowercase_text,
        remove_emojis,
        remove_consecutive_chars,
        handle_punctuation_spacing,
        remove_multiple_punctuations,
        strip_outer_chars,
        normalize_whitespace,
        normalize_abbreviation,
    )
    for step in pipeline:
        text = step(text)
    return text

In [7]:
def is_junk_review(text, threshold=25):
    """Tell whether ``text`` is one unbroken token longer than ``threshold``.

    Returns True only when the text contains no space character and its
    length is strictly greater than ``threshold``; False otherwise.
    """
    has_no_space = ' ' not in text
    return has_no_space and len(text) > threshold

def remove_junk_reviews(df, threshold=25):
    """Drop rows whose 'reviewContent' is flagged by is_junk_review().

    Prints the row counts before and after filtering and displays up to five
    of the removed rows for inspection.

    Args:
        df: DataFrame with a 'reviewContent' column of review strings.
        threshold: length limit forwarded to is_junk_review(); a review with
            no space that is longer than this is treated as junk.

    Returns:
        A copy of ``df`` without the junk rows. ``df`` itself is not modified.
    """
    # Forward the caller's threshold (Series.apply passes extra keyword
    # arguments on to the function); without it the default is always used.
    junk_mask = df['reviewContent'].apply(is_junk_review, threshold=threshold)

    initial_rows = df.shape[0]
    df_filtered = df[~junk_mask].copy()
    final_rows = df_filtered.shape[0]

    print(f"Number of rows before filtering: {initial_rows}")
    print(f"Number of junk rows removed: {initial_rows - final_rows}")
    print(f"Number of rows after filtering: {final_rows}")

    print("\nSample of removed junk rows:")
    # display() is the notebook's rich-output helper (IPython).
    display(df[junk_mask][['reviewContent']].head())

    return df_filtered

In [9]:
# Batch driver: processes processed_reviews_1.xlsx .. processed_reviews_11.xlsx
# from the working directory and writes one cleaned workbook per input file.
for i in range(11):
  # Input files are numbered from 1, hence i+1.
  df = pd.read_excel(f'processed_reviews_{i+1}.xlsx')
  # Drops junk rows; also prints the row counts and displays a sample of the
  # removed rows, which is the output shown below this cell.
  df = remove_junk_reviews(df)
  # Replace each remaining review with its cleaned form.
  df['reviewContent'] = df['reviewContent'].apply(clean_text)
  output_file_name = f"test_flow_reviews_{i+1}.xlsx"
  # index=False: the DataFrame index is not written as a column.
  df.to_excel(output_file_name, index=False)

Number of rows before filtering: 1021
Number of junk rows removed: 5
Number of rows after filtering: 1016

Sample of removed junk rows:


Unnamed: 0,reviewContent
3,1krkfjjdiwufucidiwid8v8v88s8w8d8b8f8w88f8v8w8d...
44,1krkfjjdiwufucidiwid8v8v88s8w8d8b8f8w88f8v8w8d...
85,1krkfjjdiwufucidiwid8v8v88s8w8d8b8f8w88f8v8w8d...
254,jjnnnbbhjjjjjjnnfhfhfhhffnthjfjfjfjfjtjfjfjjfj...
304,jjnnnbbhjjjjjjnnfhfhfhhffnthjfjfjfjfjtjfjfjjfj...


Number of rows before filtering: 1000
Number of junk rows removed: 6
Number of rows after filtering: 994

Sample of removed junk rows:


Unnamed: 0,reviewContent
37,.................................................
87,.................................................
122,sjajhhsbsbsusisisusjjsjdhdhdjsjsjsnsnsjsjsjjan...
123,jsjsjsjajajajjajjajajjajajjajjajajjajjajajjaja...
172,sjajhhsbsbsusisisusjjsjdhdhdjsjsjsnsnsjsjsjjan...


Number of rows before filtering: 1026
Number of junk rows removed: 0
Number of rows after filtering: 1026

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 1000
Number of junk rows removed: 2
Number of rows after filtering: 998

Sample of removed junk rows:


Unnamed: 0,reviewContent
243,"ok.,............................................."
293,"ok.,............................................."


Number of rows before filtering: 996
Number of junk rows removed: 2
Number of rows after filtering: 994

Sample of removed junk rows:


Unnamed: 0,reviewContent
17,"okeeeeeeeeeeeeeee..........................,,,..."
67,"okeeeeeeeeeeeeeee..........................,,,..."


Number of rows before filtering: 1000
Number of junk rows removed: 0
Number of rows after filtering: 1000

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 1000
Number of junk rows removed: 0
Number of rows after filtering: 1000

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 1000
Number of junk rows removed: 0
Number of rows after filtering: 1000

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 998
Number of junk rows removed: 0
Number of rows after filtering: 998

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 1000
Number of junk rows removed: 0
Number of rows after filtering: 1000

Sample of removed junk rows:


Unnamed: 0,reviewContent


Number of rows before filtering: 498
Number of junk rows removed: 6
Number of rows after filtering: 492

Sample of removed junk rows:


Unnamed: 0,reviewContent
225,0k9k0k0k0k0k0kj0kok0k0kk0kk0kj0joj9j9j9k9niono...
274,0k9k0k0k0k0k0kj0kok0k0kk0kk0kj0joj9j9j9k9niono...
307,ghgghhhhjjjjhhhhshbshshshshshshhshehehehehhehe...
326,hog99h9hjhttuiituuii9h9hviiivviviivivifjyoccyo...
357,ghgghhhhjjjjhhhhshbshshshshshshhshehehehehhehe...
