# Pre-Processing 

## 1. Setup and Imports

In [22]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import warnings
import re
import emoji
from collections import Counter

warnings.filterwarnings('ignore')
nltk.download('punkt', quiet=True)

True

## 2. Data Import and Initial Processing

The data import and initial processing stage focuses on loading the feedback data from a CSV file and performing preliminary cleaning. It reads the data into a pandas DataFrame, prints the initial number of entries, removes duplicate entries based on 'GeneralFeedbackID' while keeping the first occurrence, converts the 'Content' column to string type, and prints the number of unique entries. This step ensures that the dataset is prepared for more detailed processing in the following stages.

In [23]:
df = pd.read_csv('data/feedback_detail.csv', encoding='utf-8-sig')
print('Number of records:', len(df))

Number of records: 457415


In [24]:
df = df[['GeneralFeedbackID', 'Content']].drop_duplicates(subset='GeneralFeedbackID', keep='first')
df['Content'] = df['Content'].astype(str)
print('Number of records:', len(df))

Number of records: 21880


## 3. Tokenization

The tokenization section breaks down the feedback content into smaller, meaningful segments. It defines two key functions: 'clean_and_tokenize' which splits text by commas and periods while cleaning each segment, and 'tokenize_content' which applies this tokenization to each row of the dataframe. The process then explodes the tokenized content into separate rows and adds an 'ID' column to number the segments within each 'GeneralFeedbackID'. This step is crucial for transforming the raw feedback text into a format more suitable for analysis.

Currently we have the data in the format of MasterID, FullText

| MasterID | FullText |
|----------|----------|
| 1        | This is a sentence, and this is another sentence. This is the third sentence. |

We will tokenize the sentences and reconstruct the data to the format of MasterID, ID, Text

| MasterID | ID | Text |
|----------|----|------|
| 1        | 0  | this is a sentence |
| 1        | 1  | and this is another sentence |
| 1        | 2  | this is the third sentence |

In [25]:
def clean_and_tokenize(text):
    text = text.strip('"')
    segments = re.split(r'[,.]', text)
    cleaned_segments = [segment.strip().rstrip(',') for segment in segments if segment.strip()]
    return cleaned_segments

def tokenize_content(row):
    try:
        segments = clean_and_tokenize(row['Content'])
        return pd.Series({
            'GeneralFeedbackID': row['GeneralFeedbackID'],
            'Content': segments
        })
    except Exception as e:
        print(f"Error processing row {row['GeneralFeedbackID']}: {e}")
        return pd.Series({
            'GeneralFeedbackID': row['GeneralFeedbackID'],
            'Content': [row['Content']] 
        })

In [26]:
df_tokenized = df.apply(tokenize_content, axis=1)
df_exploded = df_tokenized.explode('Content').reset_index(drop=True)
df_exploded['ID'] = df_exploded.groupby('GeneralFeedbackID').cumcount()
df = df_exploded[['GeneralFeedbackID', 'ID', 'Content']]

In [27]:
print('Number of records:', len(df))

Number of records: 37699


## 4. Text Cleaning and Filtering

This comprehensive section focuses on cleaning the text data and filtering out meaningless content. It defines several functions to remove unwanted elements such as URLs, HTML tags, special characters, hashtags, and phone numbers. The 'is_meaningful' function checks if the text is meaningful based on length, repetition, and pattern criteria. The 'clean_text' function applies all cleaning operations to the text. These functions are then applied to the 'Content' column, removing rows with non-meaningful content and any remaining empty rows. This step ensures that the final dataset contains only clean, meaningful feedback entries.

In [28]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_html_tags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_special_characters(text):
    pattern = re.compile(r'[^a-zA-Z0-9\s\u00C0-\u1EF9.,!?]')
    return pattern.sub(r'', text)

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

def remove_phone_numbers(text):
    return re.sub(r'\b(?:\+?84|0)(?:\d{9,10})\b', '[PHONE]', text)

def is_meaningful(text, min_length=2, max_length=200, max_repetition_ratio=0.5, max_consonant_streak=5):
    cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
    
    if len(cleaned_text) < min_length or len(cleaned_text) > max_length:
        return False
    
    if re.search(r'[bcdfghjklmnpqrstvwxyz]{' + str(max_consonant_streak) + ',}', cleaned_text):
        return False
    
    char_counts = Counter(cleaned_text)
    most_common_char_count = char_counts.most_common(1)[0][1]
    repetition_ratio = most_common_char_count / len(cleaned_text)
    
    if repetition_ratio > max_repetition_ratio:
        return False
    
    for pattern_length in range(2, 6):
        for i in range(len(cleaned_text) - pattern_length * 2):
            pattern = cleaned_text[i:i+pattern_length]
            if pattern == cleaned_text[i+pattern_length:i+pattern_length*2]:
                return False
    
    return True

def clean_text(text):
    text = remove_urls(text)
    text = remove_html_tags(text)
    text = remove_hashtags(text)
    text = remove_phone_numbers(text)
    text = remove_special_characters(text)
    text = remove_extra_spaces(text)
    text = text.lower()
    text = emoji.demojize(text)
    return text

In [29]:
df['Content'] = df['Content'].astype(str)
df['Content'] = df['Content'].apply(clean_text)
df['is_meaningful'] = df['Content'].apply(is_meaningful)
df = df[df['is_meaningful']]
df = df.drop(columns=['is_meaningful'])
df = df[df['Content'].str.strip().astype(bool)]

In [30]:
print('Number of records:', len(df))

Number of records: 25513


5. Create a subset from the original dataset

In [31]:
np.random.seed(15)
df = df.sample(n=4400, random_state=15)
print('Number of records:', len(df))

Number of records: 4400


In [32]:
df.to_json('data/feedback_subset.json', orient='records', indent=2, force_ascii=False)