In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import string
import re
from IPython.display import display, HTML
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/satria-data-semifinal/cleaned_df.csv


In [2]:
def create_scrollable_table(df, title):
    def format_element(x):
        return str(x)

    formatted_df = df.applymap(format_element)
    df_html = formatted_df.to_html(classes='table table-striped', escape=False, index=False)
    html = f"""
    <div style='height:400px; overflow:auto;'>
        <h2>{title}</h2>
        {df_html}
    </div>
    """
    return html

In [3]:
df= pd.read_csv('/kaggle/input/satria-data-semifinal/cleaned_df.csv')

In [4]:
# Renaming Columns
df.rename(columns={'Unnamed: 0': 'index', 'frn_cnt': 'following', 'flw_cnt': 'followers'}, inplace=True)

In [5]:
df = df.fillna(0).astype({'following': 'int', 'followers': 'int','sts_cnt':'int','lst_cnt':'int'})

In [6]:
sample_df=df[['index','lang','content']].sample(300,random_state=1)

In [7]:
sample_df_html = create_scrollable_table(sample_df.head(),'Preview')
display(HTML(sample_df_html))

  formatted_df = df.applymap(format_element)


index,index.1,lang,content
729347,729347,id,Jawaban brillian. Pak Anies
9533051,9533051,id,"Ganjar Pranowo sama Mahfud MD itu kayak duo pahlawan kita, deh! Yuk, rame-rame pake hashtag #JNKBersamaGanjarMahfud buat tunjukin dukungan kita. #JNK"
7925926,7925926,id,"Pengen kritik, malem-malem pengen kritik si anies ah. bosen juga nyanjung-nyanjung, berikut kelemahan Anies secara objektif."
2141021,2141021,id,"Abah Anies adalah AbahNya Semua Golongan , Agama , Suku dan Seluruh Rakyat Indonesia !"
3189808,3189808,id,"mas,rakyat wadas itu jelas dibela sama Pak Ganjar,bantuan2 semua sudah turun,beliau datang langsung ke wadas untuk berani bertanggung jawab atas PROYEK NASIONAL. Beliau juga pernah membatalkan Pabrik Emas yang ada di wonogiri,ini ga pernah di UP dipublik"


#  Cleaning Phase 1
- Remove URL
- Remove RT
- Extracting Reply
- Extracting Mention
- Extracting Hashtag

In [8]:
# Function to preprocess text
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove 'RT'
    text = re.sub(r'\bRT\b', '', text)
    return text

In [9]:
sample_df['clean_content']=sample_df['content'].apply(preprocess_text)

In [10]:
# Function to extract reply tokens
def extract_reply_tokens(text):
    reply_tokens = re.findall(r'\[re \w+\]', text.lower())  # Extract reply tokens in lowercase
    return ', '.join(reply_tokens)

# Apply extraction
sample_df['reply_tokens'] = sample_df['clean_content'].apply(extract_reply_tokens)

# Remove reply tokens from clean_content
sample_df['clean_content'] = sample_df['clean_content'].apply(lambda x: re.sub(r'\[re \w+\]', '', x,flags=re.IGNORECASE))

In [11]:
# Count the frequency of each reply token
reply_token_counts = sample_df['reply_tokens'].value_counts().reset_index()
reply_token_counts.columns = ['reply_token', 'count']

# Display frequency analysis of reply tokens
debug_html_reply_tokens = create_scrollable_table(reply_token_counts.head(), 'Reply Token Frequency Analysis')
display(HTML(debug_html_reply_tokens))

  formatted_df = df.applymap(format_element)


reply_token,count
,300


In [12]:
# Function to extract mentions
def extract_mentions(text):
    mentions = re.findall(r'@\S+=', text)  # Extract mentions
    return ', '.join(mentions)

# Function to handle mentions by replacing them with positional tokens
def handle_mentions(text):
    words = text.split()
    tagged_words = []
    
    for i, word in enumerate(words):
        if re.match(r'@\S+=', word):
            if i == 0:
                tagged_words.append('')
            elif i == len(words) - 1:
                tagged_words.append('')
            else:
                tagged_words.append('')
        else:
            tagged_words.append(word)
    
    return ' '.join(tagged_words)

In [13]:
# Apply extraction
sample_df['mentions'] = sample_df['clean_content'].apply(extract_mentions)
sample_df['mentions'] = sample_df['mentions'].apply(lambda x: x.split(', ') if x else [])

#Tokenize mention
sample_df['clean_content'] = sample_df['clean_content'].apply(handle_mentions)

In [14]:
# Debug
#pd.set_option('display.max_colwidth', None)
#print(sample_df[['content','clean_content']])

In [15]:
def extract_hashtags(text):
    hashtags = re.findall(r'#\w+', text)
    cleaned_hashtags = [re.sub(r'[^a-zA-Z0-9]', '', hashtag[1:]) for hashtag in hashtags]  # Remove the '#' and clean special characters
    return cleaned_hashtags

In [16]:
# Extract Hashtags into columns first
sample_df['hashtags'] = sample_df['clean_content'].apply(extract_hashtags)

In [17]:
# DEBUG
# Set pandas display options to show all text
#pd.set_option('display.max_colwidth', None)

# TO CHECK THE SPECIAL TOKEN
# Check the resulting DataFrame to ensure mentions and hashtags are handled correctly
#print(sample_df[['content', 'clean_content', 'hashtags']].head(5))

# TO CHECK EXTRACTION, NOTES: IDK WHY IT'S DOESN'T SHOW THE SPECIAL TOKEN
#debug_html = create_scrollable_table(sample_df, 'Mention Frequency Analysis')
#display(HTML(debug_html))

In [18]:
# Check the resulting DataFrame to ensure mentions and hashtags are handled correctly
# print(sample_df['clean_content'].head(20))

# Cleaning Phase 2
- Special Character

In [19]:
# Do not remove hashtag, punctuation, hyphen
def handle_special_characters(text):
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:<>#-]', '', text)
    return text

In [20]:
sample_df['clean_content'] = sample_df['clean_content'].apply(handle_special_characters)

In [21]:
# Check the resulting DataFrame to ensure mentions and hashtags are handled correctly
#print(sample_df['clean_content'].head(20))

In [22]:
# Normalize excessive punctuation
def normalize_punctuation(text):
    # Replace multiple exclamation marks with a single one
    text = re.sub(r'!{2,}', '!', text)
    # Replace multiple question marks with a single one
    text = re.sub(r'\?{2,}', '?', text)
    # Replace multiple periods with a single one
    text = re.sub(r'\.{2,}', '.', text)
    return text

In [23]:
sample_df['clean_content'] = sample_df['clean_content'].apply(normalize_punctuation)

In [24]:
# Check the resulting DataFrame to ensure mentions and hashtags are handled correctly
#print(sample_df['clean_content'].head(3))

#debug_html= create_scrollable_table(sample_df, 'Debug Purpose')
#display(HTML(debug_html))

In [25]:
#mentions_exploded = sample_df.explode('mentions')
#mention_counts = mentions_exploded['mentions'].value_counts().reset_index()
#mention_counts.columns = ['mention', 'count']

In [26]:
#debug_html_mentions = create_scrollable_table(mention_counts, 'Mention Frequency Analysis')
#display(HTML(debug_html_mentions))

# Now let's clean the main data!

In [27]:
import time
# Time tracking decorator
def time_tracker(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        row_count = len(args[0])
        rows_per_sec = row_count / elapsed_time if elapsed_time > 0 else float('inf')
        print(f"{func.__name__} took {elapsed_time:.4f} seconds, {rows_per_sec:.2f} rows/sec")
        return result
    return wrapper

In [28]:
from functools import lru_cache #Speed go brrrrrr Memory go "why do i exist just to suffer"
# Preprocess text
@lru_cache(maxsize=500000)
def preprocess_text_cached(text):
    return preprocess_text(text)

@lru_cache(maxsize=500000)
def extract_reply_tokens_cached(text):
    return extract_reply_tokens(text)

@lru_cache(maxsize=500)
def handle_mentions_cached(text):
    return handle_mentions(text)

@lru_cache(maxsize=500)
def extract_mentions_cached(text):
    return extract_mentions(text)

@lru_cache(maxsize=500000)
def extract_hashtags_cached(text):
    return extract_hashtags(text)

@lru_cache(maxsize=500000)
def handle_special_characters_cached(text):
    return handle_special_characters(text)

@lru_cache(maxsize=500000)
def normalize_punctuation_cached(text):
    return normalize_punctuation(text)

@time_tracker
def clean_text_pipeline(df):
    # Preprocess text
    df['content'] = df['content'].apply(preprocess_text_cached)
    
    # Extract and handle reply tokens
    df['reply_tokens'] = df['content'].apply(extract_reply_tokens_cached)
    df['content'] = df['content'].str.replace(r'\[re \w+\]', '', regex=True, case=False)
    
    # Extract and handle mentions
    df['mentions'] = df['content'].apply(extract_mentions_cached)
    df['content'] = df['content'].apply(handle_mentions_cached)
    
    # Extract hashtags
    df['hashtags'] = df['content'].apply(extract_hashtags_cached)
    
    # Handle special characters and normalize punctuation
    df['content'] = df['content'].apply(handle_special_characters_cached)
    df['content'] = df['content'].apply(normalize_punctuation_cached)
    return df


In [29]:
test = df.copy()
testing = test
cleaned_df = clean_text_pipeline(testing)
cleaned_df.to_csv("cleaned_df.csv",index=True)

clean_text_pipeline took 334.5838 seconds, 29341.99 rows/sec
