# Preprocessing

In [None]:
import emoji
import os
import pandas as pd
import re

from collections import Counter

In [None]:
# Make sure the directory this notebook reads its CSVs from and writes them
# back to exists; exist_ok=True avoids an error when it is already there.
os.makedirs("data/processed/", exist_ok=True)

In [None]:
# Load both splits with default parsing options (train first, then test).
train, test = map(
    pd.read_csv,
    ("data/processed/train.csv", "data/processed/test.csv"),
)

In [None]:
# Rename 'id' to 'movie_id' so it is not confused with the per-review id
# that is added right below.
movie_id_rename = {'id': 'movie_id'}
train, test = (split.rename(columns=movie_id_rename) for split in (train, test))

# Give each review an id equal to its index label plus one, so rows can be
# matched with the original dataset.
for split in (train, test):
    split['review_id'] = split.index + 1

## HTML markups

In [None]:
def contains_html_tags(text):
    """Return True if ``str(text)`` contains at least one ``<...>`` span.

    A span is a ``<``, followed by one or more characters other than ``>``,
    followed by ``>``. The input is converted with ``str()`` first, so
    non-string values are accepted.
    """
    return re.search(r'<[^>]+>', str(text)) is not None

# Flag every review whose text contains at least one HTML-like tag.
train['has_html'] = train['text'].apply(contains_html_tags)
test['has_html'] = test['text'].apply(contains_html_tags)

# Number and percentage of flagged reviews in the train split.
train_html_count = train['has_html'].sum()
train_total_count = len(train)
train_percentage = (train_html_count / train_total_count) * 100

# Number and percentage of flagged reviews in the test split.
test_html_count = test['has_html'].sum()
test_total_count = len(test)
test_percentage = (test_html_count / test_total_count) * 100

print(f"Reviews containing HTML markup in train: {train_html_count} out of {train_total_count} ({train_percentage:.2f}%)")
# Report the size of the test split here (this line previously printed the
# train total, which contradicted the percentage shown next to it).
print(f"Reviews containing HTML markup in test: {test_html_count} out of {test_total_count} ({test_percentage:.2f}%)")

In [None]:
def extract_html_tags(text):
    """Return the names of the opening tags found in ``str(text)``.

    A name is the run of ASCII letters/digits (starting with a letter) that
    directly follows a ``<``. Closing tags such as ``</b>`` do not match
    because ``/`` is not a letter. Names are returned in order of appearance,
    with duplicates and original casing preserved.
    """
    return re.findall(r'<([a-zA-Z][a-zA-Z0-9]*)', str(text))

# Only build the report when at least one review in either split was flagged.
if train_html_count + test_html_count > 0:
    # Collect tag names from the flagged reviews, train split first.
    all_tags = []
    for split in (train, test):
        flagged_texts = split.loc[split['has_html'], 'text']
        for review in flagged_texts:
            all_tags += extract_html_tags(review)

    tag_counts = Counter(all_tags)

    print("\nMost common HTML tags:")
    for tag, count in tag_counts.most_common(10):
        print(f"{tag} ({count})")

In [None]:
def remove_html_markups(text):
    """Strip HTML-like markup from ``str(text)`` and return the result.

    The substitutions run in this order:
      1. ``<SPOILER>`` and ``</SPOILER>`` become the plain word ``SPOILER``.
      2. ``<br>``, ``<br/>`` and ``<br />`` become a newline.
      3. Every remaining ``<...>`` span is deleted.

    All patterns are case-sensitive.
    """
    # Order matters: the catch-all pattern at the end would otherwise delete
    # the SPOILER and <br> tags before they could be rewritten.
    substitutions = (
        (r'<SPOILER>', 'SPOILER'),
        (r'</SPOILER>', 'SPOILER'),
        (r'<br\s*/?>', '\n'),
        (r'<[^>]+>', ''),
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the review text of both splits in place.
for split in (train, test):
    split['text'] = split['text'].apply(remove_html_markups)

# The helper flag column is not needed beyond this point.
train, test = (split.drop(columns=['has_html']) for split in (train, test))

## Emojis

In [None]:
# Flag the reviews for which emoji.emoji_count reports at least one emoji,
# then derive the count and percentage of flagged reviews per split.
train['has_emoji'] = train['text'].apply(emoji.emoji_count) > 0
test['has_emoji'] = test['text'].apply(emoji.emoji_count) > 0

train_emoji_count = train['has_emoji'].sum()
test_emoji_count = test['has_emoji'].sum()

train_total = len(train)
test_total = len(test)

train_percentage = (train_emoji_count / train_total) * 100
test_percentage = (test_emoji_count / test_total) * 100

print(f"Train set: {train_emoji_count} out of {train_total} samples contain emojis ({train_percentage:.2f}%)")
print(f"Test set: {test_emoji_count} out of {test_total} samples contain emojis ({test_percentage:.2f}%)")

In [None]:
# Get most common emojis
def extract_emojis(text):
    """Return the characters of ``text`` that are keys of ``emoji.EMOJI_DATA``.

    The text is checked one character at a time, so only single characters
    that are themselves keys of ``EMOJI_DATA`` are kept. The matches are
    concatenated in their original order.
    """
    emoji_chars = [char for char in text if char in emoji.EMOJI_DATA]
    return ''.join(emoji_chars)

# Concatenate the emoji characters of every review, then count each character.
all_train_emojis = ''.join(extract_emojis(review) for review in train['text'])
all_test_emojis = ''.join(extract_emojis(review) for review in test['text'])

train_emoji_counts = Counter(all_train_emojis)
test_emoji_counts = Counter(all_test_emojis)

# Print the ten most frequent characters for each split.
for header, counts in (
    ("\nMost common emojis in train set:", train_emoji_counts),
    ("\nMost common emojis in test set:", test_emoji_counts),
):
    print(header)
    for em, count in counts.most_common(10):
        print(f"{em} ({count})")

In [None]:
def remove_emojis(text):
    """Return ``str(text)`` with every ``®`` and ``©`` character deleted.

    Despite the name, these two symbols are the only characters removed;
    everything else is left as is.
    """
    return re.sub(r'[®©]', '', str(text))

# Strip the symbols handled by remove_emojis from both splits in place.
for split in (train, test):
    split['text'] = split['text'].apply(remove_emojis)

# The helper flag column is not needed beyond this point.
train, test = (split.drop(columns=['has_emoji']) for split in (train, test))

## Special characters

In [None]:
def has_special_chars(text):
    """Return True if ``str(text)`` contains any non-ASCII character.

    "Special" here means any code point outside ``\\x00``-``\\x7F``. ASCII
    letters, digits and punctuation therefore never count.
    """
    non_ascii_run = re.search(r'[^\x00-\x7F]+', str(text))
    return non_ascii_run is not None

# Flag every review whose text contains at least one non-ASCII character.
train['has_special_chars'] = train['text'].apply(has_special_chars)
test['has_special_chars'] = test['text'].apply(has_special_chars)

# Train split: count and percentage of flagged reviews.
# The total is interpolated with {len(train)}; it was previously printed as
# the literal text "len(train)" because the braces were missing.
special_chars_count_train = train['has_special_chars'].sum()
percentage = (special_chars_count_train / len(train)) * 100
print(f"Texts containing special characters: {special_chars_count_train} out of {len(train)} ({percentage:.2f}%)")

# Test split: same report, with the same fix applied to {len(test)}.
special_chars_count_test = test['has_special_chars'].sum()
percentage = (special_chars_count_test / len(test)) * 100
print(f"Texts containing special characters: {special_chars_count_test} out of {len(test)} ({percentage:.2f}%)")

In [None]:
# Join the flagged reviews of each split into one long string.
flagged_train_texts = train.loc[train['has_special_chars'], 'text']
flagged_test_texts = test.loc[test['has_special_chars'], 'text']
special_chars_train = ' '.join(flagged_train_texts)
special_chars_test = ' '.join(flagged_test_texts)

# Pull out every individual non-ASCII character across both splits and count
# how often each one occurs.
combined_text = special_chars_train + special_chars_test
all_special_chars = re.findall(r'[^\x00-\x7F]', combined_text)
special_char_counts = Counter(all_special_chars)

print("\nMost frequent special characters:")
# Kept as the last expression so the notebook displays it as the cell output.
special_char_counts.most_common(10)

In [None]:
# Maps single stray characters to the text that should replace them.
# NOTE(review): the \x80-\x9f keys look like Windows-1252 bytes that were
# decoded as Latin-1 - confirm against the raw data.
replacements = {
    '\x96': '–',
    '\x97': '–',
    '\xad': '–',
    '\x85': '...',
    '\x91': '‘',
    # NOTE(review): this entry maps U+201C to itself, so it has no effect;
    # possibly another key was intended - confirm.
    '\u201c': '“',
    '\x93': '“',
    # NOTE(review): \xa0 is the no-break space; erasing it joins the words on
    # either side. Confirm a plain space was not intended.
    '\xa0': '',  # erase
    '\x8d': '',  # erase
    '\x9d': '',  # erase
    '\uf0b7': '',  # erase
    '\x81': '',  # erase
    '\x84': '”',
    '\x8e': 'Ž',
    '\x9e': 'ž',
    '\x9a': 'š',
    '\x95': '.',
    '\x80': '€',
    '\x99': '',  # TM
    '\x98': '~',
    '\x9c': 'œ',
    '\x9f': 'Ÿ',
    '\x82': ',',
}

# Built once so each text is rewritten in a single pass instead of one
# str.replace pass per entry. This is equivalent to the sequential loop
# because no replacement output is remapped by another entry (the only
# output that is also a key, U+201C, maps to itself).
_SPECIAL_CHAR_TABLE = str.maketrans(replacements)


def replace_special_chars(text):
    """Return ``str(text)`` with each key of ``replacements`` substituted.

    Args:
        text: The value to clean. It is converted with ``str()`` first, as
            the other cleaning helpers in this notebook do.

    Returns:
        The converted text with every character that is a key of
        ``replacements`` swapped for its mapped string. An empty mapped
        string deletes the character.
    """
    return str(text).translate(_SPECIAL_CHAR_TABLE)

# Apply the character replacements to both splits in place.
for split in (train, test):
    split['text'] = split['text'].apply(replace_special_chars)

# The helper flag column is not needed beyond this point.
train, test = (split.drop(columns=['has_special_chars']) for split in (train, test))

## Save

In [None]:
# Write both cleaned splits back to the processed-data directory, without the
# DataFrame index column.
for split, destination in (
    (train, "data/processed/train.csv"),
    (test, "data/processed/test.csv"),
):
    split.to_csv(destination, index=False)