# Import and setup environment

In [1]:
import numpy as np
import pandas as pd

# Load data

In [2]:
# Directory layout used by this notebook, relative to the working directory:
# external lookup tables (emoticon and slang tables), the raw train/test CSVs,
# and the destination for the preprocessed CSVs.
EXTERNAL_DATA_PATH = './data/external'
RAW_DATA_PATH = './data/raw'
PROCESSED_DATA_PATH = './data/processed'

In [3]:
# Read the raw train/test splits. The previews below show an 'Unnamed: 0'
# column, i.e. the CSVs carry a saved row index that is loaded as a regular column.
# NOTE(review): consider index_col=0 if that column is not needed downstream — confirm.
train_data = pd.read_csv(f'{RAW_DATA_PATH}/train.csv')
test_data = pd.read_csv(f'{RAW_DATA_PATH}/test.csv')

In [4]:
# Preview the first rows of the training split (text, source and the four label columns).
train_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e...,kaskus,0,0,0,1
1,"@verosvante kita2 aja nitizen yang pada kepo,t...",instagram,0,0,0,0
2,"""#SidangAhok smg sipenista agama n ateknya mat...",twitter,0,1,1,1
3,@bolususulembang.jkt barusan baca undang2 ini....,instagram,0,0,0,0
4,bikin anak mulu lu nof \nkaga mikir apa kasian...,kaskus,0,0,0,0


In [5]:
# Preview the first rows of the test split; it has the same columns as the training split.
test_data.head()

Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik
0,"1.BUKAN CM SPANDUK PROF,VIDEO2 ORASI MEREKA, B...",twitter,0,0,1,0
1,@memeqbeceq gy sange'gatel yh tetek'memekY drn...,twitter,1,0,0,0
2,Pertama kali denger lagunya enk bgt in dan png...,instagram,0,0,0,0
3,"astajim, ini pasti yg kasih penghargaan ke ibu...",kaskus,0,0,0,0
4,beda kalo disini kalo komplain lgs di bully am...,kaskus,0,0,0,0


# Preprocess data

### Translate emoticons into words

In [9]:
import re

# Build the emoticon -> replacement-text lookup from a tab-separated file
# without a header row: column 0 is the emoticon, column 1 its translation.
# If an emoticon appears more than once, the last row wins (dict semantics).
EMOTICON_DATA_PATH = f'{EXTERNAL_DATA_PATH}/emoticon.txt'
emoticon_df = pd.read_csv(EMOTICON_DATA_PATH, sep='\t', header=None)
emoticon_dict = dict(zip(emoticon_df[0], emoticon_df[1]))

def translate_emoticon(t):
    """Replace every known emoticon in ``t`` with its textual translation.

    Emoticons are looked up in the module-level ``emoticon_dict`` and are
    substituted one after another in the dictionary's iteration order, so an
    entry handled earlier can consume part of a longer emoticon handled later.

    Args:
        t: Text to translate.

    Returns:
        The text with each occurrence of a known emoticon replaced.
    """
    for emoticon, translation in emoticon_dict.items():
        # Literal substring replacement: the emoticon is matched verbatim and
        # the translation is inserted verbatim. The previous regex-based
        # substitution interpreted backslashes in the translation as escapes or
        # group references, and compiled a pattern per entry on every call.
        t = t.replace(emoticon, translation)
    return t

In [10]:
# Demo: the trailing ':)' is replaced by its translation from the emoticon table.
sample_text = 'senang sekali berada disini :)'

print(f'Before : {sample_text}')
print(f'After : {translate_emoticon(sample_text)}')

Before : senang sekali berada disini :)
After : senang sekali berada disini Senyum


### Replace newlines with spaces

In [11]:
def remove_newline(text):
    """Return ``text`` with every newline character replaced by a single space.

    Only ``\\n`` is replaced; other characters (including ``\\r``) are kept.
    """
    newline_pattern = re.compile('\n')
    return newline_pattern.sub(' ', text)

In [12]:
# Demo: each '\n' in the sample becomes a single space.
sample_text = 'halo saya\nadalah\nmahasiswa'

print(f'Before : {sample_text}')
print(f'After : {remove_newline(sample_text)}')

Before : halo saya
adalah
mahasiswa
After : halo saya adalah mahasiswa


### Remove kaskus formatting

In [13]:
def remove_kaskus_formatting(text):
    """Strip Kaskus BBCode-style tags and double quotes from ``text``.

    Steps:
      1. Pad every ``[`` / ``]`` with a space so tags are separated from the
         surrounding words.
      2. Replace whole ``[quote...]...[/quote]`` blocks with a space. The match
         is case-sensitive (lowercase ``quote`` only) and ``.`` does not cross
         newlines, so callers should lowercase / flatten the text first if they
         want quoted blocks removed.
      3. Replace any remaining space-free ``[...]`` tag with a space.
      4. Replace double quotes with a space.

    Args:
        text: Text to clean.

    Returns:
        The text without tags; runs of spaces are left for a later step.
    """
    # Raw strings: '\[' / '\]' / '\/' in a normal string literal are invalid
    # escape sequences (a SyntaxWarning on recent Python versions).
    text = re.sub(r'\[', ' [', text)
    text = re.sub(r'\]', '] ', text)
    text = re.sub(r'\[quote[^ ]*\].*?\[\/quote\]', ' ', text)
    text = re.sub(r'\[[^ ]*\]', ' ', text)
    text = re.sub(r'"', ' ', text)
    return text

In [15]:
# Demo: the uppercase [QUOTE=...] tag is not matched by the lowercase quote
# pattern, so only the tags themselves are stripped and the quoted text is kept.
sample_text = '[QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo semuanya[/QUOTE]'

print(f'Before : {sample_text}')
print(f'After  : {remove_kaskus_formatting(sample_text)}')

Before : [QUOTE=jessepinkman16;5a50ac34d89b093f368b456e]yoiii cuy halo semuanya[/QUOTE]
After  :    yoiii cuy halo semuanya   


### Remove url

In [16]:
def remove_url(text):
    """Delete URLs from ``text``.

    A URL is any run of non-whitespace characters starting with ``www.``,
    ``http://`` or ``https://``. The third alternative (``http?://``) also
    matches the truncated scheme ``htt://``. Matches are removed without a
    replacement, so surrounding spaces are left untouched.

    Args:
        text: Text to clean.

    Returns:
        The text with all matched URLs removed.
    """
    # Raw string: '\.' and '\s' in a normal string literal are invalid escape
    # sequences (a SyntaxWarning on recent Python versions).
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', text)

In [34]:
# Demo: the trailing https:// link is removed; the space before it remains.
sample_text = 'kemaren gua liat infonya di sini https://www.instagram.com/uno_136/'

print(f'Before : {sample_text}')
print(f'After  : {remove_url(sample_text)}')

Before : kemaren gua liat infonya di sini https://www.instagram.com/uno_136/
After  : kemaren gua liat infonya di sini 


### Remove excessive whitespace

In [18]:
def remove_excessive_whitespace(text):
    """Collapse every run of two or more spaces in ``text`` into one space.

    Only the space character is considered; tabs and newlines are untouched.
    """
    space_run = re.compile('  +')
    collapsed = space_run.sub(' ', text)
    return collapsed

In [35]:
# Demo: runs of several spaces between words are reduced to a single space.
sample_text = 'budi      pergi ke           pasar'

print(f'Before : {sample_text}')
print(f'After  : {remove_excessive_whitespace(sample_text)}')

Before : budi      pergi ke           pasar
After  : budi pergi ke pasar


### Tokenize text and drop punctuation

In [21]:
from nltk.tokenize import WordPunctTokenizer

def tokenize_text(text, punct=False):
    """Tokenize ``text`` and join the tokens back with single spaces.

    Args:
        text: Text to tokenize with NLTK's ``WordPunctTokenizer``.
        punct: When truthy, keep every token. When falsy (the default), keep
            only tokens for which ``str.isalnum()`` is true, which drops
            punctuation tokens.

    Returns:
        The kept tokens joined by a single space, stripped of outer whitespace.
    """
    tokens = WordPunctTokenizer().tokenize(text)
    if punct:
        kept_tokens = list(tokens)
    else:
        kept_tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(kept_tokens).strip()

In [36]:
# Demo: with the default punct=False the commas and full stops are dropped.
sample_text = 'kemarin, aku pergi ke jakarta. Lalu ketemu si Dimas.'

print(f'Before : {sample_text}')
print(f'After  : {tokenize_text(sample_text)}')

Before : kemarin, aku pergi ke jakarta. Lalu ketemu si Dimas.
After  : kemarin aku pergi ke jakarta Lalu ketemu si Dimas


### Transform slang words

In [23]:
# Build the slang -> standard-form lookup from the CSV's 'original' and
# 'translated' columns. If an 'original' value appears more than once, the
# last row wins (dict semantics).
slang_words = pd.read_csv(f'{EXTERNAL_DATA_PATH}/slangword.csv')
slang_dict = dict(zip(slang_words['original'], slang_words['translated']))

def transform_slang_words(text):
    """Replace slang in ``text`` using the module-level ``slang_dict``.

    The text is split on whitespace and scanned left to right. At each position
    the two-word phrase starting there is tried first; if it is a key of
    ``slang_dict`` both words are replaced by its translation. Otherwise the
    single word is translated when known and kept as-is when not.

    Args:
        text: Whitespace-separated text.

    Returns:
        The translated words joined by single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    result = []
    pos = 0
    while pos < total:
        # A two-word phrase only exists when there is a following token.
        pair = ' '.join(tokens[pos:pos + 2]) if pos + 1 < total else None
        if pair is not None and pair in slang_dict:
            result.append(slang_dict[pair])
            pos += 2
        else:
            word = tokens[pos]
            result.append(slang_dict.get(word, word))
            pos += 1
    return ' '.join(result)

In [37]:
# Demo: '7an' is expanded via the slang table (see the printed output below).
sample_text = 'siap mas sebentar lagi saya sampai 7an'

print(f'Before : {sample_text}')
print(f'After  : {transform_slang_words(sample_text)}')

Before : siap mas sebentar lagi saya sampai 7an
After  : siap mas sebentar lagi saya sampai tujuan


### Remove non-alphabetic characters

In [31]:
def remove_non_alphabet(text):
    """Delete every character that is not an ASCII letter or a space.

    Characters are removed without a replacement, so the text on either side
    of a removed run is joined together (e.g. ``'tu123'`` becomes ``'tu'``).
    """
    disallowed = re.compile('[^a-zA-Z ]+')
    return disallowed.sub('', text)

In [38]:
# Demo: the digits in 'tu123' are removed, leaving 'tu'.
sample_text = 'kemaren tu123 ada kelinci di kebun'

print(f'Before : {sample_text}')
print(f'After  : {remove_non_alphabet(sample_text)}')

Before : kemaren tu123 ada kelinci di kebun
After  : kemaren tu ada kelinci di kebun


### Remove twitter & instagram formatting

In [39]:
def remove_twitter_ig_formatting(text):
    """Remove @-mentions and the lowercase retweet marker ``rt`` from ``text``.

    A mention is ``@`` followed by letters, digits or underscores, optionally
    continued by ``.``-separated segments of the same characters. Handles such
    as ``@uno_136`` or ``@bolususulembang.jkt`` are therefore removed in full
    rather than leaving ``_136`` / ``.jkt`` behind. A dot that is not followed
    by a handle character (e.g. a sentence-ending full stop) is not consumed.

    The ``rt`` marker is matched case-sensitively as a whole word, so the
    caller is expected to lowercase the text first if ``RT`` should go too.

    Args:
        text: Text to clean.

    Returns:
        The text with mentions and ``rt`` removed; surrounding spaces are kept.
    """
    text = re.sub(r'@[A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*', '', text)
    text = re.sub(r'\brt\b', '', text)
    return text

In [40]:
# Demo: the leading @-mention is removed; the space that followed it remains.
sample_text = '@uno136 menurut saya hal tersebut masih kurang baik dilakukan sih kak'

print(f'Before : {sample_text}')
print(f'After  : {remove_twitter_ig_formatting(sample_text)}')

Before : @uno136 menurut saya hal tersebut masih kurang baik dilakukan sih kak
After  :  menurut saya hal tersebut masih kurang baik dilakukan sih kak


### Remove repeating characters

In [41]:
import itertools

def remove_repeating_characters(text):
    """Collapse every run of identical consecutive characters to one character.

    This applies to all characters, so legitimately doubled letters are
    collapsed as well (e.g. ``'apaan'`` becomes ``'apan'``).
    """
    collapsed = []
    for char, _run in itertools.groupby(text):
        # groupby yields one (char, run) pair per maximal run of equal chars.
        collapsed.append(char)
    return ''.join(collapsed)

In [42]:
# Demo: the long run of 'y' collapses to one; note that the doubled 'a' in
# 'apaan' is collapsed too (see the printed output below).
sample_text = 'heyyyyyyyyyyyyyyyyyyyy tadi ada apaan?'

print(f'Before : {sample_text}')
print(f'After  : {remove_repeating_characters(sample_text)}')

Before : heyyyyyyyyyyyyyyyyyyyy tadi ada apaan?
After  : hey tadi ada apan?


### Final preprocessing

In [43]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw post/comment.

    Steps, in order: lowercase, replace newlines, strip URLs, strip
    Twitter/Instagram markers, strip Kaskus tags, translate emoticons,
    lowercase again (emoticon translations can be capitalised), tokenize and
    drop punctuation, translate slang, collapse repeated characters, translate
    slang again on the collapsed words, drop non-alphabetic characters and
    collapse repeated spaces.

    Args:
        text: Raw text string.

    Returns:
        The cleaned text, lowercased and stripped of outer whitespace.
    """
    transformed_text = text.lower()
    # Continue from the lowercased text. This step used to restart from the raw
    # `text`, silently discarding the lowercasing above, so the case-sensitive
    # patterns downstream (`rt`, `[quote...]`) never saw lowercased input.
    transformed_text = remove_newline(transformed_text)
    transformed_text = remove_url(transformed_text)
    transformed_text = remove_twitter_ig_formatting(transformed_text)
    transformed_text = remove_kaskus_formatting(transformed_text)
    # NOTE(review): emoticons are now looked up in lowercased text; entries of
    # the emoticon table that contain uppercase letters will not match — confirm
    # against the emoticon file.
    transformed_text = translate_emoticon(transformed_text)
    transformed_text = transformed_text.lower()
    transformed_text = tokenize_text(transformed_text)
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_repeating_characters(transformed_text)
    # Second slang pass: collapsing repeated characters can produce new words
    # that are themselves keys of the slang table.
    transformed_text = transform_slang_words(transformed_text)
    transformed_text = remove_non_alphabet(transformed_text)
    transformed_text = remove_excessive_whitespace(transformed_text)
    transformed_text = transformed_text.lower().strip()
    return transformed_text

# Save preprocessed data

In [44]:
# Apply the full pipeline row by row and store the result in a new
# 'processed_text' column; 'original_text' is left unchanged.
train_data['processed_text'] = train_data['original_text'].apply(preprocess_text)
test_data['processed_text'] = test_data['original_text'].apply(preprocess_text)

In [45]:
# Persist both splits, including the new 'processed_text' column, without
# writing the DataFrame index.
train_data.to_csv(f'{PROCESSED_DATA_PATH}/processed_train.csv', index=False)
test_data.to_csv(f'{PROCESSED_DATA_PATH}/processed_test.csv', index=False)