In [55]:
# Generate requirements.txt for the current directory with pipreqs.
# NOTE(review): the log below reports that notebooks are not scanned, so packages
# imported only from this notebook are presumably missing from the file — verify.
!pipreqs .

INFO: Not scanning for jupyter notebooks.
INFO: Successfully saved requirements file in .\requirements.txt


In [1]:
import re
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd

In [2]:
def normalize_text(text):
    """Lower-case ``text`` and strip digits and punctuation from it.

    Digits are removed first, then every character that is neither a word
    character nor whitespace (underscores count as word characters and stay).
    Inner whitespace is not collapsed; only leading and trailing whitespace
    is trimmed.
    """
    cleaned = text.lower()
    for unwanted in (r'\d+', r'[^\w\s]'):  # digits first, then punctuation
        cleaned = re.sub(unwanted, '', cleaned)
    return cleaned.strip()

In [3]:
def clean_username(text):
    """Remove mentions and hashtags from ``text``.

    Every ``@`` or ``#`` that is followed by at least one non-whitespace
    character is deleted together with that run of characters. Surrounding
    whitespace is left as it is, so double spaces may remain.
    """
    handle_or_hashtag = re.compile(r'[@#]\S+')
    return handle_or_hashtag.sub('', text)

In [4]:
def clean_text(text):
    """Strip retweet/reply markers and every non-letter from ``text``.

    The substitutions run in order: the stand-alone upper-case word ``RT``,
    bracketed ``[RE ...]`` markers, every character that is not an ASCII
    letter or whitespace, and finally runs of whitespace (collapsed to one
    space). The result is trimmed at both ends.
    """
    substitutions = (
        (r'\bRT\b', ''),            # retweet marker (case-sensitive)
        (r'\[RE [^\]]+\]', ''),     # '[RE xxxx]' reply marker
        (r'[^a-zA-Z\s]', ''),       # anything that is not a letter or whitespace
        (r'\s+', ' '),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [5]:
def remove_repeated_chars(text, exception_words=('uu', 'pp')):
    """Collapse every run of a repeated character into a single character.

    Words that are exactly one of ``exception_words`` are returned untouched.
    All other words have each run of identical characters reduced to one
    (``"baguuus"`` -> ``"bagus"``), and runs of an identical non-word
    character other than a newline (spaces, ``!`` ...) are reduced as well.

    Note that legitimate double letters are collapsed too
    (``"saat"`` -> ``"sat"``).

    Args:
        text: The string to clean.
        exception_words: Whole words that must keep their repeated characters.

    Returns:
        The cleaned string.
    """
    protected = frozenset(exception_words)

    def _collapse_word(match):
        word = match.group(0)
        if word in protected:
            return word
        return re.sub(r'(.)\1+', r'\1', word)

    # The previous implementation interpolated the Python list itself into the
    # regex, which produced a character class instead of a word alternation and
    # therefore never protected the exception words. Handling the text word by
    # word makes the whole-word exception explicit.
    text = re.sub(r'\w+', _collapse_word, text)
    # Repeated non-word characters (newlines excluded, matching '.' semantics).
    return re.sub(r'([^\w\n])\1+', r'\1', text)


In [17]:
def text_to_formal(df, slang_to_formal, column):
    """Replace slang words with their formal form, row by row.

    Args:
        df: DataFrame holding the text to convert.
        slang_to_formal: Mapping from a slang word to its formal replacement.
        column: Name of the column in ``df`` that contains the text.

    Returns:
        A list with one string per row of ``df``. Each string is the row's
        text split on whitespace, with every word found in the mapping
        swapped for its formal form (other words kept), re-joined with
        single spaces.
    """
    def _formalize(sentence):
        # Unknown words fall back to themselves.
        return " ".join(slang_to_formal.get(token, token) for token in sentence.split())

    return [_formalize(sentence) for sentence in df[column]]

In [7]:
# Words that consist of a doubled syllable but are real words/abbreviations and
# must therefore NOT be split (duplicates of the original list removed).
_REPEATED_WORD_EXCEPTIONS = (
    'tata', 'sisi', 'uu', 'wowo', 'cucu', 'papa', 'mama', 'wkwk', 'susu',
    'bebe', 'kuku', 'nana', 'lele', 'dodo', 'pepe', 'lala', 'gugu', 'meme',
    'kaka', 'tutu', 'pipi', 'wawa', 'cici', 'hihi', 'tete', 'mimi', 'yaya',
    'popo', 'zaza', 'riri', 'fufu', 'bobo', 'sisis', 'pupa', 'kekek', 'lulu',
    'toto', 'nene', 'yuyu', 'bababa', 'cicic', 'didi', 'papapa', 'mamama',
    'tototo', 'kiki', 'wuwu', 'sissis', 'lololo',
    'hoho', 'lol', 'gaga', 'kekeke', 'dudu', 'fifi', 'zizi',
    'nini', 'mumu', 'babi', 'jiji', 'titi',
    'sissi', 'tati', 'dedede',
    'pupu', 'kukuku', 'pp',
)

# Built once instead of on every call: a whole word made of some chunk
# immediately followed by the same chunk, unless the word is an exception.
_REPEATED_WORD_PATTERN = re.compile(
    r'\b(?!(?:' + '|'.join(map(re.escape, _REPEATED_WORD_EXCEPTIONS)) + r')\b)(\w+)\1\b'
)


def separate_repeated_words(text):
    """Insert a space into words written as a doubled chunk.

    A whole word of the form ``XX`` (e.g. ``"jalanjalan"``) is rewritten as
    ``"X X"`` (``"jalan jalan"``). Words listed in
    ``_REPEATED_WORD_EXCEPTIONS`` are left untouched.

    Args:
        text: The string to process.

    Returns:
        The string with doubled words separated.
    """
    return _REPEATED_WORD_PATTERN.sub(r'\1 \1', text)

## LOAD DATA

In [25]:
# Read DatasetTrainAllVersion.csv into `dataset`; the bare name on the last
# line makes the notebook render the frame.
dataset = pd.read_csv("./dataset/processed dataset/DatasetTrainAllVersion.csv")

dataset

Unnamed: 0,text,label
0,kunjungan prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,rt anies dapat tepuk tangan meriah saat jadi r...,Politik
2,@ciqxqwgat04tmtx4ocatxjovq7vv/y8heyaiogmfg8y= ...,Demografi
3,rt @l3r8xfbw3wgbxrpsj0/0hhztbqvgx7qtfwrg9zmhk7...,Politik
4,anies baswedan harap asn termasuk tni dan polr...,Politik
...,...,...
14559,"Penggundulan hutan di Indonesia saat ini, menu...",Sumber Daya Alam
14560,"Menurut Mahfud MD, Calon Wakil Presiden nomor ...",Sumber Daya Alam
14561,"Menurut Calon Wakil Presiden nomor urut 3, Mah...",Sumber Daya Alam
14562,"Calon Wakil Presiden nomor urut 3, Mahfud MD, ...",Sumber Daya Alam


In [26]:
# Strip @mentions and #hashtags from the raw text.
dataset['clean_username'] = dataset['text'].apply(clean_username)

In [27]:
# Remove RT / [RE ...] markers and every non-letter character.
dataset['clean_text'] = dataset['clean_username'].apply(clean_text)

In [28]:
# Lower-case the cleaned text and trim it.
dataset['normalize_text'] = dataset['clean_text'].apply(normalize_text)

In [29]:
# Drop leftover link tokens. The earlier cleaning steps removed every non-letter,
# so a URL has been fused into one alphabetic token (e.g. "httpstcoabc").
# Matching on "http" instead of "https" also removes plain-http links, which the
# previous pattern left behind.
dataset['clean_http'] = dataset['normalize_text'].apply(lambda x: re.sub(r'\b\w*http\w*\b', '', x))

In [30]:
# Build the slang -> formal lookup from the colloquial lexicon CSV, then use it
# to normalise the URL-free text.
slang_list = pd.read_csv("./dataset/corpus dataset/colloquial-indonesian-lexicon.csv")
slang = slang_list['slang'].values.tolist()
formal = slang_list['formal'].values.tolist()
# When a slang word occurs more than once, the last pairing wins.
slang_to_formal = {informal: standard for informal, standard in zip(slang, formal)}

dataset['formal'] = text_to_formal(df=dataset, slang_to_formal=slang_to_formal, column='clean_http')

In [32]:
# Split doubled words such as "jalanjalan" into "jalan jalan".
dataset['preprocessed_formal'] = dataset['formal'].map(separate_repeated_words)

In [33]:
# Collapse runs of a repeated character in the word-separated text.
dataset['preprocessed_formal_character'] = dataset['preprocessed_formal'].map(remove_repeated_chars)

In [38]:
dataset.columns

Index(['text', 'label', 'clean_username', 'clean_text', 'normalize_text',
       'clean_http', 'formal', 'preprocessed_formal',
       'preprocessed_formal_character'],
      dtype='object')

In [39]:
# Drop, in place, every row that has a missing value in any column.
dataset.dropna(inplace=True)

In [50]:
# Export the fully cleaned text together with its label (no index column).
dataset.to_csv(
    "all_version_cleaning.csv",
    columns=['preprocessed_formal_character', 'label'],
    index=False,
)

# NON TRANSFORMER PREPROCESSING

In [154]:
from nlp_id.lemmatizer import Lemmatizer

# Run the nlp_id lemmatizer over the slang-normalised text, one row at a time.
lemmatizer = Lemmatizer()
lemmatized = [lemmatizer.lemmatize(sentence) for sentence in dataset['formal']]

dataset['lemmatized'] = lemmatized

In [155]:
from nlp_id.stopword import StopWord

# Remove stop words from the lemmatized text with nlp_id's StopWord helper.
stopword = StopWord()
stopword_removed = [stopword.remove_stopword(sentence) for sentence in dataset['lemmatized']]

dataset['stopword_removed'] = stopword_removed

In [156]:
from nlp_id.tokenizer import Tokenizer
# Tokenize the stop-word-free text; each row becomes whatever tokenize() returns.
tokenizer = Tokenizer()

tokens_c = [tokenizer.tokenize(sentence) for sentence in dataset['stopword_removed']]

dataset['tokens'] = tokens_c

In [157]:
# Join each row's tokens back into one space-separated string.
# Not idempotent: re-running this on already joined strings would put a space
# between every character.
dataset['tokens'] = dataset['tokens'].apply(' '.join)

## TRAINING DATA

In [158]:
dataset.head()

Unnamed: 0,text,label,clean_username,clean_text,normalize_text,clean_http,formal,preprocessed_formal,preprocessed_formal_character,lemmatized,stopword_removed,tokens
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam,Kunjungan Prabowo ini untuk meresmikan dan men...,Kunjungan Prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjung prabowo ini untuk resmi dan serah proy...,kunjung prabowo resmi serah proyek bantu air b...,kunjung prabowo resmi serah proyek bantu air b...
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik,RT Anies dapat tepuk tangan meriah saat jadi R...,Anies dapat tepuk tangan meriah saat jadi Rekt...,anies dapat tepuk tangan meriah saat jadi rekt...,anies dapat tepuk tangan meriah saat jadi rekt...,anies dapat tepuk tangan meriah saat jadi rekt...,anies dapat tepuk tangan meriah saat jadi rekt...,anies dapat tepuk tangan meriah sat jadi rekto...,anies dapat tepuk tangan riah saat jadi rektor...,anies tepuk tangan riah rektor wajib mata kuli...,anies tepuk tangan riah rektor wajib mata kuli...
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi,"emng bener sih, pendukung 01 ada yg goblok, b...",emng bener sih pendukung ada yg goblok begitu ...,emng bener sih pendukung ada yg goblok begitu ...,emng bener sih pendukung ada yg goblok begitu ...,memang benar sih pendukung ada yang goblok beg...,memang benar sih pendukung ada yang goblok beg...,memang benar sih pendukung ada yang goblok beg...,memang benar sih dukung ada yang goblok begitu...,dukung goblok dukung ridwan kamil skema kalo m...,dukung goblok dukung ridwan kamil skema kalo m...
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik,RT Sewaktu anies bersikap kritis ke kinerja p...,Sewaktu anies bersikap kritis ke kinerja pak p...,sewaktu anies bersikap kritis ke kinerja pak p...,sewaktu anies bersikap kritis ke kinerja pak p...,sewaktu anies bersikap kritis ke kinerja pak p...,sewaktu anies bersikap kritis ke kinerja pak p...,sewaktu anies bersikap kritis ke kinerja pak p...,waktu anies sikap kritis ke kerja pak prabowo ...,anies sikap kritis kerja prabowo anggap sopan ...,anies sikap kritis kerja prabowo anggap sopan ...
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Anies Baswedan Harap ASN termasuk TNI dan Polr...,anies baswedan harap asn termasuk tni dan polr...,anies baswedan harap asn termasuk tni dan polr...,anies baswedan harap aparatur sipil negara ter...,anies baswedan harap aparatur sipil negara ter...,anies baswedan harap aparatur sipil negara ter...,anies baswedan harap aparatur sipil negara mas...,anies baswedan harap aparatur sipil negara mas...,anies baswedan harap aparatur sipil negara mas...


In [159]:
# Export the original text column with its label (no index column).
dataset.to_csv(
    "./training data/text-berttweet.csv",
    columns=['text', 'label'],
    index=False,
)

In [160]:
# Export the slang-normalised text with its label to bert-model.csv.
# NOTE(review): the recorded run failed with PermissionError (see the output below);
# the target file was presumably locked or read-only — confirm and re-run this cell.
dataset[['formal','label']].to_csv("./training data/bert-model.csv",index=False)

PermissionError: [Errno 13] Permission denied: './training data/bert-model.csv'

In [None]:
# Export the joined token strings with their label (no index column).
dataset.to_csv(
    "./training data/tokens_data.csv",
    columns=['tokens', 'label'],
    index=False,
)

## INDOBERTWEET PREPROCESSING

In [26]:
# Read the semicolon-delimited unlabeled CSV into `twitter_bert` and display it.
twitter_bert = pd.read_csv("./dataset/raw dataset/dataset_unlabeled_penyisihan_bdc_2024.csv",delimiter=";")
twitter_bert

Unnamed: 0,IDText,text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...
...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan..."
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...


In [12]:
# twitter_bert.drop_duplicates(inplace=True)

In [27]:
twitter_bert

Unnamed: 0,IDText,text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...
...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan..."
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...


In [14]:
# Count how many rows carry each label.
# NOTE(review): the frame displayed above shows only IDText and text columns, so
# 'label' presumably does not exist after a fresh run and this would raise KeyError;
# the recorded counts look like they come from an earlier, labeled frame — verify.
twitter_bert['label'].value_counts()

label
Politik                    3094
Ideologi                    603
Pertahanan dan Keamanan     546
Sosial Budaya               515
Ekonomi                     479
Sumber Daya Alam            409
Demografi                   401
Geografi                    385
Name: count, dtype: int64

In [15]:
from emoji import demojize

def preprocess_tweet(tweet):
    """Replace emojis, mentions/hashtags and links in ``tweet`` with text tokens.

    Emojis are first passed through ``demojize``. Then every ``@...`` or
    ``#...`` run becomes ``@USER`` (hashtags are mapped to the same token as
    mentions), and every run starting with ``http`` or ``www`` becomes
    ``HTTPURL``.
    """
    placeholder_rules = (
        (r'[@#]\S+', '@USER'),             # mentions and hashtags
        (r'http\S+|www\S+', 'HTTPURL'),    # links
    )
    result = demojize(tweet)
    for pattern, placeholder in placeholder_rules:
        result = re.sub(pattern, placeholder, result)
    return result

In [16]:
def clean_non_alphabetic(text):
    """Strip retweet/reply markers and non-letters while keeping ``@USER``.

    Removes the stand-alone lower-case word ``rt`` and bracketed ``[re ...]``
    markers, then deletes every character that is not an ASCII letter or
    whitespace, except that the literal token ``@USER`` is preserved.
    Whitespace runs are collapsed to one space and the result is trimmed.
    """
    def _keep_user_token(match):
        # Group 1 is set only when the match is the literal '@USER'.
        return match.group(1) or ''

    without_rt = re.sub(r'\brt\b', '', text)
    without_reply = re.sub(r'\[re [^\]]+\]', '', without_rt)
    letters_only = re.sub(r'(@USER)|[^a-zA-Z\s]', _keep_user_token, without_reply)
    return re.sub(r'\s+', ' ', letters_only).strip()

In [28]:
# Lower-case the tweets. This overwrites the 'text' column, so the original
# casing is no longer available in `twitter_bert` afterwards.
twitter_bert['text'] = twitter_bert['text'].apply(str.lower)

In [29]:
# Turn emojis, mentions/hashtags and links into text placeholders.
twitter_bert['preprocessed_text'] = twitter_bert['text'].apply(preprocess_tweet)

In [30]:
# Remove rt / [re ...] markers and non-letters, keeping the @USER placeholder.
twitter_bert['preprocessed_text_two'] = twitter_bert['preprocessed_text'].apply(clean_non_alphabetic)

In [31]:
# Replace slang with formal words; relies on `slang_to_formal` built in an earlier cell.
twitter_bert['formal'] = text_to_formal(
    df=twitter_bert,
    slang_to_formal=slang_to_formal,
    column='preprocessed_text_two',
)

In [32]:
# Split doubled words in the formal text (overwrites the 'formal' column).
twitter_bert['formal'] = twitter_bert['formal'].apply(separate_repeated_words)

In [22]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible.
twitter_bert_dua =  twitter_bert.sample(frac=1, random_state=42)

In [35]:
twitter_bert

Unnamed: 0,IDText,text,preprocessed_text,preprocessed_text_two,formal
0,TXT0001,lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org prodemokrasi di negara ini bisa pun...,lu mau orang prodemokrasi di negara ini bisa p...
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo ganjar pranowo itulah beliau soso...,kiki_daliyo ganjar pranowo itulah beliau soso...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...
3,TXT0004,@kumparan prabowo gibran yang bisa melakukan i...,@USER prabowo gibran yang bisa melakukan itu s...,@USER prabowo gibran yang bisa melakukan itu s...,@USER prabowo gibran yang bisa melakukan itu s...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib lah ...,@USER @USER @USER lah justru yg gak nyambung j...,@USER @USER @USER lah justru yg gak nyambung j...,@USER @USER @USER lah justru yang tidak menyam...
...,...,...,...,...,...
995,TXT0996,"bikin bangga deh, ganjar-mahfud mau alokasikan...","bikin bangga deh, ganjar-mahfud mau alokasikan...",bikin bangga deh ganjarmahfud mau alokasikan s...,bikin bangga deh ganjar mahfud mau alokasikan ...
996,TXT0997,pak jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi sebelum pilpres berbesar hati meran...,pak jokowi sebelum pemilihan presiden berbesar...
997,TXT0998,@datuakrajoangek sbaiknya si gemot nga usah ik...,@USER sbaiknya si gemot nga usah ikutan debat ...,@USER sbaiknya si gemot nga usah ikutan debat ...,@USER sebaiknya sih gemot tidak usah ikutan de...
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...


In [36]:
# Write the IDText and formal columns to ./predict data/formal.csv.
# NOTE(review): to_csv returns None when it is given a path, so `formal_predict`
# ends up as None rather than a DataFrame.
formal_predict = twitter_bert[['IDText','formal']].to_csv("./predict data/formal.csv",index=False)

In [37]:
# Write the IDText and text columns to ./predict data/raw_text.csv.
# NOTE(review): the 'text' column was lower-cased in place by an earlier cell, so
# this file holds lower-cased rather than original text; `raw_text` itself is None
# because to_csv returns None when it is given a path.
raw_text = twitter_bert[['IDText','text']].to_csv("./predict data/raw_text.csv",index=False)

In [24]:
# Drop, in place, every shuffled row that has a missing value in any column.
twitter_bert_dua.dropna(inplace=True)

In [62]:
# Count how many shuffled rows carry each label.
# NOTE(review): `twitter_bert_dua` is sampled from the unlabeled frame, which shows
# no 'label' column in the displays above; the recorded counts presumably come from
# an earlier, labeled version of the data — verify before relying on this cell.
twitter_bert_dua['label'].value_counts()

label
Politik                    3094
Ideologi                    603
Pertahanan dan Keamanan     546
Sosial Budaya               515
Ekonomi                     479
Sumber Daya Alam            409
Demografi                   401
Geografi                    385
Name: count, dtype: int64

In [63]:
# Export the formal text with its label, skipping rows where either is missing.
# NOTE(review): requires a 'label' column, which the unlabeled source frame does
# not appear to have — confirm which frame this cell was meant to run on.
twitter_bert_dua[['formal','label']].dropna().to_csv("indotweet_train.csv",index=False)

## TRANSLATE

In [None]:
# from googletrans import Translator
# import json

# # Initialize the translator
# translator = Translator()

# # Function to translate text
# def translate_text(text, src='auto', dest='en'):
#     try:
#         translated = translator.translate(text, src=src, dest=dest)
#         return translated.text
#     except json.decoder.JSONDecodeError as json_err:
#         print(f"JSONDecodeError occurred: {json_err}")
#         return None
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None

# twitter_bert['english'] =  twitter_bert['formal'].apply(lambda x : translate_text(x, src='id', dest='en'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_bert['english'] =  twitter_bert['formal'].apply(lambda x : translate_text(x, src='id', dest='en'))


## PROCESS ENGLISH

In [None]:
# twitter_bert['english_processed'] = twitter_bert['english'].apply(lambda x :clean_username(x))
# twitter_bert['english_processed'] = twitter_bert['english_processed'].apply(lambda x :normalize_text(x))
# twitter_bert['english_processed'] = twitter_bert['english_processed'].apply(lambda x :re.sub(r'\b\w*htt\w*\b', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_bert['english_processed'] = twitter_bert['english'].apply(lambda x :clean_username(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_bert['english_processed'] = twitter_bert['english_processed'].apply(lambda x :normalize_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitt

In [None]:
# twitter_bert[['english_processed','label']].dropna().to_csv("english_process.csv",index=False)

## TRY AUGMENTED DATA

In [None]:
# Load the translated English text from english_process.csv and preview it.
# NOTE(review): the cell that writes this file is commented out above, so the file
# must already exist from an earlier run.
english_data = pd.read_csv("./english_process.csv")

english_data.head()

Unnamed: 0,english_processed,label
0,prabowos visit was to inaugurate and submit a ...,Sumber Daya Alam
1,anies can be applauded when becoming the chanc...,Politik
2,it is true that there are supporters who are s...,Demografi
3,when anies was critical of pak prabowos perfor...,Politik
4,anies baswedan hopes that the state civil appa...,Politik


In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
# Drop, in place, every row that has a missing value in any column.
english_data.dropna(inplace=True)

In [None]:
english_data

Unnamed: 0,english_processed,label
0,prabowos visit was to inaugurate and submit a ...,Sumber Daya Alam
1,anies can be applauded when becoming the chanc...,Politik
2,it is true that there are supporters who are s...,Demografi
3,when anies was critical of pak prabowos perfor...,Politik
4,anies baswedan hopes that the state civil appa...,Politik
...,...,...
4995,seeing the debate yesterday when prabowo kicep...,Politik
4996,the community believes that prabowo gibran has...,Politik
4997,imo both are irrational but one is much more i...,Ekonomi
4998,look at that mr ganjar you have been involved ...,Pertahanan dan Keamanan


In [None]:
# Contextual word-substitution augmenter based on bert-base-uncased.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="substitute")


def _augment_or_passthrough(sentence):
    """Augment one sentence; return non-string cells (e.g. NaN) unchanged.

    The recorded run failed with "object of type 'float' has no len()", which
    presumably came from missing values reaching ``aug.augment`` — those cells
    are now skipped instead of crashing the whole column.
    """
    if not isinstance(sentence, str):
        return sentence
    return aug.augment(sentence)


english_data['augmented_text'] = english_data['english_processed'].apply(_augment_or_passthrough)

TypeError: object of type 'float' has no len()

In [None]:
import nlpaug.augmenter.word as naw

# Back-translation augmenter; judging by the model names it presumably translates
# English -> German and back to English (verify against the nlpaug documentation).
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)

In [None]:
# Back-translate one sample sentence and print the result.
# NOTE(review): `text` is not defined in any visible cell; it must be assigned
# before this cell can run on a fresh kernel.
augmented_text = back_translation_aug.augment(text)
print(augmented_text)

['Companies can applaud when it comes to running anti-corruption courses as chancellor to break the chain of corruption']


In [None]:
# Rebind `aug` to a distilbert-base-uncased substitution augmenter and show one
# original/augmented pair.
# NOTE(review): `text` is not defined in any visible cell; it must be assigned
# before this cell can run on a fresh kernel.
aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
anies can be applauded when becoming the chancellor requires anti corruption courses to break the chain of corruption  
Augmented Text:
['anies can stay consulted when speaking the chief requires anti corruption courses should follow the chain of governance']


### PREPROCESSING AUGMENTED DATA

In [8]:
# Load augmented_data.csv into `data_augmented` and preview the first rows.
data_augmented = pd.read_csv("./training data/augmented_data.csv")

data_augmented.head()

Unnamed: 0,english_processed,label
0,prabowos visit was to inaugurate and submit a ...,Sumber Daya Alam
1,anies can be applauded when becoming the chanc...,Politik
2,it is true that there are supporters who are s...,Demografi
3,when anies was critical of pak prabowos perfor...,Politik
4,anies baswedan hopes that the state civil appa...,Politik


In [9]:
# Lower-case the English text and strip digits and punctuation.
data_augmented['english_processed_two'] = data_augmented['english_processed'].apply(normalize_text)

In [10]:
# Export the normalised English text with its label (no index column).
data_augmented.to_csv(
    "augmented_training.csv",
    columns=['english_processed_two', 'label'],
    index=False,
)