# PREPROCESSING

### 1. Load All Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.classify import SklearnClassifier
from nltk.stem import LancasterStemmer, WordNetLemmatizer

import matplotlib.pyplot as plt

nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Load Dataset

In [5]:
data = pd.read_csv("./dataset/raw dataset/dataset_penyisihan_bdc_2024.csv",sep=';')

In [6]:
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


### 3. Data Assesment

In [7]:
data.info()
data.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


text     5000
label    5000
dtype: int64

In [8]:
data.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Demografi,62
Ekonomi,367
Geografi,20
Ideologi,400
Pertahanan dan Keamanan,400
Politik,2972
Sosial Budaya,587
Sumber Daya Alam,192


### 4. Clean Data (Drop Duplicate)

In [101]:
# data_clean = data.drop_duplicates()
data_clean = data.copy()

In [102]:
data_clean.head()

Unnamed: 0,IDText,Text
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...


### 5. Split Camel Case and Cleaning from Twitter Format 

In [103]:
def preprocess_tweet(data):
    text = data['preprocessed']
    text = p.clean(text)
    return text

def split_camel_case(text):
    return [re.sub(r'([a-z])([A-Z])', r'\1 \2', i) for i in text]

### 6. Get Hastag

In [104]:
data['hastag'] = data['Text'].apply(lambda x: re.findall(f"#(\w+)",x))
data_clean['preprocessed_hastag'] = data['hastag'].apply(split_camel_case)
data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x: str(x[0]) if len(x) > 0 else np.nan)
data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x : str(x).lower())
data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x : x.split())

In [105]:
data_clean

Unnamed: 0,IDText,Text,preprocessed_hastag
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,[nan]
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,[nan]
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,"[ganjar, merawat, pancasila]"
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,[nan]
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,[nan]
...,...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan...","[ganjar, mahfud2024]"
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...,[nan]
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...,[nan]
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...,"[rembu, ganjar]"


### 7. Get Username

In [106]:
data_clean['username'] = data['Text'].apply(lambda x : re.findall(f"\[RE ([^\]]+)\]",x))
data_clean['username'] = data_clean['username'].apply(lambda x: str(x[0]) if len(x) > 0 else np.nan)

### 7. Clean @ or (USERNAME) and # or (HASTAG) in Text

In [107]:
def clean_username(text):
    return re.sub(r'[@#]\S+', '', text)

data_clean['preprocessed'] = data_clean['Text'].apply(lambda x : clean_username(x))

### 8. Combine Hastag and Text

In [108]:
data_clean['preprocessed'] = data_clean['preprocessed'].apply(lambda x : x.split())
data_clean['preprocessed'] = data_clean['preprocessed'] + data_clean['preprocessed_hastag']
data_clean['preprocessed'] = data_clean['preprocessed'].apply(lambda x : ' '.join(x))

In [109]:
# data_clean[['formal','label']].to_csv("BERT_Training.csv",index=False)

### 9. Lower, Delete Punctuation, and Delete Double Space

In [110]:
import string

data_clean['lower'] = data_clean['preprocessed'].str.lower().str.replace("[^\w\s]","").str.replace("\s\s+","")

for index,row in data_clean.iterrows():
    row['lower'] = row['lower'].translate(str.maketrans('','',string.punctuation))
    data_clean['lower'][index] = row['lower']

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data_clean['lower'][index] = row['lower']
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame 

### 10. Clean HTTPS and Number

In [111]:
data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\d+', '', x))
data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\b\w*https\w*\b', '', x))

### 10. CHANGE NON FORMAL TO FORMAL

In [112]:
slang_list = pd.read_csv("./dataset/corpus dataset/colloquial-indonesian-lexicon.csv")
slang = slang_list['slang'].values.tolist()
formal = slang_list['formal'].values.tolist()
slangToformal = dict(zip(slang,formal))

formal_tokens = []
for index,row in data_clean.iterrows():
    temp = []
    for word in row['clean'].split():
        temp.append(slangToformal.get(word,word))
    res = " ".join(temp)
    formal_tokens.append(res)
data_clean["formal"] = formal_tokens

### 11. Cleaning Word that Start With Re (Retweet)

In [113]:
data_clean['formal'] = data_clean['formal'].str.replace(r're\s+\S+', '',regex=True)

## 13. Lematisasi

In [114]:
from nlp_id.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer()
lemmatized=[]
for index, row in data_clean.iterrows():
    lemmatized.append(lemmatizer.lemmatize(row['formal']))

data_clean['lemmatized']=lemmatized

### 13. Stopword Removal

In [115]:
from nlp_id.stopword import StopWord

stopword = StopWord()
stopword_removed=[]

for index, row in data_clean.iterrows():
    stopword_removed.append(stopword.remove_stopword(row['lemmatized']))

data_clean['stopword_removed']=stopword_removed

### 14. Tokenisasi

In [116]:
from nlp_id.tokenizer import Tokenizer
tokenizer = Tokenizer()

tokens_c=[]

for index, row in data_clean.iterrows():
    tokens = tokenizer.tokenize(row['stopword_removed'])
    tokens_c.append(tokens)

data_clean['tokens']=tokens_c

### 15. Second Stopword Removal

In [117]:
#augment the stopwords with nonstandard twitter words
stopwords_set = set(stopwords.words("indonesian"))
stopwords_aug = {"ya","yak","iya","yg","ga","gak","gk","udh","sdh","udah","dah","nih","ini","deh","sih","dong","donk",
                 "sm","knp","utk","yaa","tdk","gini","gitu","bgt","gt","nya","kalo","cb","jg","jgn","gw","ge",
                 "sy","min","mas","mba","mbak","pak","kak","trus","trs","bs","bisa","aja","saja","no",
                 "w","g","gua","gue","emang","emg","wkwk","dr","kau","dg","gimana","apapun","apa",
                 "klo","yah","banget","pake","terus","krn","jadi","jd","mu","ku","si","hehe",
                 "tp","pa","lu","lo","lw","tw","tau","karna","kayak","ky","lg","untuk","tuk","dg","dgn"}
stopwords_all = stopwords_set.union(stopwords_aug)

data_clean['stopword_removed2'] = data_clean['tokens'].apply(lambda x: [item for item in x if item not in stopwords_all])

### 16. Remove Digit in List

In [118]:
data_clean['stopword_removed2'] = data_clean['stopword_removed2'].apply(lambda x : [item for item in x if not item.isdigit()])

### 17. Translate to Indonesia

In [119]:
from deep_translator import GoogleTranslator

def translate_to_indonesian(text):
    translated_text = GoogleTranslator(source='en', target='id').translate(text)
    return translated_text

indices_to_translate = [40,415,1646,2973,3767,3721,4736,392,39,772,1121,1269,1787,306,2271,2344,17]

def conditional_translation(row, index):
    if index in indices_to_translate:
        return translate_to_indonesian(' '.join(row))
    else:
        return ' '.join(row)

In [120]:
data_clean['stopword_removed2'] = data_clean.apply(
    lambda row: conditional_translation(row['stopword_removed2'], row.name),
    axis=1)

In [121]:
data_clean.head()

Unnamed: 0,IDText,Text,preprocessed_hastag,username,preprocessed,lower,clean,formal,lemmatized,stopword_removed,tokens,stopword_removed2
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,[nan],debordsbitch,Lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau orang-orang prodemokrasi di negara ini ...,lu mau orang prodemokrasi di negara ini bisa p...,lu orang prodemokrasi negara ngelawan oligarki...,"[lu, orang, prodemokrasi, negara, ngelawan, ol...",orang prodemokrasi negara ngelawan oligarki in...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,[nan],mazzini_gsp,Prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo tanya soal hutang luar negeri dia jawa...,prabowo hutang negeri hutang negeri indonesia ...,"[prabowo, hutang, negeri, hutang, negeri, indo...",prabowo hutang negeri hutang negeri indonesia ...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,"[ganjar, merawat, pancasila]",,kiki_daliyo Ganjar Pranowo itulah beliau sosok...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itu beliau sosok yan...,kikidaliyo ganjar pranowo sosok mengagunhkan n...,"[kikidaliyo, ganjar, pranowo, sosok, mengagunh...",kikidaliyo ganjar pranowo sosok mengagunhkan n...
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,[nan],,Prabowo Gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa laku itu semua demi s...,prabowo gibran laku sejahtera rakyat nan,"[prabowo, gibran, laku, sejahtera, rakyat, nan]",prabowo gibran laku sejahtera rakyat nan
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,[nan],,Lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang tidak menyambung junjungan lu ...,lah justru yang tidak sambung junjung lu aomkm...,sambung junjung lu aomkmkmkmk omong etika anie...,"[sambung, junjung, lu, aomkmkmkmk, omong, etik...",sambung junjung aomkmkmkmk omong etika anies o...


### 18. Filter in KBBI Corpus

In [125]:
kbbi_corpus = pd.read_csv("./dataset/corpus dataset/kbbi.csv")
kbbi_corpus.drop_duplicates(inplace=True)
list_kbbi = kbbi_corpus['kata'].to_list()

def hapus_kata_non_sastrawi(kata_list):
    kata_list = kata_list.split()
    return [kata for kata in kata_list if kata in list_kbbi]

In [126]:
# Terapkan fungsi ke kolom stopword_removed2
data_clean['KBBI'] = data_clean['stopword_removed2'].apply(hapus_kata_non_sastrawi)

In [127]:
data_clean

Unnamed: 0,IDText,Text,preprocessed_hastag,username,preprocessed,lower,clean,formal,lemmatized,stopword_removed,tokens,stopword_removed2,KBBI
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,[nan],debordsbitch,Lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau orang-orang prodemokrasi di negara ini ...,lu mau orang prodemokrasi di negara ini bisa p...,lu orang prodemokrasi negara ngelawan oligarki...,"[lu, orang, prodemokrasi, negara, ngelawan, ol...",orang prodemokrasi negara ngelawan oligarki in...,"[orang, prodemokrasi, negara, oligarki, indone..."
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,[nan],mazzini_gsp,Prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo tanya soal hutang luar negeri dia jawa...,prabowo hutang negeri hutang negeri indonesia ...,"[prabowo, hutang, negeri, hutang, negeri, indo...",prabowo hutang negeri hutang negeri indonesia ...,"[prabowo, hutang, negeri, hutang, negeri, indo..."
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,"[ganjar, merawat, pancasila]",,kiki_daliyo Ganjar Pranowo itulah beliau sosok...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itu beliau sosok yan...,kikidaliyo ganjar pranowo sosok mengagunhkan n...,"[kikidaliyo, ganjar, pranowo, sosok, mengagunh...",kikidaliyo ganjar pranowo sosok mengagunhkan n...,"[ganjar, sosok, nilai, nilai, pancasila, sadar..."
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,[nan],,Prabowo Gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa laku itu semua demi s...,prabowo gibran laku sejahtera rakyat nan,"[prabowo, gibran, laku, sejahtera, rakyat, nan]",prabowo gibran laku sejahtera rakyat nan,"[prabowo, laku, sejahtera, rakyat]"
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,[nan],,Lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang tidak menyambung junjungan lu ...,lah justru yang tidak sambung junjung lu aomkm...,sambung junjung lu aomkmkmkmk omong etika anie...,"[sambung, junjung, lu, aomkmkmkmk, omong, etik...",sambung junjung aomkmkmkmk omong etika anies o...,"[sambung, junjung, omong, etika, anies, anies,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,TXT0996,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan...","[ganjar, mahfud2024]",,"Bikin bangga deh, Ganjar-Mahfud mau alokasikan...",bikin bangga deh ganjarmahfud mau alokasikan s...,bikin bangga deh ganjarmahfud mau alokasikan s...,bikin bangga deh ganjarmahfud mau alokasikan s...,bikin bangga deh ganjarmahfud mau alokasi teng...,bikin bangga deh ganjarmahfud alokasi belanja ...,"[bikin, bangga, deh, ganjarmahfud, alokasi, be...",bikin bangga ganjarmahfud alokasi belanja peri...,"[bikin, bangga, alokasi, belanja, perintah, ko..."
996,TXT0997,Pak Jokowi sebelum pilpres 2024 berbesar hati ...,[nan],OliviaCath1540,Pak Jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi sebelum pilpres 2024 berbesar hati ...,pak jokowi belum pilpres 2024 besar hati rangk...,jokowi pilpres 2024 besar hati rangkul prabowo...,"[jokowi, pilpres, 2024, besar, hati, rangkul, ...",jokowi pilpres hati rangkul prabowo prabowo ut...,"[jokowi, hati, rangkul, prabowo, prabowo, utuh..."
997,TXT0998,@datuakrajoangek Sbaiknya si gemot nga usah ik...,[nan],brochandra,Sbaiknya si gemot nga usah ikutan debat deh..n...,sbaiknya si gemot nga usah ikutan debat dehnga...,sbaiknya si gemot nga usah ikutan debat dehnga...,sebaiknya sih gemot tidak usah ikutan debat de...,baik sih got tidak usah ikut debat dehnga jela...,got debat dehnga apakalau serang 80t dikemanai...,"[got, debat, dehnga, apakalau, serang, 80t, di...",got debat dehnga apakalau serang 80t dikemanai...,"[got, debat, serang, polusi, anies, buka, raha..."
998,TXT0999,kebiasaan merembuk atau bermusyawarah jadi gay...,"[rembu, ganjar]",,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...,kebiasaan merembuk atau bermusyawarah jadi gay...,biasa rembuk atau musyawarah jadi gaya pimpin ...,rembuk musyawarah gaya pimpin ganjar gubernur ...,"[rembuk, musyawarah, gaya, pimpin, ganjar, gub...",rembuk musyawarah gaya pimpin ganjar gubernur ...,"[rembuk, musyawarah, gaya, pimpin, ganjar, gub..."


### 19. Remove Word that Just Consist of 2 Letter and Replace Milu to Pemilu

In [1]:
def clear_list(clear2List):
    # Menghapus elemen dengan panjang kurang dari atau sama dengan 2 karakter
    list_karakter = [item for item in clear2List if len(item) > 2]

    # Mengganti semua kemunculan "milu" menjadi "pemilu" di dalam list
    for i in range(len(list_karakter)):
        list_karakter[i] = list_karakter[i].replace("milu", "pemilu")
    
    return list_karakter

data_clean['cleaned_KBBI'] = data_clean['KBBI'].apply(clear_list)

NameError: name 'data_clean' is not defined

### 20. Optional Make it To Unique Words

In [129]:
data_clean['unique_teks'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(set(x)))

### 21. Drop Nan Result

In [None]:
# data_clean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.dropna(inplace=True)


In [131]:
data_clean.head()

Unnamed: 0,IDText,Text,preprocessed_hastag,username,preprocessed,lower,clean,formal,lemmatized,stopword_removed,tokens,stopword_removed2,KBBI,cleaned_KBBI,unique_teks
0,TXT0001,Lu mau org2 pro-demokrasi di negara ini bisa p...,[nan],debordsbitch,Lu mau org2 pro-demokrasi di negara ini bisa p...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau org2 prodemokrasi di negara ini bisa pu...,lu mau orang-orang prodemokrasi di negara ini ...,lu mau orang prodemokrasi di negara ini bisa p...,lu orang prodemokrasi negara ngelawan oligarki...,"[lu, orang, prodemokrasi, negara, ngelawan, ol...",orang prodemokrasi negara ngelawan oligarki in...,"[orang, prodemokrasi, negara, oligarki, indone...","[orang, prodemokrasi, negara, oligarki, indone...",orang oligarki prabowo pilih partai gama negar...
1,TXT0002,Prabowo ditanya soal hutang luar negeri dia me...,[nan],mazzini_gsp,Prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo ditanya soal hutang luar negeri dia me...,prabowo tanya soal hutang luar negeri dia jawa...,prabowo hutang negeri hutang negeri indonesia ...,"[prabowo, hutang, negeri, hutang, negeri, indo...",prabowo hutang negeri hutang negeri indonesia ...,"[prabowo, hutang, negeri, hutang, negeri, indo...","[prabowo, hutang, negeri, hutang, negeri, indo...",normal uang intervensi ganjar menteri tahan am...
2,TXT0003,kiki_daliyo Ganjar Pranowo itulah beliau soso...,"[ganjar, merawat, pancasila]",,kiki_daliyo Ganjar Pranowo itulah beliau sosok...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itulah beliau sosok ...,kikidaliyo ganjar pranowo itu beliau sosok yan...,kikidaliyo ganjar pranowo sosok mengagunhkan n...,"[kikidaliyo, ganjar, pranowo, sosok, mengagunh...",kikidaliyo ganjar pranowo sosok mengagunhkan n...,"[ganjar, sosok, nilai, nilai, pancasila, sadar...","[ganjar, sosok, nilai, nilai, pancasila, sadar...",nilai hebat ganjar pancasila sadar rawat sosok
3,TXT0004,@kumparan Prabowo Gibran yang bisa melakukan i...,[nan],,Prabowo Gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa melakukan itu semua d...,prabowo gibran yang bisa laku itu semua demi s...,prabowo gibran laku sejahtera rakyat nan,"[prabowo, gibran, laku, sejahtera, rakyat, nan]",prabowo gibran laku sejahtera rakyat nan,"[prabowo, laku, sejahtera, rakyat]","[prabowo, laku, sejahtera, rakyat]",rakyat laku prabowo sejahtera
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib Lah ...,[nan],,Lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yg gak nyambung junjungan elu aomkm...,lah justru yang tidak menyambung junjungan lu ...,lah justru yang tidak sambung junjung lu aomkm...,sambung junjung lu aomkmkmkmk omong etika anie...,"[sambung, junjung, lu, aomkmkmkmk, omong, etik...",sambung junjung aomkmkmkmk omong etika anies o...,"[sambung, junjung, omong, etika, anies, anies,...","[sambung, junjung, omong, etika, anies, anies,...",hutang sopan junjung ejek omong etika wajah an...


In [None]:
data.groupby('label').count()['']

Unnamed: 0_level_0,text,hastag
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Demografi,62,62
Ekonomi,367,367
Geografi,20,20
Ideologi,400,400
Pertahanan dan Keamanan,400,400
Politik,2972,2972
Sosial Budaya,587,587
Sumber Daya Alam,192,192


In [None]:
data_clean['TEKS'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['TEKS'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(x))


In [None]:
# data_clean[['stopword_removed2','label']].dropna().to_csv("latih_lagi.csv",index=False)

In [132]:
data_clean.to_csv("./dataset/processed dataset/processed_dataset_test.csv",index=False)

In [None]:
data_hastag = data_clean[['preprocessed_hastag','label']]
data_hastag.replace('nan',np.nan,inplace=True)
data_hastag.dropna(inplace=True)

data_hastag.to_csv("dataset/processed dataset/hastag_data.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hastag.replace('nan',np.nan,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_hastag.dropna(inplace=True)


In [None]:
data_hastag

Unnamed: 0,preprocessed_hastag,label
1,amin miskinkan koruptor,Politik
5,ganjar mahfud2024,Politik
9,asal bukan prabowo,Politik
11,ganjar mahfud rebound,Pertahanan dan Keamanan
12,metrotvxtend,Pertahanan dan Keamanan
...,...,...
4972,menang seputaran,Politik
4977,pemilupedia,Politik
4978,indonesia kuat indonesia maju,Politik
4990,prabowo selamatkan wilfrida,Politik
