# PREPROCESSING

### 1. Load All Library

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.classify import SklearnClassifier
from nltk.stem import LancasterStemmer, WordNetLemmatizer

import matplotlib.pyplot as plt

nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Load Dataset

In [2]:
data = pd.read_csv("./Dataset/Raw Dataset/dataset_penyisihan_bdc_2024.csv",sep=';')

In [3]:
data.head()

Unnamed: 0,text,label
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik


### 3. Data Assesment

In [4]:
data.info()
data.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


text     5000
label    5000
dtype: int64

In [5]:
data.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Demografi,62
Ekonomi,367
Geografi,20
Ideologi,400
Pertahanan dan Keamanan,400
Politik,2972
Sosial Budaya,587
Sumber Daya Alam,192


### 4. Clean Data (Drop Duplicate)

In [6]:
data_clean = data.drop_duplicates()

### 5. Split Camel Case and Cleaning from Twitter Format 

In [7]:
def preprocess_tweet(data):
    text = data['preprocessed']
    text = p.clean(text)
    return text

def split_camel_case(text):
    return [re.sub(r'([a-z])([A-Z])', r'\1 \2', i) for i in text]

In [8]:
data['hastag'] = data['text'].apply(lambda x: re.findall(f"#(\w+)",x))
data_clean['preprocessed_hastag'] = data['hastag'].apply(split_camel_case)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed_hastag'] = data['hastag'].apply(split_camel_case)


### 6. Lower The Text and Split The Text

In [9]:
data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x: [i.lower() for i in x])
data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x: ' '.join(x).split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x: [i.lower() for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed_hastag'] = data_clean['preprocessed_hastag'].apply(lambda x: ' '.join(x).split())


In [10]:
data_clean.head()

Unnamed: 0,text,label,preprocessed_hastag
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam,"[indonesia, sentris, indonesia, hijau, 02melan..."
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik,"[amin, miskinkan, koruptor]"
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi,[]
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik,[]
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik,[]


### 7. Clean @ or (USERNAME) and # or (HASTAG) in Text

In [11]:
def clean_username(text):
    return re.sub(r'[@#]\S+', '', text)

data_clean['preprocessed'] = data_clean['text'].apply(lambda x : clean_username(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed'] = data_clean['text'].apply(lambda x : clean_username(x))


### 8. Combine Hastag and Text

In [12]:
data_clean['preprocessed'] = data_clean['preprocessed'].apply(lambda x : x.split())
data_clean['preprocessed'] = data_clean['preprocessed'] + data_clean['preprocessed_hastag']
data_clean['preprocessed'] = data_clean['preprocessed'].apply(lambda x : ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed'] = data_clean['preprocessed'].apply(lambda x : x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed'] = data_clean['preprocessed'] + data_clean['preprocessed_hastag']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['preprocessed'] = 

### 9. Lower, Delete Punctuation, and Delete Double Space

In [13]:
import string

data_clean['lower'] = data_clean['preprocessed'].str.lower().str.replace("[^\w\s]","").str.replace("\s\s+","")

for index,row in data_clean.iterrows():
    row['lower'] = row['lower'].translate(str.maketrans('','',string.punctuation))
    data_clean['lower'][index] = row['lower']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['lower'] = data_clean['preprocessed'].str.lower().str.replace("[^\w\s]","").str.replace("\s\s+","")
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation:

### 10. Clean HTTPS and Number

In [14]:
data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\d+', '', x))
data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\b\w*https\w*\b', '', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\d+', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['clean'] = data_clean['lower'].apply(lambda x: re.sub(r'\b\w*https\w*\b', '', x))


### 10. CHANGE NON FORMAL TO FORMAL

In [15]:
slang_list = pd.read_csv("./dataset/corpus dataset/colloquial-indonesian-lexicon.csv")
slang = slang_list['slang'].values.tolist()
formal = slang_list['formal'].values.tolist()
slangToformal = dict(zip(slang,formal))

formal_tokens = []
for index,row in data_clean.iterrows():
    temp = []
    for word in row['clean'].split():
        temp.append(slangToformal.get(word,word))
    res = " ".join(temp)
    formal_tokens.append(res)
data_clean["formal"] = formal_tokens

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean["formal"] = formal_tokens


### 11. Cleaning Word that Start With Re (Retweet)

In [16]:
data_clean['formal'] = data_clean['formal'].str.replace(r're\s+\S+', '',regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['formal'] = data_clean['formal'].str.replace(r're\s+\S+', '',regex=True)


## 13. Lematisasi

In [17]:
from nlp_id.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer()
lemmatized=[]
for index, row in data_clean.iterrows():
    lemmatized.append(lemmatizer.lemmatize(row['formal']))

data_clean['lemmatized']=lemmatized

lemmatizer = Lemmatizer()
lemmatized=[]
for index, row in data_clean.iterrows():
    lemmatized.append(lemmatizer.lemmatize(row['formal']))

data_clean['lemmatized']=lemmatized

  from .autonotebook import tqdm as notebook_tqdm
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['lemmatized']=lemmatized
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['lemmatized']=lemmatized


### 13. Stopword Removal

In [18]:
from nlp_id.stopword import StopWord

stopword = StopWord()
stopword_removed=[]

for index, row in data_clean.iterrows():
    stopword_removed.append(stopword.remove_stopword(row['lemmatized']))

data_clean['stopword_removed']=stopword_removed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['stopword_removed']=stopword_removed


### 14. Tokenisasi

In [19]:
from nlp_id.tokenizer import Tokenizer
tokenizer = Tokenizer()

tokens_c=[]

for index, row in data_clean.iterrows():
    tokens = tokenizer.tokenize(row['stopword_removed'])
    tokens_c.append(tokens)

data_clean['tokens']=tokens_c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['tokens']=tokens_c


### 15. Second Stopword Removal

In [20]:
#augment the stopwords with nonstandard twitter words
stopwords_set = set(stopwords.words("indonesian"))
stopwords_aug = {"ya","yak","iya","yg","ga","gak","gk","udh","sdh","udah","dah","nih","ini","deh","sih","dong","donk",
                 "sm","knp","utk","yaa","tdk","gini","gitu","bgt","gt","nya","kalo","cb","jg","jgn","gw","ge",
                 "sy","min","mas","mba","mbak","pak","kak","trus","trs","bs","bisa","aja","saja","no",
                 "w","g","gua","gue","emang","emg","wkwk","dr","kau","dg","gimana","apapun","apa",
                 "klo","yah","banget","pake","terus","krn","jadi","jd","mu","ku","si","hehe",
                 "tp","pa","lu","lo","lw","tw","tau","karna","kayak","ky","lg","untuk","tuk","dg","dgn"}
stopwords_all = stopwords_set.union(stopwords_aug)

data_clean['stopword_removed2'] = data_clean['tokens'].apply(lambda x: [item for item in x if item not in stopwords_all])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['stopword_removed2'] = data_clean['tokens'].apply(lambda x: [item for item in x if item not in stopwords_all])


### 16. Remove Digit in List

In [21]:
data_clean['stopword_removed2'] = data_clean['stopword_removed2'].apply(lambda x : [item for item in x if not item.isdigit()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['stopword_removed2'] = data_clean['stopword_removed2'].apply(lambda x : [item for item in x if not item.isdigit()])


### 17. Translate to Indonesia

In [24]:
from deep_translator import GoogleTranslator

def translate_to_indonesian(text):
    translated_text = GoogleTranslator(source='en', target='id').translate(text)
    return translated_text

indices_to_translate = [40, 415, 1646, 2973, 3767, 3721, 4736, 392,39,772,1121,1269,1787,306,2271,2344,17]

def conditional_translation(row, index):
    if index in indices_to_translate:
        return translate_to_indonesian(' '.join(row))
    else:
        return ' '.join(row)

In [25]:
data_clean['stopword_removed2'] = data_clean.apply(
    lambda row: conditional_translation(row['stopword_removed2'], row.name),
    axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['stopword_removed2'] = data_clean.apply(


### 18. Filter in KBBI Corpus

In [26]:
kbbi_corpus = pd.read_csv("./dataset/corpus dataset/kbbi.csv")
kbbi_corpus.drop_duplicates(inplace=True)
list_kbbi = kbbi_corpus['kata'].to_list()

def hapus_kata_non_sastrawi(kata_list):
    kata_list = kata_list.split()
    return [kata for kata in kata_list if kata in list_kbbi]

In [27]:
# Terapkan fungsi ke kolom stopword_removed2
data_clean['KBBI'] = data_clean['stopword_removed2'].apply(hapus_kata_non_sastrawi)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['KBBI'] = data_clean['stopword_removed2'].apply(hapus_kata_non_sastrawi)


### 19. Remove Word that Just Consist of 2 Letter and Replace Milu to Pemilu

In [28]:
def clear_list(clear2List):
    # Menghapus elemen dengan panjang kurang dari atau sama dengan 2 karakter
    list_karakter = [item for item in clear2List if len(item) > 2]

    # Mengganti semua kemunculan "milu" menjadi "pemilu" di dalam list
    for i in range(len(list_karakter)):
        list_karakter[i] = list_karakter[i].replace("milu", "pemilu")
    
    return list_karakter

data_clean['cleaned_KBBI'] = data_clean['KBBI'].apply(clear_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['cleaned_KBBI'] = data_clean['KBBI'].apply(clear_list)


### 20. Optional Make it To Unique Words

In [29]:
data_clean['unique_teks'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(set(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['unique_teks'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(set(x)))


### 21. Drop Nan Result

In [30]:
data_clean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean.dropna(inplace=True)


In [31]:
data_clean.head()

Unnamed: 0,text,label,preprocessed_hastag,preprocessed,lower,clean,formal,lemmatized,stopword_removed,tokens,stopword_removed2,KBBI,cleaned_KBBI,unique_teks
0,Kunjungan Prabowo ini untuk meresmikan dan men...,Sumber Daya Alam,"[indonesia, sentris, indonesia, hijau, 02melan...",Kunjungan Prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjungan prabowo ini untuk meresmikan dan men...,kunjung prabowo ini untuk resmi dan serah proy...,kunjung prabowo resmi serah proyek bantu air b...,"[kunjung, prabowo, resmi, serah, proyek, bantu...",kunjung prabowo resmi serah proyek bantu air b...,"[kunjung, prabowo, resmi, serah, proyek, bantu...","[kunjung, prabowo, resmi, serah, proyek, bantu...",kunjung emas muda hijau bersih air proyek anak...
1,RT Anies dapat tepuk tangan meriah saat jadi R...,Politik,"[amin, miskinkan, koruptor]",RT Anies dapat tepuk tangan meriah saat jadi R...,rt anies dapat tepuk tangan meriah saat jadi r...,rt anies dapat tepuk tangan meriah saat jadi r...,rt anies dapat tepuk tangan meriah saat jadi r...,rt anies dapat tepuk tangan riah saat jadi rek...,rt anies tepuk tangan riah rektor wajib mata k...,"[rt, anies, tepuk, tangan, riah, rektor, wajib...",rt anies tepuk tangan riah rektor wajib mata k...,"[rt, anies, tepuk, tangan, riah, rektor, wajib...","[anies, tepuk, tangan, riah, rektor, wajib, ma...",riah tepuk putus tangan amin anti mata korupto...
2,@CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...,Demografi,[],"emng bener sih, pendukung 01 ada yg goblok, be...",emng bener sih pendukung 01 ada yg goblok begi...,emng bener sih pendukung 01 ada yg goblok begi...,memang benar sih pendukung 01 ada yang goblok ...,memang benar sih dukung 01 ada yang goblok beg...,dukung 01 goblok dukung 02 ridwan kamil skema ...,"[dukung, 01, goblok, dukung, 02, ridwan, kamil...",dukung goblok dukung ridwan kamil skema mayori...,"[dukung, goblok, dukung, kamil, skema, mayorit...","[dukung, goblok, dukung, kamil, skema, mayorit...",kamil mayoritas goblok arti dukung rendah pili...
3,RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...,Politik,[],RT Sewaktu anies bersikap kritis ke kinerja pa...,rt sewaktu anies bersikap kritis ke kinerja pa...,rt sewaktu anies bersikap kritis ke kinerja pa...,rt sewaktu anies bersikap kritis ke kinerja pa...,rt waktu anies sikap kritis ke kerja pak prabo...,rt anies sikap kritis kerja prabowo anggap sop...,"[rt, anies, sikap, kritis, kerja, prabowo, ang...",rt anies sikap kritis kerja prabowo anggap sop...,"[rt, anies, sikap, kritis, kerja, prabowo, ang...","[anies, sikap, kritis, kerja, prabowo, anggap,...",gibran sikap kritis hormat sok kerja tua tengi...
4,Anies Baswedan Harap ASN termasuk TNI dan Polr...,Politik,[],Anies Baswedan Harap ASN termasuk TNI dan Polr...,anies baswedan harap asn termasuk tni dan polr...,anies baswedan harap asn termasuk tni dan polr...,anies baswedan harap aparatur sipil negara ter...,anies baswedan harap aparatur sipil negara mas...,anies baswedan harap aparatur sipil negara mas...,"[anies, baswedan, harap, aparatur, sipil, nega...",anies baswedan harap aparatur sipil negara mas...,"[anies, harap, aparatur, sipil, negara, masuk,...","[anies, harap, aparatur, sipil, negara, masuk,...",tentara aparatur polri sipil sumpah pegang har...


In [32]:
data.groupby('label').count()

Unnamed: 0_level_0,text,hastag
label,Unnamed: 1_level_1,Unnamed: 2_level_1
Demografi,62,62
Ekonomi,367,367
Geografi,20,20
Ideologi,400,400
Pertahanan dan Keamanan,400,400
Politik,2972,2972
Sosial Budaya,587,587
Sumber Daya Alam,192,192


In [33]:
data_clean['TEKS'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['TEKS'] = data_clean['cleaned_KBBI'].apply(lambda x : ' '.join(x))


In [None]:
data_clean.to_csv("./Dataset/Processed Dataset/processed_dataset_2.csv",index=False)

In [37]:
data_clean[['stopword_removed2','label']].dropna().to_csv("./Dataset/Processed Dataset/latih_lagi_2.csv",index=False)

---

# **MODELLING**

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_metric
# from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from sklearn.metrics import balanced_accuracy_score

In [None]:
dataset = pd.read_csv("./dataset/processed dataset/DatasetTrainAllVersion.csv")
dataset.drop_duplicates(inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X = list(dataset['text'].astype(str))
y = list(dataset['label'])

y = le.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split

# Memisahkan data menjadi train dan test set
train_texts, eval_texts, train_labels, eval_labels = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = AutoTokenizer.from_pretrained('indolem/indobertweet-base-uncased')

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
eval_dataset = CustomDataset(eval_texts, eval_labels, tokenizer)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('indolem/indobertweet-base-uncased', num_labels=8)
model = model.to('cuda')

# Define training arguments
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    balanced_acc = balanced_accuracy_score(labels, predictions)
    return {"balanced_accuracy": balanced_acc}

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("./bert_tweet_50K")
trainer.evaluate()

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.preprocessing import LabelEncoder

# Load dataset
dataset = pd.read_csv("./predict data/raw_text.csv")

# Prepare texts
X = list(dataset['text'].astype(str))

# Define path to the model
model_path = './bert_model/raw_twitbert/raw_tweetbert'

# Check if GPU is available
device = torch.device('cpu')

# Load tokenizer and model from local directory
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

def predict_batch(texts, batch_size=16):
    """Tokenize texts and perform prediction in batches."""
    results = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
            results.extend(batch_predictions)

    return results

# Perform batch prediction
batch_size = 16  # Adjust batch size as needed
predicted_labels = predict_batch(X, batch_size)

# Convert predictions to actual class labels
label = ['Politik',
 'Ideologi',
 'Pertahanan dan Keamanan',
 'Sosial Budaya',
 'Ekonomi',
 'Demografi',
 'Geografi',
 'Sumber Daya Alam']

le = LabelEncoder()
le.fit(label)  # Fit label encoder on the dataset labels
predicted_labels = le.inverse_transform(predicted_labels)

# Create a new DataFrame with text and predicted labels
results_df = pd.DataFrame({
    'IDText' : dataset['IDText'],
    'Text': X,
    'Predicted_Label': predicted_labels
})

# Optionally, you can save this DataFrame to a CSV file
results_df.to_csv("./predict data/result/raw_tweet.csv", index=False)

print(results_df.head())  # Print the first few rows of the DataFrame

In [None]:
test = pd.read_csv("./dataset/raw dataset/dataset_unlabeled_penyisihan_bdc_2024.csv",delimiter=";")

In [None]:
result = pd.read_csv("./Result/Result Temp.csv")

In [None]:
from sklearn.metrics import balanced_accuracy_score
import pandas as pd

def find_mode(row):
    mode = row[['Predicted_Label_raw_tweet', 'Predicted_Label_BertTweet']].mode()
    print(mode)
    if len(mode) == 0:
        return row['Predicted_Label_BertTweet']  # Jika tidak ada modus, kembalikan None atau nilai lain sesuai kebutuhan
    else:
        return mode.iloc[0]  # Mengambil nilai modus dari baris pertama, kolom pertam
    
modus = result.apply(find_mode, axis=1)

result['modus'] = modus

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
predict = result['modus']
label = result['Label']

In [None]:
balanced_acc = balanced_accuracy_score(label, predict)

print("Balanced Accuracy:", balanced_acc)