In [9]:
!pip install sastrawi nltk pandas scikit-learn



In [10]:
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# Untuk tokenisasi
nltk.download('punkt')
nltk.download('punkt_tab')

# Untuk stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# load data source
file_path = 'tweet.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0.1,Unnamed: 0,sentimen,tweet
0,0,negatif,Kata @prabowo Indonesia tidak dihargai bangsa ...
1,1,netral,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib..."
2,2,netral,"Di era Jokowi, ekonomi Indonesia semakin baik...."
3,3,positif,"Bagi Sumatera Selatan, Asian Games berdampak p..."
4,4,negatif,Negara kita ngutang buat bngun infrastruktur y...


In [13]:
# clean data
def clean_text(text):

    # case folding : set to lowering text
    text = text.lower()

    # remove noise regex
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['text'] = df['tweet'].apply(clean_text) # move tweets to text for tokenization of comparison results

In [14]:
# Tokenisasi : Word Tokenized
df['word_tokenized'] = df['text'].apply(word_tokenize)

print(df['word_tokenized'].head())

0    [kata, indonesia, tidak, dihargai, bangsa, asi...
1    [batuan, langka, tasbih, jokowi, hadiah, dari,...
2    [di, era, jokowi, ekonomi, indonesia, semakin,...
3    [bagi, sumatera, selatan, asian, games, berdam...
4    [negara, kita, ngutang, buat, bngun, infrastru...
Name: word_tokenized, dtype: object


In [15]:
# Tokenisasi : Tweet Tokenized
tweet_tokenizer = TweetTokenizer()
df['tweet_tokenized'] = df['text'].apply(tweet_tokenizer.tokenize)

print(df['tweet_tokenized'].head())

0    [kata, indonesia, tidak, dihargai, bangsa, asi...
1    [batuan, langka, tasbih, jokowi, hadiah, dari,...
2    [di, era, jokowi, ekonomi, indonesia, semakin,...
3    [bagi, sumatera, selatan, asian, games, berdam...
4    [negara, kita, ngutang, buat, bngun, infrastru...
Name: tweet_tokenized, dtype: object


In [16]:
stop_words = set(stopwords.words('indonesian'))
df['tweet_no_stopwords'] = df['tweet_tokenized'].apply(lambda x: [word for word in x if word not in stop_words])
print(df['tweet_no_stopwords'].head())

0    [indonesia, dihargai, bangsa, asing, berita, h...
1    [batuan, langka, tasbih, jokowi, hadiah, habib...
2    [era, jokowi, ekonomi, indonesia, indonesiamaj...
3    [sumatera, selatan, asian, games, berdampak, p...
4    [negara, ngutang, bngun, infrastruktur, udah, ...
Name: tweet_no_stopwords, dtype: object


In [17]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
df['tweet_stemmed'] = df['tweet_no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
print(df['tweet_stemmed'].head())

0    [indonesia, harga, bangsa, asing, berita, hoax...
1    [batu, langka, tasbih, jokowi, hadiah, habib, ...
2    [era, jokowi, ekonomi, indonesia, indonesiamaj...
3    [sumatera, selatan, asi, games, dampak, pd, ek...
4    [negara, ngutang, bngun, infrastruktur, udah, ...
Name: tweet_stemmed, dtype: object


In [19]:
df['tweet_process'] = df['tweet_stemmed'].apply(lambda x: ' '.join(x))
print(df['tweet_process'].head())

0    indonesia harga bangsa asing berita hoax buat ...
1    batu langka tasbih jokowi hadiah habib luthfi ...
2    era jokowi ekonomi indonesia indonesiamaju jok...
3    sumatera selatan asi games dampak pd ekonomi l...
4    negara ngutang bngun infrastruktur udah dipake...
Name: tweet_process, dtype: object


In [20]:
# train data

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['tweet_process'])
print('Shape TF-IDF: ', X.shape)

Shape TF-IDF:  (1815, 5682)


In [21]:
# Cek hasil TF-IDF
print(df[['tweet', 'tweet_process']].head(10))

                                               tweet  \
0  Kata @prabowo Indonesia tidak dihargai bangsa ...   
1  Batuan Langka, Tasbih Jokowi Hadiah dari Habib...   
2  Di era Jokowi, ekonomi Indonesia semakin baik....   
3  Bagi Sumatera Selatan, Asian Games berdampak p...   
4  Negara kita ngutang buat bngun infrastruktur y...   
5  Yg bisikin pak jokowi, cm mikirin perputaran d...   
6  Masa tenang msih ngoceh aja..ttp jokowi harga ...   
7  #UASdifitnahKejiBalasDiTPS   kerjasa ekonomi b...   
8  Iya bener Aa, kita MANTAP kan pilihan ke Pemim...   
9  Prabowo-Sandi Sepakat Tak Ambil Gaji karena Ne...   

                                       tweet_process  
0  indonesia harga bangsa asing berita hoax buat ...  
1  batu langka tasbih jokowi hadiah habib luthfi ...  
2  era jokowi ekonomi indonesia indonesiamaju jok...  
3  sumatera selatan asi games dampak pd ekonomi l...  
4  negara ngutang bngun infrastruktur udah dipake...  
5  yg bisikin jokowi cm mikirin putar duit golong... 

In [22]:
# Save hasil proses
df.to_csv('tweet_preprocess_tfidf.csv', index=False)
print('Data disimpan ke "tweet_preprocess_tfidf.csv"')


Data disimpan ke "tweet_preprocess_tfidf.csv"
