In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch the NLTK resources used below: the stopword lists and the Punkt models
# behind word_tokenize. Recent NLTK releases (3.9+) load Punkt from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so both are requested
# to keep the notebook runnable on a fresh environment with either version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Siswantoro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Siswantoro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Preprocessing

In [2]:
# Load the Instagram comments dataset from the working directory
df = pd.read_csv(r'./instagram.csv')

In [3]:
# Preview the first two rows of the raw frame
df.head(2)

Unnamed: 0,username,komentar
0,kompascom,KOREKSI: Dalam infografis tertulis “Periode su...
1,noto.aries.3,"Pdip dukung anies aja, agar bisa menang jadi s..."


In [4]:
# Report the size of the raw frame (rows, then columns)
n_rows, n_cols = df.shape
print('total baris', n_rows)
print('total kolom', n_cols)

total baris 412
total kolom 2


In [5]:
# Count missing values per column

df.isna().sum()

username    0
komentar    2
dtype: int64

In [6]:
# Count fully duplicated rows

df.duplicated().sum()

2

In [7]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps
df_clean = df.copy()

In [8]:
# Handle missing values: drop every row containing a NaN, then confirm none remain

df_clean = df_clean.dropna()
df_clean.isna().sum()

username    0
komentar    0
dtype: int64

In [9]:
# Handle duplicates: drop repeated rows, then confirm none remain

df_clean = df_clean.drop_duplicates()
df_clean.duplicated().sum()

0

In [10]:
# Helper functions for cleaning the data

def remove_emoji(text):
    """Return `text` with every emoji removed.

    Values that are not strings (e.g. NaN, None) are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    return emoji.replace_emoji(text, replace='')

def clean_text(text):
    """Normalise a raw comment to lowercase a-z words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. drop links (``http...`` / ``www....``), ``@mentions`` and ``#hashtags``;
      3. replace every remaining character that is not a-z or whitespace
         (digits, punctuation, emoji, ...) with a space;
      4. collapse whitespace runs and trim the ends.

    Removed pieces are replaced by a space rather than by nothing, so words
    that were only separated by punctuation or an emoji ("bagus,tapi") do not
    get glued together. Emoji need no separate pass: they are non-letters and
    are handled by step 3.

    Parameters
    ----------
    text : Any
        The raw comment. Non-string values are returned unchanged.

    Returns
    -------
    str or Any
        The cleaned string, or the original value if it was not a string.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Instagram handles may contain dots (e.g. @noto.aries.3), so the mention
    # pattern accepts '.' as well; otherwise the tail of the handle would leak
    # into the cleaned text as a fake word.
    text = re.sub(r'http\S+|www\.\S+|@[\w.]+|#\w+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()


In [11]:
# Add a `clean_comment` column holding the cleaned version of each comment
df_clean['clean_comment'] = df_clean['komentar'].apply(clean_text)

In [12]:
# Inspect the frame with the new `clean_comment` column
df_clean

Unnamed: 0,username,komentar,clean_comment
0,kompascom,KOREKSI: Dalam infografis tertulis “Periode su...,koreksi dalam infografis tertulis periode surv...
1,noto.aries.3,"Pdip dukung anies aja, agar bisa menang jadi s...",pdip dukung anies aja agar bisa menang jadi sa...
2,fakta.jakarta,Kok respondennya cuma 400 orang min? 😂 Di akun...,kok respondennya cuma orang min di akun kami p...
3,annie_anieeeee,Tetep Ahok ❤️❤️❤️❤️🔥🔥🔥🔥🔥,tetep ahok
4,info___rumahbaru,ahok aja daripada si mANIES,ahok aja daripada si manies
...,...,...,...
407,dermawan.iwan,Babah.... Siapin yg banyak,babah siapin yg banyak
408,juhendri5834,Anies l,anies l
409,kambing.jantan.etawa,"Hoax ini, harusnya kesang 78 persen😂 bahkan bi...",hoax ini harusnya kesang persen bahkan bisa tr...
410,_penuntutsurga,Yang 1% nanti berubah jadi 58% 👏 sim salabim j...,yang nanti berubah jadi sim salabim jadi apa p...


In [13]:
# Remove Indonesian stopwords from the cleaned comments

stop_words = set(stopwords.words('indonesian'))


def _without_stopwords(comment):
    """Re-join the tokens of `comment` that are not stopwords; pass non-strings through."""
    if not isinstance(comment, str):
        return comment
    kept = [token for token in word_tokenize(comment) if token not in stop_words]
    return ' '.join(kept)


# Overwrites `clean_comment` in place with the stopword-free text
df_clean['clean_comment'] = df_clean['clean_comment'].apply(_without_stopwords)

In [14]:
# Build the Sastrawi stemmer and stem every cleaned comment

factory = StemmerFactory()
stemmer = factory.create_stemmer()


def _stem_comment(comment):
    """Stem a comment string; any non-string value becomes ''."""
    if isinstance(comment, str):
        return stemmer.stem(comment)
    return ''


df_clean['stem_comment'] = df_clean['clean_comment'].apply(_stem_comment)

In [15]:
# Tokenise the stemmed comments

def _to_tokens(comment):
    """Split a stemmed comment into tokens; any non-string value becomes []."""
    if isinstance(comment, str):
        return word_tokenize(comment)
    return []


df_clean['tokens'] = df_clean['stem_comment'].apply(_to_tokens)

In [16]:
# Create the sentiment labels

def get_sentiment(text, threshold=0.1):
    """Map a comment to 'positif', 'negatif' or 'netral' using TextBlob polarity.

    Parameters
    ----------
    text : Any
        The comment to score. Non-string or blank values are labelled
        'netral' without being passed to TextBlob, matching the
        ``isinstance(x, str)`` guards used by the other pipeline steps.
    threshold : float, default 0.1
        Polarity strictly above ``threshold`` is 'positif', strictly below
        ``-threshold`` is 'negatif'; everything in between is 'netral'.

    Returns
    -------
    str
        One of 'positif', 'negatif', 'netral'.

    Notes
    -----
    NOTE(review): TextBlob's default analyzer is built on an English lexicon,
    so Indonesian words mostly score 0 and end up 'netral'. Confirm the labels
    are fit for purpose before training on them.
    """
    if not isinstance(text, str) or not text.strip():
        return 'netral'

    polarity = TextBlob(text).sentiment.polarity

    if polarity > threshold:
        return 'positif'
    if polarity < -threshold:
        return 'negatif'
    return 'netral'

# Label every comment from its stopword-filtered text (`clean_comment`).
# NOTE(review): TextBlob's default analyzer appears to be English-lexicon based, so
# Indonesian text mostly scores near 0 — TODO confirm the labels are meaningful.
df_clean['label'] = df_clean['clean_comment'].apply(get_sentiment)

In [17]:
# Class balance of the generated labels
df_clean['label'].value_counts()

label
netral     397
positif      7
negatif      4
Name: count, dtype: int64

In [18]:
# Preview the first two rows with all derived columns
df_clean.head(2)

Unnamed: 0,username,komentar,clean_comment,stem_comment,tokens,label
0,kompascom,KOREKSI: Dalam infografis tertulis “Periode su...,koreksi infografis tertulis periode survei jul...,koreksi infografis tulis periode survei juli p...,"[koreksi, infografis, tulis, periode, survei, ...",netral
1,noto.aries.3,"Pdip dukung anies aja, agar bisa menang jadi s...",pdip dukung anies aja menang sakit hati pilpre...,pdip dukung anies aja menang sakit hati pilpre...,"[pdip, dukung, anies, aja, menang, sakit, hati...",netral


### Save dataset

In [21]:
# Save the file
# NOTE: the `tokens` column holds Python lists; CSV stores them as their string
# representation, so they must be parsed back after reloading.

df_clean.to_csv('./data_clean.csv', index=False) # the cleaned data