<a href="https://colab.research.google.com/github/MuhRiswan/SkripsiSentimenPemilu2024_NaiveBayes/blob/main/skripsi_analisis_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Analisis Sentimen Pemberitaan Hasil Rekapitulasi Pemilu Presiden 2024**



Tahapan Analisis Data Sentimen


1.   Data Selection
     * Normalisasi
     * Labeling
2.   PreProcessing

     * Cleaning
     * Stopword
     * Tokenize
     * Stemming     
3.   Visualisasi
4.   Klasifikasi Sentimen
5.   Evaluation

# Select Data

In [None]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read dataSkripsi.csv from the Colab workspace and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="/content/dataSkripsi.csv")
df.head(n=5)

In [None]:
# Restrict the frame to the author, the comment text and the comment date.
df = df.loc[:, ['username', 'comment', 'commentDate']]
df

In [None]:
#Labeling data
!pip install transformers
!pip install googletrans==3.1.0a0
from googletrans.client import Translator
translator  = Translator()

from transformers import pipeline
sentiment_classifier = pipeline('sentiment-analysis')

In [None]:
# Drop non-ASCII characters, translate every comment to English, and keep only
# the `text` attribute of each translation result.
df['comments'] = (
    df['comment']
    .str.encode('ascii', 'ignore')
    .apply(translator.translate, dest='en')
    .apply(getattr, args=('text',))
)

In [None]:
# Read dataClean.csv from the Colab workspace and preview the first five rows.
# NOTE(review): this rebinds `df`, so the frame built in the cells above (including
# the translated 'comments' column) is discarded - presumably dataClean.csv is the
# labelled export of that work; confirm.
df = pd.read_csv(filepath_or_buffer="/content/dataClean.csv")
df.head(n=5)

# Preprocessing

In [None]:
#Cleaning
import re
import string

# Fungsi untuk membersihkan komentar Instagram
def clean_comment_instagram(data):
    """Clean one raw comment and return its words separated by single spaces.

    Removes @mentions, #hashtags, standalone "RT" markers and URLs, replaces
    every remaining non-alphanumeric character with a space, deletes digits,
    and finally collapses whitespace. Letter case is left untouched (case
    folding is a separate step).

    Args:
        data: The raw comment. Missing values (NaN/None) and non-string
            values are accepted.

    Returns:
        The cleaned string; '' when the input is missing.
    """
    if pd.isna(data):
        return ''
    # Tolerate non-string cells (e.g. a purely numeric comment parsed as a number).
    data = str(data)

    data = re.sub(r'@[A-Za-z0-9_]+', '', data)   # @mentions
    data = re.sub(r'#\w+', '', data)             # #hashtags
    # \b keeps this from eating the tail of words that end in "RT"
    # (e.g. "SUPPORT x" must not become "SUPPOx").
    data = re.sub(r'\bRT\s+', '', data)
    data = re.sub(r'https?://\S+', '', data)     # URLs
    # Emoji, punctuation and every other non-alphanumeric character become a
    # space, so no separate punctuation pass is needed afterwards.
    data = re.sub(r'[^A-Za-z0-9]', ' ', data)

    # Delete digits (including digits glued to a word).
    data = re.sub(r'\d+', '', data)

    # Collapse whitespace last, so the gaps left by removed digits are
    # normalised too and the result never has leading/trailing/double spaces.
    data = re.sub(r'\s+', ' ', data).strip()

    return data

# Add a 'cleaning' column holding the cleaned comment text.
df['cleaning'] = df['comment'].apply(clean_comment_instagram)

# Keep only the columns used downstream *before* dropping missing values, so a
# NaN in an unrelated column cannot discard an otherwise valid row.
df = df[['comment', 'label', 'cleaning']]

# Drop rows with a missing comment/label first; de-duplicating first could keep
# an unlabeled row and throw away its labelled duplicate.
df = df.dropna()

# Comments that consisted only of mentions, emoji, digits, etc. are empty after
# cleaning; '' is not NaN, so they have to be filtered explicitly.
df = df[df['cleaning'].str.strip().ne('')]

# Finally remove comments whose cleaned text is identical.
df = df.drop_duplicates(subset=['cleaning'])

# Preview the result to confirm the new column is present.
df.head()

In [None]:
# Case folding: store a lower-cased copy of the cleaned text in its own column.
df['case_folding'] = df['cleaning'].map(str.lower)
df.head(n=5)

In [None]:
#Stopword Removal
!pip install Sastrawi

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Additional stop words to remove on top of Sastrawi's built-in list.
more_stop_words = ["indonesia", "indonesian"]

# Sastrawi's built-in stop words followed by the additional ones.
factory = StopWordRemoverFactory()
stop_words = [*factory.get_stop_words(), *more_stop_words]

# Stop-word remover backed by the combined list.
new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

# Remove stop words from a piece of text.
def stopword(str_text):
    """Return ``str_text`` after passing it through ``stop_words_remover_new.remove``."""
    return stop_words_remover_new.remove(str_text)

# Strip stop words from the lower-cased text ('case_folding') and store the
# result in its own column.
df['stopword_removal'] = df['case_folding'].map(stopword)

# Preview the result.
df.head(n=5)

In [None]:
# Tokenizing: split each text on whitespace into a list of words.
df['tokenized'] = df['stopword_removal'].map(str.split)
df.head(n=5)

In [None]:
# - Stemming

# Mengimpor pustaka yang diperlukan
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd

# Build the Sastrawi stemmer used by stemming() below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem a tokenized text.
def stemming(text_cleaning):
    """Stem every token in ``text_cleaning`` and return them joined by single spaces.

    ``text_cleaning`` is iterated item by item, so it is expected to be a
    sequence of words (the 'tokenized' column), not a single string.
    """
    return " ".join(map(stemmer.stem, text_cleaning))

# Stem each token list in df['tokenized'] and store the re-joined text.
# NOTE: this overwrites the original text in df['comment']; the cells below
# read df['comment'] as the fully preprocessed text.
df['comment'] = df['tokenized'].apply(stemming)
tokenized = df['comment']  # Series of stemmed comment strings

# Save the preprocessed text to CSV (pandas writes UTF-8 by default).
tokenized.to_csv("/content/hasilPreProcessingData.csv", index=False)

# Read the file back with the same encoding it was written in; the previous
# 'latin1' did not match what to_csv produces.
data = pd.read_csv("/content/hasilPreProcessingData.csv", encoding='utf-8')


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights from the preprocessed comments, then
# turn the same comments into the weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['comment'])
tfidf = tfidf_vectorizer.transform(df['comment'])

# Vocabulary terms, in matrix column order.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense table: one row per comment, one column per term, cell = TF-IDF weight.
df_tfidf = pd.DataFrame(data=tfidf.toarray(), columns=feature_names)
print(df_tfidf)

In [None]:
# Learn the vocabulary of unique words in the preprocessed comments.
comment = df['comment']
cv = CountVectorizer()
term_fit = cv.fit(raw_documents=comment)

# Vocabulary size, i.e. the number of distinct words found in `comment`.
vocabulary = term_fit.vocabulary_
print(len(vocabulary))

In [None]:
# Show the learned vocabulary: a mapping from each unique word to the numeric
# index assigned to it.
term_fit.vocabulary_

In [None]:
# Count how often each vocabulary word occurs in each comment.
# The printed sparse matrix lists one entry per non-zero count:
#   first number  = index of the document (comment), not the number of documents
#   second number = index of the word in the vocabulary
#   value         = term frequency (tf) of that word in that document
term_frequency_all = term_fit.transform(comment)
print(term_frequency_all)

In [None]:
# Term frequencies of one example comment.
# `comment_tf` is not defined by any cell of this notebook, so on a fresh kernel
# this cell raised NameError; use the first preprocessed comment as the example.
comment_tf = comment.iloc[0]
term_frequency = term_fit.transform([comment_tf])
print(term_frequency)

In [None]:
# Document-term count matrix: how often each word occurs in each comment.
dokumen = term_fit.transform(comment)

# Learn the IDF weight of every word from those counts; words that appear in
# fewer comments receive a larger weight.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(dokumen)
print(tfidf_transformer.idf_)

# Apply the learned weights to the single-comment term frequencies computed in
# the previous cell to obtain that comment's final TF-IDF values.
tfidf = tfidf_transformer.transform(term_frequency)
print(tfidf)

# Algoritma Naive Bayes

In [None]:
#Spliting data
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Scenario 90:10 - hold out 10% of the rows for testing (fixed random_state so
# the split is the same on every run).
x_train, x_test, y_train, y_test = train_test_split(
    df['comment'],
    df['label'],
    test_size=0.1,
    random_state=42,
)

# Fresh, unfitted vectorizer for this scenario.
tfidf_vectorizer = TfidfVectorizer()


In [None]:
# Report the size of each split, one shape per line.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Learn the TF-IDF vocabulary/weights from the training split and vectorize it.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
# The test split is only transformed, using what was learned from the training data.
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Train Multinomial Naive Bayes (fit returns the fitted estimator itself).
nb = MultinomialNB().fit(x_train_tfidf, y_train)

# Predict the labels of the test split.
y_pred = nb.predict(x_test_tfidf)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report # Added the missing import statement

# Evaluate the model: confusion matrix, a separator line, then the per-class report.
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('============================================================================')
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

In [None]:
# Pie chart of the correctly classified test comments (true positives vs. true
# negatives), with the absolute counts printed next to the chart.

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions. The four-way unpacking assumes
# exactly two classes; ravel() then yields the counts as tn, fp, fn, tp.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Slice captions and sizes.
labels = ['True Positive', 'True Negative']
sizes = [tp, tn]

# Draw the pie with percentage captions.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')
ax.set_title('Hasil Actual Prediction')

# Absolute counts, positioned to the right of the axes (axes-fraction coordinates).
ax.text(1.2, 0.5, f"True Positive: {tp}\nTrue Negative: {tn}", transform=ax.transAxes)

plt.show()


In [None]:
# Scenario 80:20 - hold out 20% of the rows for testing (fixed random_state so
# the split is the same on every run).
x_train, x_test, y_train, y_test = train_test_split(
    df['comment'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fresh, unfitted vectorizer for this scenario.
tfidf_vectorizer = TfidfVectorizer()


In [None]:
# Report the size of each split, one shape per line.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Learn the TF-IDF vocabulary/weights from the training split and vectorize it.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
# The test split is only transformed, using what was learned from the training data.
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Train Multinomial Naive Bayes (fit returns the fitted estimator itself).
nb = MultinomialNB().fit(x_train_tfidf, y_train)

# Predict the labels of the test split.
y_pred = nb.predict(x_test_tfidf)

In [None]:
# Evaluate the model: confusion matrix, a separator line, then the per-class report.
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('============================================================================')
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

In [None]:
# Pie chart of the correctly classified test comments (true positives vs. true
# negatives), with the absolute counts printed next to the chart.

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions. The four-way unpacking assumes
# exactly two classes; ravel() then yields the counts as tn, fp, fn, tp.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Slice captions and sizes.
labels = ['True Positive', 'True Negative']
sizes = [tp, tn]

# Draw the pie with percentage captions.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')
ax.set_title('Hasil Actual Prediction')

# Absolute counts, positioned to the right of the axes (axes-fraction coordinates).
ax.text(1.2, 0.5, f"True Positive: {tp}\nTrue Negative: {tn}", transform=ax.transAxes)

plt.show()


In [None]:
# Scenario 70:30 - hold out 30% of the rows for testing (fixed random_state so
# the split is the same on every run).
x_train, x_test, y_train, y_test = train_test_split(
    df['comment'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

# Fresh, unfitted vectorizer for this scenario.
tfidf_vectorizer = TfidfVectorizer()


In [None]:
# Report the size of each split, one shape per line.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Learn the TF-IDF vocabulary/weights from the training split and vectorize it.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
# The test split is only transformed, using what was learned from the training data.
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Train Multinomial Naive Bayes (fit returns the fitted estimator itself).
nb = MultinomialNB().fit(x_train_tfidf, y_train)

# Predict the labels of the test split.
y_pred = nb.predict(x_test_tfidf)

In [None]:
# Evaluate the model: confusion matrix, a separator line, then the per-class report.
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('============================================================================')
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

In [None]:
# Pie chart of the correctly classified test comments (true positives vs. true
# negatives), with the absolute counts printed next to the chart.

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions. The four-way unpacking assumes
# exactly two classes; ravel() then yields the counts as tn, fp, fn, tp.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Slice captions and sizes.
labels = ['True Positive', 'True Negative']
sizes = [tp, tn]

# Draw the pie with percentage captions.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')

# Title fixed: it previously read 'Hasil Hasil Actual Prediction' (duplicated
# word); it now matches the charts of the 90:10 and 80:20 scenarios.
ax.set_title('Hasil Actual Prediction')

# Absolute counts, positioned to the right of the axes (axes-fraction coordinates).
ax.text(1.2, 0.5, f"True Positive: {tp}\nTrue Negative: {tn}", transform=ax.transAxes)

plt.show()


# Visualisasi

In [None]:
# Pie chart of the label distribution (positive vs. negative comments), with the
# absolute counts printed next to the chart.

import matplotlib.pyplot as plt

# Number of rows carrying each label in the 'label' column.
positive_comments = df['label'].eq('POSITIVE').sum()
negative_comments = df['label'].eq('NEGATIVE').sum()

# Slice captions and sizes.
labels = ['Positive', 'Negative']
sizes = [positive_comments, negative_comments]

# Draw the pie with percentage captions.
fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
ax.axis('equal')
ax.set_title('Persentase Komentar Positif & Negatif')

# Absolute counts, positioned to the right of the axes (axes-fraction coordinates).
ax.text(
    1.2,
    0.5,
    f"Positive Comments: {positive_comments}\nNegative Comments: {negative_comments}",
    transform=ax.transAxes,
)

plt.show()


Visualisasi Kata yang sering muncul

In [None]:
# Rows labelled NEGATIVE. .copy() makes this an independent frame, so the column
# assignment in the next cell does not raise pandas' SettingWithCopyWarning.
train_s0 = df[df['label'] == 'NEGATIVE'].copy()

In [None]:
# Replace missing (NaN) comments with the placeholder text 'tidak ada komentar';
# the ' '.join(...) in the word-cloud cell below requires string items.
train_s0['comment'] = train_s0['comment'].fillna('tidak ada komentar')

In [None]:
from wordcloud import WordCloud

In [None]:
# Concatenate all negative comments into one text and build a word cloud from it.
all_text_s0 = ' '.join(train_s0['comment'])
wordcloud = WordCloud(background_color='white').generate(all_text_s0)

# Show the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Komentar Negatif')
plt.axis('off')
plt.margins(x=0, y=0)
plt.show()

In [None]:
# Rows labelled POSITIVE. .copy() makes this an independent frame, so assigning
# to its 'comment' column below does not raise pandas' SettingWithCopyWarning.
train_s1 = df[df['label'] == 'POSITIVE'].copy()
# Replace missing (NaN) comments with a placeholder so every item can be joined as text.
train_s1['comment'] = train_s1['comment'].fillna('tidak ada komentar')

In [None]:
# Concatenate all positive comments into one text and build a word cloud from it.
all_text_s1 = ' '.join(train_s1['comment'])
wordcloud = WordCloud(background_color='white').generate(all_text_s1)

# Show the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Komentar Positif')
plt.axis('off')
plt.margins(x=0, y=0)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter

# Grouped bar chart comparing the evaluation scores of the three split scenarios.
# NOTE(review): the scores below are typed in by hand - presumably copied from
# the classification reports above; re-check them whenever the models are re-run.
scenarios = ['Skenario 90:10', 'Skenario 80:20', 'Skenario 70:30']
accuracy = [0.76, 0.75, 0.75]
precision = [0.77, 0.75, 0.75]
recall = [0.75, 0.74, 0.73]
# Named f1_scores (not f1_score) so the list no longer shadows the
# sklearn.metrics.f1_score function imported earlier in the notebook.
f1_scores = [0.75, 0.74, 0.74]

# (legend label, values per scenario, bar colour), in drawing order.
metrics = [
    ('Accuracy', accuracy, '#ff7f0e'),
    ('Precision', precision, '#9467bd'),
    ('Recall', recall, '#ffbb78'),
    ('F1-Score', f1_scores, '#1f77b4'),
]

bar_width = 0.2
group_start = np.arange(len(scenarios))  # x position of the first bar of each group

fig, ax = plt.subplots(figsize=(12, 6))
for offset, (metric_name, values, colour) in enumerate(metrics):
    positions = group_start + offset * bar_width
    ax.bar(positions, values, color=colour, width=bar_width, label=metric_name)
    # Print each value as a percentage just above its bar.
    for x_pos, value in zip(positions, values):
        ax.text(x_pos, value, f'{value:.2%}', ha='center', va='bottom')

ax.set_xlabel('Skenario')
ax.set_ylabel('Persentase (%)')
ax.set_title('Perbandingan Evaluasi Antar Skenario')

# Centre each tick under its group of bars: the bars of a group span offsets
# 0 .. (len(metrics) - 1) * bar_width, so the middle is half of that span.
ax.set_xticks(group_start + bar_width * (len(metrics) - 1) / 2)
ax.set_xticklabels(scenarios)

# Scores are fractions in [0, 1]; show the axis ticks as percentages so they
# agree with the axis label and the bar annotations.
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

ax.legend()
fig.tight_layout()
plt.show()