Nama Anggota :

23520001 - Muhammad Ulfi

23520015 - Kevin Muharyman A


---


# Analisis Kepuasan Rakyat Terhadap Kinerja Pemerintah Selama Pandemi Covid-19

## Preprocessing

### Import Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
pip install PySastrawi

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/DWDM/TB/covid-sentiment.csv')
dftranslated = pd.read_csv('/content/drive/MyDrive/DWDM/TB/TRANSLATED-covid-sentiment.csv')

In [None]:
df.head()

### Case Folding

In [None]:
df['tweet'] = df['tweet'].str.lower()

In [None]:
df['tweet'].head()

### Tokenizing

In [None]:
import string
import re #untuk regular expression

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

In [None]:
# Remove unnecessary characters
def remove_tweet_special(text):
    """Return *text* with tweet-specific noise removed.

    Steps, in order:
    1. replace the literal two-character sequences backslash+t, backslash+n
       and backslash+u with a space, then delete any remaining backslash;
    2. replace every non-ASCII character with "?";
    3. replace mentions, hashtags and full URLs with a space and collapse
       the remaining whitespace;
    4. replace any leftover bare "http://" / "https://" prefix with a space.
    """
    # Remove escaped tab, new line and unicode markers, then stray backslashes
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Replace non-ASCII characters (emoticons, Chinese words, etc.) with "?"
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, links and hashtags. The pattern is a raw string so that
    # "\w" and "\S" reach the regex engine instead of being (invalid) Python
    # string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove incomplete URLs (a scheme with nothing after it)
    return text.replace("http://", " ").replace("https://", " ")
              
# Remove numbers
def remove_number(text):
    """Return *text* with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Remove punctuation
def remove_punctuation(text):
    """Return *text* without any character listed in string.punctuation."""
    # Mapping a code point to None makes str.translate delete that character.
    deletion_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletion_table)

# Remove leading and trailing whitespace
def remove_whitespace_LT(text):
    """Return *text* with whitespace trimmed from both ends."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Collapse multiple whitespace characters into a single space
def remove_whitespace_multiple(text):
    """Return *text* with every run of whitespace replaced by one space."""
    # Raw string: "\s" must reach the regex engine, not be parsed as an
    # (invalid) Python string escape.
    return re.sub(r'\s+', ' ', text)

# Remove single-letter words
def remove_singl_char(text):
    """Return *text* with every stand-alone ASCII letter deleted.

    Only the letter itself is removed; the whitespace around it is kept.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Return the tokens that NLTK's word_tokenize produces for *text*."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Apply the cleaning steps to the tweet column one after another; the order
# of the tuple is the order in which the steps run.
cleaning_steps = (
    remove_tweet_special,
    remove_number,
    remove_punctuation,
    remove_whitespace_LT,
    remove_whitespace_multiple,
    remove_singl_char,
)
for cleaning_step in cleaning_steps:
    df['tweet'] = df['tweet'].apply(cleaning_step)

In [None]:
df['tokenized_tweet'] = df['tweet'].apply(word_tokenize_wrapper)

In [None]:
df[['tweet','tokenized_tweet']].head()

In [None]:
# Compute an NLTK frequency distribution per tweet (may be used for TF-IDF later; still exploring).
# TF-IDF reference: https://medium.com/@yunusmuhammad007/tf-idf-term-frequency-inverse-document-frequency-representasi-vector-data-text-2a4eff56cda
# Plan: compute TF-IDF separately for positive and negative words and label each tweet
# by whichever total is larger (still a theory, not tried yet).
def freqDist_wrapper(text):
    """Return an NLTK FreqDist built from *text* (called below with a token list)."""
    return FreqDist(text)

# One FreqDist per row, built from that row's token list
df['tokenized_freq'] = df['tokenized_tweet'].apply(freqDist_wrapper)


In [None]:
print(df['tokenized_freq'].head().apply(lambda x : x.most_common()))

### Stopword Removal

In [None]:
# Take an explicit copy of the selected columns: new columns are assigned to
# tweet_data afterwards, and assigning to a slice of df would trigger pandas'
# SettingWithCopyWarning.
tweet_data = df[['tweet', 'tokenized_tweet', 'tokenized_freq']].copy()

In [None]:
tweet_data.head()

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# Indonesian stopwords from the NLTK library
list_stopwords = stopwords.words('indonesian')

# append additional stopwords
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# stopwords from file, read into a single column named "stopwords"
txt_stopword = pd.read_csv("/content/drive/MyDrive/DWDM/TB/stopwords.txt", names= ["stopwords"], header = None)

# add the stopwords from the file: only the first row is used, split on spaces
# NOTE(review): presumably the file holds all words on one line - confirm
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# convert the list to a set (duplicates such as the repeated 'rt' collapse)
list_stopwords = set(list_stopwords)

In [None]:
# Remove stopwords from a token list
def stopwords_removal(words):
    """Return the tokens of *words* that are not in list_stopwords, in order."""
    kept_tokens = []
    for token in words:
        if token in list_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
tweet_data['stopword_applied'] = tweet_data['tokenized_tweet'].apply(stopwords_removal)

In [None]:
tweet_data[['tokenized_tweet', 'stopword_applied']].head()

### Normalisasi

In [None]:
# mengganti kata alay ke kata standar baku
# list slang dapat dari : https://medium.com/@arie.pratama.s/bahasa-indonesia-open-sourced-nlp-resources-8cb394193238

kamusalay = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')

In [None]:
kamusalay[['slang', 'formal']].head()

In [None]:
# Map each slang term (first column of kamusalay) to its formal form (second
# column). Columns are selected positionally with .iloc and iterated with zip:
# this avoids integer lookups on a label-indexed row Series (deprecated in
# pandas 2.x) and the per-row overhead of iterrows().
katabaku_dict = {}

for slang_term, formal_term in zip(kamusalay.iloc[:, 0], kamusalay.iloc[:, 1]):
    # setdefault keeps the first formal form seen for a repeated slang term
    katabaku_dict.setdefault(slang_term, formal_term)

def normalized_term(document):
    """Return *document* with every term found in katabaku_dict replaced by
    its mapped value; terms without an entry are kept unchanged."""
    normalized = []
    for term in document:
        normalized.append(katabaku_dict.get(term, term))
    return normalized


In [None]:
tweet_data['baku_tweet'] = tweet_data['stopword_applied'].apply(normalized_term)

In [None]:
tweet_data[['tokenized_tweet', 'stopword_applied', 'baku_tweet']].head()

### Stemming

In [None]:
pip install swifter

In [None]:
# import Sastrawi package
# pake swifter biar eksekusinya cepat

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

# Stem every distinct term exactly once and cache the result in term_dict.
# dict.fromkeys de-duplicates while keeping the first-seen order of the terms.
distinct_terms = dict.fromkeys(
    term for document in tweet_data['baku_tweet'] for term in document
)
term_dict = {term: stemmed_wrapper(term) for term in distinct_terms}
    
# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

tweet_data['stemmed_tweet'] = tweet_data['baku_tweet'].swifter.apply(get_stemmed_term)

In [None]:
tweet_data['stemmed_tweet'].head()

In [None]:
# save dataset sementara
tweet_data.to_csv("Text_Preprocessing_notinclude_tfidf.csv")

### TF-IDF

Sumber referensi TF-IDF manual 

https://medium.com/@yunusmuhammad007/tf-idf-term-frequency-inverse-document-frequency-representasi-vector-data-text-2a4eff56cda

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DWDM/TB/Text_Preprocessing_notinclude_tfidf.csv")

In [None]:
df.head()

In [None]:
import ast

def convert_text_list(texts):
    """Parse the string form of a Python literal (here a list of tokens read
    back from CSV) and return its items as a new list."""
    parsed = ast.literal_eval(texts)
    return list(parsed)

df['tweet_list'] = df['stemmed_tweet'].apply(convert_text_list)

In [None]:
def calc_TF(document):
    """Return the term frequency of every term in *document*.

    The result maps each distinct term to (occurrences of the term) divided by
    (total number of terms). An empty document yields an empty dict.
    """
    # Count how many times each term appears
    occurrences = {}
    for term in document:
        occurrences[term] = occurrences.get(term, 0) + 1
    # Normalise the counts by the document length
    n_terms = len(document)
    return {term: count / n_terms for term, count in occurrences.items()}


In [None]:
df["TF_dict"] = df['tweet_list'].apply(calc_TF)

df["TF_dict"].head()

In [None]:
def calc_DF(tfDict):
    """Return the document frequency of every term.

    *tfDict* is an iterable of per-document term dictionaries; the result maps
    each term to the number of those dictionaries that contain it.
    """
    doc_frequency = {}
    for tf_entry in tfDict:
        # Iterating a dict yields each term once, so each document adds 1
        for term in tf_entry:
            doc_frequency[term] = doc_frequency.get(term, 0) + 1
    return doc_frequency


In [None]:
DFresult = calc_DF(df["TF_dict"])

In [None]:
# cacl idf
n_document = len(df)

def calc_IDF(__n_document, __DF):
    """Return the inverse document frequency of every term in *__DF*.

    Each term maps to log(__n_document / (document frequency + 1)), where log
    is the natural logarithm.
    """
    return {
        term: np.log(__n_document / (doc_count + 1))
        for term, doc_count in __DF.items()
    }


In [None]:
#Stores the idf dictionary
IDFresult = calc_IDF(n_document, DFresult)

In [None]:
# Compute TF-IDF
def calc_TF_IDF(TF):
    """Return the TF-IDF weight of every term in *TF*: the term's TF value
    multiplied by its entry in the global IDFresult."""
    return {term: tf_value * IDFresult[term] for term, tf_value in TF.items()}


In [None]:
df["TF-IDF_dict"] = df["TF_dict"].apply(calc_TF_IDF)

In [None]:
# Load the sentiment lexicons.
# header=None keeps the first line of each file as data. With the default
# header handling the first word of each file ('a+' / 'abnormal') became the
# column name and was therefore missing from the word list.
# NOTE(review): assumes one word per line with no header row - confirm.
positive_word = pd.read_csv('/content/drive/MyDrive/DWDM/TB/positive.txt', header=None, names=['word'])
negative_word = pd.read_csv('/content/drive/MyDrive/DWDM/TB/negative.txt', header=None, names=['word'])
# Sets make the per-term `in` checks in the scoring functions O(1) instead of
# scanning a list for every term of every tweet.
kata_positive = set(positive_word['word'])
kata_negative = set(negative_word['word'])

In [None]:
# Calculate the total TF-IDF of the positive words
def calc_positive(TF_IDF_Dict):
    """Return the sum of the TF-IDF values whose term is in kata_positive
    (0 when no term matches)."""
    return sum(
        weight for term, weight in TF_IDF_Dict.items() if term in kata_positive
    )


In [None]:
# Calculate the total TF-IDF of the negative words
def calc_negative(TF_IDF_Dict):
    """Return the sum of the TF-IDF values whose term is in kata_negative
    (0 when no term matches)."""
    return sum(
        weight for term, weight in TF_IDF_Dict.items() if term in kata_negative
    )

In [None]:
df["TF-IDF_positive"] = df["TF-IDF_dict"].apply(calc_positive)
df["TF-IDF_negative"] = df["TF-IDF_dict"].apply(calc_negative)

In [None]:
# Label each tweet by comparing its positive and negative TF-IDF totals.
conditions = [
    (df['TF-IDF_positive'] < df['TF-IDF_negative']),
    (df['TF-IDF_positive'] > df['TF-IDF_negative']),
    (df['TF-IDF_positive'] == df['TF-IDF_negative'])]
choices = ['negative','positive','neutral']

# Pass an explicit string default. np.select's implicit default is the integer
# 0, which cannot be promoted to a common dtype with string choices on recent
# NumPy releases (TypeError) and would otherwise produce a stray label for any
# row that matches no condition (e.g. NaN totals).
df['status'] = np.select(conditions, choices, default='neutral')

In [None]:
df.head()

In [None]:
df.to_csv("Text_Preprocessing_complete.csv")

### Pembagian Dataset dan Penanganan Imbalance Dataset

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DWDM/TB/Text_Preprocessing_complete.csv")

In [None]:
# Vectorizing is the process to convert tokens to numbers. It is an important step because the machine learning algorithm works with numbers and not text.

from sklearn.feature_extraction.text import TfidfVectorizer

def get_feature_vector(train_fit):
    """Return a TfidfVectorizer (sublinear_tf=True) fitted on *train_fit*."""
    # fit() returns the fitted vectorizer itself
    return TfidfVectorizer(sublinear_tf=True).fit(train_fit)


In [None]:
# Select the columns by name instead of by position (previously iloc 2 and
# 13), so the cell keeps working if the column layout of the CSV changes.
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split, so the test tweets influence the vocabulary and IDF weights.
tweet_texts = df['tweet'].to_numpy()
tf_vector = get_feature_vector(tweet_texts)
X = tf_vector.transform(tweet_texts)
y = df['status'].to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# x = df['tweet']
# y = df['status']

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
df.status.value_counts()

Berdasarkan persebaran status pada dataset ini, dapat dilihat bahwa jumlah tweet positif dan negatif cukup seimbang, tetapi jumlah tweet netral berbeda cukup jauh. Oleh karena itu, di sini kami mencoba melakukan downsampling.

In [None]:

# While fitting, this error appeared: "A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array."

# x_train.toarray() # would fix it, but the session crashes when this runs because the sparse matrix is too large

In [None]:
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop samples from the larger classes until all classes have the
# same size.
rus = RandomUnderSampler(random_state=0)
# fit_resample is the supported imbalanced-learn API; the older fit_sample
# alias was deprecated and later removed.
x_train_res, y_train_res = rus.fit_resample(x_train, y_train)
x_test_res, y_test_res = rus.fit_resample(x_test, y_test)

In [None]:
print('After UnderSampling, the shape of train_X: {}'.format(x_train_res.shape)) 
print('After UnderSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After UnderSampling, counts of label 'positive': {}".format(sum(y_train_res == 'positive'))) 
print("After UnderSamplimg, counts of label 'negative': {}".format(sum(y_train_res == 'negative'))) 
print("After UnderSampling, counts of label 'neutral': {}".format(sum(y_train_res == 'neutral'))) 

In [None]:
print("Setelah UnderSampling, jumlah dari label 'positive': {}".format(sum(y_train_res == 'positive'))) 
print("Setelah UnderSamplimg, jumlah dari label 'negative': {}".format(sum(y_train_res == 'negative'))) 
print("Setelah UnderSampling, jumlah dari label 'neutral': {}".format(sum(y_train_res == 'neutral'))) 

## Implementasi

### Logistic Regression

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report,confusion_matrix

#### Inisiasi Model

Pada bagian awal, dilakukan pemodelan dengan bentuk inisialisasi tanpa pengaturan sama sekali.

In [None]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression()

In [None]:
scoreLR = cross_val_score(modelLR, x_train, y_train, cv=5)

Pada cross validation, model memperoleh skor tertinggi sebesar 87.5% dengan nilai rata-rata 86.9%.

In [None]:
print(scoreLR)
print(np.mean(scoreLR))

In [None]:
modelLR.fit(x_train,y_train)
y_predLR_init = modelLR.predict(x_test)

In [None]:
print(confusion_matrix(y_test,y_predLR_init),"\n")
print(classification_report(y_test,y_predLR_init))

Pada pemodelan awal ini, dapat dilihat bahwa model ini memiliki nilai akurasi sebesar 89%

#### Tuning Hyperparameter Menggunakan Balance Dataset

Di sini kami menggunakan solver sag dan saga karena kedua solver ini mendukung pemrosesan dataset berskala besar. Kami juga melakukan tuning parameter penalty untuk kedua solver tersebut.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
parameters = [
  {'penalty': ['l1', 'l2', 'none'], 'solver': ['saga']},
  {'penalty': ['l2', 'none'], 'solver': ['sag']},
 ]

In [None]:
modelLR1 = GridSearchCV(modelLR, parameters, cv=5)
modelLR1.fit(x_train_res, y_train_res)

In [None]:
print("best parameters: {}".format(modelLR1.best_params_))
print("best score:      {:0.5f}".format(modelLR1.best_score_))

Dari hasil tersebut dapat dilihat bahwa parameter terbaik dengan menggunakan nilai penalty = none dan solver = saga dengan skor sebesar 0.89

#### Pengujian Model

In [None]:
modelLR_final = LogisticRegression(solver = "saga", penalty="none")
modelLR_final.fit(x_train,y_train)
y_predLR = modelLR_final.predict(x_test)

In [None]:
print("Hasil pemodelan dengan parameter terbaik")
print(confusion_matrix(y_test,y_predLR),"\n")
print(classification_report(y_test,y_predLR))

In [None]:
print("Hasil pemodelan dengan tanpa parameter")
print(confusion_matrix(y_test,y_predLR_init),"\n")
print(classification_report(y_test,y_predLR_init))

Dari perbandingan antara model awal dengan model final, terdapat peningkatan yang cukup signifikan secara performa. Model final memiliki skor pada setiap aspek yang lebih baik daripada model awal. Model final membuat precision, recall, dan f1-score meningkat menjadi 92% dari skor awal yang berkisar antara 88-89%

## LSTM

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools
import pandas as pd

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DWDM/TB/Text_Preprocessing_complete.csv")

In [None]:
df.head()

In [None]:
import ast

def convert_text_list(texts):
    """Parse the string form of a Python literal (here a list of tokens read
    back from CSV) and return its items as a new list."""
    parsed = ast.literal_eval(texts)
    return list(parsed)

df['tweet_list'] = df['tweet_list'].apply(convert_text_list)
df['tweet_list'] = df['tweet_list'].str.join(" ")

In [None]:
# Build a two-column frame (tweet, status) from at most 10000 positive rows,
# at most 10000 negative rows and all neutral rows, in that order.
zz = df[["tweet_list", "status"]]
k = zz["status"] == "positive"
l = zz["status"] == "negative"
m = zz["status"] == "neutral"

a = zz[k][:10000].values
b = zz[l][:10000].values
c = zz[m].values

# Flatten the three groups into one list of [tweet, status] pairs
d = [[row[0], row[1]] for group in (a, b, c) for row in group]

bb = pd.DataFrame(d, columns=["tweet", "status"])

In [None]:
X = df["tweet_list"].tolist()
y = df["status"]
print(y.value_counts())
le = LabelEncoder()
le.fit(y.tolist())
y = le.transform(y.tolist())

In [None]:
print(X[:5])
print(y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
y_train= encoder.fit_transform(y_train)

In [None]:
documents = [_tweet.split() for _tweet in X_train]

In [None]:
w2v_model = gensim.models.word2vec.Word2Vec(size=500, 
                                            window=7, 
                                            min_count=10, 
                                            workers=8)

In [None]:
w2v_model.build_vocab(documents) 

In [None]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
w2v_model.train(documents, total_examples=len(documents), epochs=5)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=500)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=500)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
embedding_matrix = np.zeros((vocab_size, 500))
for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

In [None]:
embedding_layer = Embedding(vocab_size, 500, weights=[embedding_matrix], input_length=500, trainable=False)

In [None]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.summary()

In [None]:
model = Sequential()
model.add(embedding_layer)
# model.add(Dropout(0.5))
# model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2,return_sequences=True,))
# model.add(Dropout(0.3))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2,return_sequences=True,))
# model.add(Dropout(0.5))
model.add(LSTM(8, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.summary()

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
# Hold out 10% of the training data for validation. The callbacks monitor
# 'val_loss' and 'val_accuracy', which Keras only reports when validation data
# is available; without it neither callback can ever trigger.
history = model.fit(X_train, y_train,
                    epochs=50,
                    batch_size=256,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
y_pred = model.predict(X_test, verbose=1)

In [None]:
# Predicted class = index of the highest probability in each row.
# np.argmax returns exactly one index per row (the first one on ties), so z
# always has one label per test sample; the previous nested loop appended two
# labels for a row whose maximum occurred twice.
z = np.argmax(y_pred, axis=1).tolist()

In [None]:
print(classification_report(y_test, z))

       precision    recall  f1-score   support

    negative       0.74      0.75      0.75      1933
     neutral       0.77      0.86      0.81      1898
    positive       0.81      0.71      0.76      1877

    accuracy                           0.77      5708
   macro avg       0.77      0.77      0.77      5708
weighted avg       0.77      0.77      0.77      5708


## Logistic Regression Versi 2 Menggunakan TF IDF

In [None]:
# Select the columns by name instead of by position (previously iloc 8 and
# 13), matching how the LSTM section reads the same two columns.
X = df['tweet_list']
y = df['status']

# Encode the string labels as integers
le = LabelEncoder()
y = le.fit_transform(y.tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Keep the TF-IDF matrix sparse: train_test_split, LogisticRegression and
# RandomForestClassifier all accept sparse input, while .toarray() would
# allocate a dense (n_tweets x 5000) float matrix.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the test tweets influence the vocabulary and IDF weights.
td = TfidfVectorizer(max_features = 5000)
X = td.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
modelLR = LogisticRegression()

In [None]:
scoreLR = cross_val_score(modelLR, X_train, y_train, cv=5)

In [None]:
print(scoreLR)
print("akurasi rata-rata : ",np.mean(scoreLR))

## Random Forest Versi 2 Menggunakan TF IDF

In [None]:
# RANDOM FOREST
modelRF = RandomForestClassifier(random_state=0)

In [None]:
scoreRF = cross_val_score(modelRF, X_train, y_train, cv=5)

In [None]:
print(scoreRF)
print("akurasi rata-rata : ",np.mean(scoreRF))

In [None]:
modelRF.fit(X_train,y_train)
y_predRF_init = modelRF.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_predRF_init),"\n")
print(classification_report(y_test,y_predRF_init))

## Lessons Learned, Insights, and Summary