In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [2]:
# GET the data
## Memuat Data Dataset
df = pd.read_csv("Sample_data.csv", usecols=["Isi_Tweet", "Sentimen"])

In [3]:
# EXPLORE the data
## Menampilkan lima data pertama
df.head()

Unnamed: 0,Isi_Tweet,Sentimen
0,"Biusnya habis ! RT""@eddies_song: Dahlan Iskan ...",-1
1,"Presiden Prabowo ,Presiden Terakhir Indonesia",1
2,@republikaonline masa capres prabowo bergitu b...,-1
3,"Kalo kata bapak capres ARB, kita harus ""berani...",1
4,"RT @DhafaRizky_: Najis,org gila doang yg dukun...",-1


In [4]:
## Menampilkan Ukuran Dataset
df.shape

(1885, 2)

In [5]:
## Memeriksa Missing Values
print(df.isnull().any())

Isi_Tweet    False
Sentimen     False
dtype: bool


In [6]:
# MODEL the data
## Pra Pengolahan - Pembersihan
stopwords = pd.read_csv("stopwords_id.csv")
stopwords = np.append(stopwords, "rt")

def clean_text(tweet):
    
    # Convert to lower case
    tweet = tweet.lower()
    
    # Clean www.* or https?://*
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    # Clean @username
    tweet = re.sub('@[^\s]+','',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    
    # Clean per Words
    words = tweet.split()
    tokens=[]
    for ww in words:
        #split repeated word
        for w in re.split(r'[-/\s]\s*', ww):
            #replace two or more with two occurrences
            pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
            w = pattern.sub(r"\1\1", w)
            #strip punctuation
            w = w.strip('\'"?,.')
            #check if the word cosists of two or more alphabets
            val = re.search(r"^[a-zA-Z][a-zA-Z][a-zA-Z]*$", w)
            #add tokens
            if(w in stopwords or val is None):
                continue
            else:
                tokens.append(w.lower())
    
    tweet = " ".join(tokens)
    return tweet


df["Isi_Tweet"] = df['Isi_Tweet'].map(lambda x: clean_text(x))
df = df[df['Isi_Tweet'].apply(lambda x: len(x.split()) >=2)]
df.head()

Unnamed: 0,Isi_Tweet,Sentimen
0,biusnya habis dahlan iskan gak jls capres dahl...,-1
1,presiden prabowo presiden indonesia,1
2,capres prabowo bergitu bodoh ngecap uang ngak ...,-1
3,kalo capres arb berani berfikir berani bermimp...,1
4,gila doang yg dukung jdi lbih,-1


In [7]:
## Pra Pengolahan - Tokenisasi dan Sequence 
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Isi_Tweet'])
sequences = tokenizer.texts_to_sequences(df['Isi_Tweet'])
tweets = pad_sequences(sequences, maxlen=50, padding='post')
labels = np.array((df['Sentimen']))

In [8]:
#trying GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


In [9]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(Conv1D(num_filters, kernel_size=kernel_size, activation=tf.nn.relu))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256, activation=tf.nn.relu))
    model.add(Dense(1, activation=tf.nn.sigmoid))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [10]:
import time
np.random.seed(1)
start = time. time()
vocab_size = len(tokenizer.word_index) + 1
sequence_length = tweets.shape[1]
tweets_train, tweets_test, labels_train, labels_test = train_test_split(
    tweets, labels, test_size=0.2)

param_grid = dict(num_filters=[100,200],
                      kernel_size=[3, 4,5,6,7],
                      vocab_size=[vocab_size],
                      embedding_dim=[100],
                      maxlen=[sequence_length])
model = KerasClassifier(build_fn=create_model,
                            epochs=10, batch_size=10,
                            verbose=False)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                              cv=5, verbose=1)
grid_result = grid.fit(tweets_train,labels_train)

# Evaluate testing set
test_accuracy = grid.score(tweets_test, labels_test)
print("Model Terbaik: %s" % grid.best_params_)
print("Nilai akurasi train: %s" % grid.best_score_)
print("Nilai akurasi test: %s" %test_accuracy)

end = time. time()
print(end - start)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           343800    
_________________________________________________________________
conv1d (Conv1D)              (None, 48, 100)           30100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               25856     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 400,013
Trainable params: 400,013
Non-trainable params: 0
_________________________________________________________________


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           343800    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 100)           30100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               25856     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 400,013
Trainable params: 400,013
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 47, 100)           40100     
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 100)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 256)               25856     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 257       
Total params: 410,013
Trainable params: 410,013
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 47, 200)           80200     
_________________________________________________________________
global_max_pooling1d_19 (Glo (None, 200)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 256)               51456     
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 257       
Total params: 475,713
Trainable params: 475,713
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 46, 200)           100200    
_________________________________________________________________
global_max_pooling1d_28 (Glo (None, 200)               0         
_________________________________________________________________
dense_56 (Dense)             (None, 256)               51456     
_________________________________________________________________
dense_57 (Dense)             (None, 1)                 257       
Total params: 495,713
Trainable params: 495,713
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_37 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 45, 200)           120200    
_________________________________________________________________
global_max_pooling1d_37 (Glo (None, 200)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 256)               51456     
_________________________________________________________________
dense_75 (Dense)             (None, 1)                 257       
Total params: 515,713
Trainable params: 515,713
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_46 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_46 (Conv1D)           (None, 44, 200)           140200    
_________________________________________________________________
global_max_pooling1d_46 (Glo (None, 200)               0         
_________________________________________________________________
dense_92 (Dense)             (None, 256)               51456     
_________________________________________________________________
dense_93 (Dense)             (None, 1)                 257       
Total params: 535,713
Trainable params: 535,713
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embe

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 170.7min finished


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_50 (Embedding)     (None, 50, 100)           343800    
_________________________________________________________________
conv1d_50 (Conv1D)           (None, 47, 200)           80200     
_________________________________________________________________
global_max_pooling1d_50 (Glo (None, 200)               0         
_________________________________________________________________
dense_100 (Dense)            (None, 256)               51456     
_________________________________________________________________
dense_101 (Dense)            (None, 1)                 257       
Total params: 475,713
Trainable params: 475,713
Non-trainable params: 0
_________________________________________________________________
10511.909377098083


In [11]:
test_accuracy = grid.score(tweets_test, labels_test)
print(test_accuracy)

grid.best_params_

0.880636596078898


{'embedding_dim': 100,
 'kernel_size': 4,
 'maxlen': 50,
 'num_filters': 200,
 'vocab_size': 3438}

In [12]:
tweet_new = ["apalagi kalau yg jadi presiden dia .. tak dpt di bayangkan gmn jadinya negri ini"]

## Pra Pengolahan Data Baru
sequences = tokenizer.texts_to_sequences(tweet_new)
tweet_new = pad_sequences(sequences, maxlen=50, padding='post')

# Deteksi Sentimen Data Baru
print(grid.predict(tweet_new))

[[1]]
