In [None]:
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import time
from wordcloud import WordCloud, STOPWORDS
from IPython.core.display import display, HTML
import plotly.graph_objects as go
import re
# Natural Language Tool Kit
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
import cufflinks as cf
cf.go_offline()

In [None]:
# Fetch the training and test CSV files of the campaign-title dataset from GitHub.
train_csv_url = "https://raw.githubusercontent.com/NgakanWidyasprana/title-campaign-classification/main/Dataset/Train.csv"
test_csv_url = "https://raw.githubusercontent.com/NgakanWidyasprana/title-campaign-classification/main/Dataset/Test.csv"

train_df = pd.read_csv(train_csv_url)
test_df = pd.read_csv(test_csv_url)

In [None]:
# Preview the raw training frame: report its size, then render the first rows
# with the notebook's rich table display instead of a plain-text dump.
print(f"train_df shape: {train_df.shape}")
train_df.head()

In [None]:
# Preview the raw test frame: report its size, then render the first rows
# with the notebook's rich table display instead of a plain-text dump.
print(f"test_df shape: {test_df.shape}")
test_df.head()

In [None]:
# Stop Words

# Load the stopword list (the CSV has no header row) and give its first
# column the name 'stopword'.
id_stopword_dict = (
    pd.read_csv('https://raw.githubusercontent.com/NgakanWidyasprana/title-campaign-classification/main/Dataset/stopwordbahasa.csv', header=None)
    .rename(columns={0: 'stopword'})
)

In [None]:
# Preview the stopword table: report its size, then render the first rows
# with the notebook's rich table display instead of a plain-text dump.
print(f"id_stopword_dict shape: {id_stopword_dict.shape}")
id_stopword_dict.head()

In [None]:
def remove_stopword(text, stopwords=None):
    """Remove stopwords from a space-separated string.

    The text is split on single spaces, every token found in the stopword
    collection is dropped, and the remaining tokens are re-joined with exactly
    one space between them. Leading/trailing whitespace is stripped.

    Args:
        text: The string to filter.
        stopwords: Optional iterable of words to remove. When omitted, the
            ``stopword`` column of the notebook-level ``id_stopword_dict``
            frame is used.

    Returns:
        The filtered string.
    """
    if stopwords is None:
        stopwords = id_stopword_dict.stopword.values
    # A set gives constant-time membership tests; scanning the numpy array
    # once per word was linear in the size of the stopword list.
    stopword_set = set(stopwords)
    # Empty tokens come from runs of spaces; skipping them is equivalent to
    # collapsing repeated spaces after the join.
    kept_words = [
        word
        for word in text.split(' ')
        if word and word not in stopword_set
    ]
    return ' '.join(kept_words).strip()

In [None]:
def clean_text(text):
    """Normalise a title: letters only, lowercase, stopwords removed.

    Every character outside a-z / A-Z is replaced by a space, the result is
    lowercased, and it is then passed through ``remove_stopword``.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return remove_stopword(letters_only.lower())

In [None]:
# Run both frames through the same cleaning step so that training and test
# titles are normalised identically (the column is overwritten in place).
for frame in (train_df, test_df):
    frame['Title Campaign'] = frame['Title Campaign'].apply(clean_text)

In [None]:
# Spot-check the first five training rows after cleaning.
train_df.head(5)

In [None]:
# Spot-check the first five test rows after cleaning.
test_df.head(5)

In [None]:
!pip install PySastrawi

In [None]:
# Stemming
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build one stemmer instance up front; it is reused by every steaming() call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def steaming(text):
    """Return ``text`` after passing it through the notebook-level ``stemmer``."""
    stemmed_text = stemmer.stem(text)
    return stemmed_text

In [None]:
# Debug output: show the string form of the stemmer object created above.
print(stemmer)

In [None]:
# Stem the titles of both frames with the same stemmer (the column is
# overwritten in place).
for frame in (train_df, test_df):
    frame['Title Campaign'] = frame['Title Campaign'].apply(steaming)

In [None]:
# Spot-check the first five training rows after stemming.
train_df.head(5)

In [None]:
# Spot-check the first five test rows after stemming.
test_df.head(5)

In [None]:
# How many unique words does the text contain?

def counter_word(text):
    """Count how often each whitespace-separated word occurs.

    Args:
        text: Object exposing a ``.values`` iterable of strings.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences, with
        keys in first-seen order.
    """
    return Counter(
        token
        for sentence in text.values
        for token in sentence.split()
    )

# Count word frequencies over every title in train_df (already cleaned and stemmed).
text_values = train_df['Title Campaign']

counter = counter_word(text_values)
# len(counter) is the number of distinct words.
print(f"The len of words is: {len(counter)}")
# Peek at the first ten (word, count) pairs.
list(counter.items())[:10]

In [None]:
# Vocabulary size handed to the tokenizer and the embedding layer:
# the number of distinct words counted in the training titles.
vocab_size = len(counter)
embedding_dim = 32

# Every title is padded or truncated to this many tokens.
max_length = 20
trunc_type = 'post'
padding_type = 'post'

# Placeholder token the tokenizer substitutes for out-of-vocabulary words.
oov_tok = "<XXX>"
# Number of leading rows used for training; the remaining rows are held out
# for validation. NOTE(review): this is an 80/20 split only if Train.csv has
# exactly 100 rows — confirm.
training_size = 80
seq_len = 12  # NOTE(review): not referenced anywhere else in the visible notebook.

# Positional split: the first `training_size` rows train, the rest validate.
training_sentences = train_df['Title Campaign'].iloc[:training_size]
training_labels = train_df['Label'].iloc[:training_size]

valid_sentences = train_df['Title Campaign'].iloc[training_size:]
valid_labels = train_df['Label'].iloc[training_size:]

print('The Shape of training ',training_sentences.shape)
print('The Shape of validation',valid_sentences.shape)

In [None]:
# Fit the tokenizer on the training sentences only, so the validation and
# test titles do not contribute to the word index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
# Word -> integer id mapping built by fit_on_texts.
word_index = tokenizer.word_index

In [None]:
# Dump the complete word -> id mapping.
print(word_index)

In [None]:
# Show the first 15 entries of the word index as "word, id" pairs.
print("THe first word Index are: ")
for token, token_id in list(word_index.items())[:15]:
    print(" {},  {} ".format(token, token_id))

In [None]:
# Encode each training sentence as a list of word ids, then pad/truncate
# every list to max_length.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Compare the first training title with its encoded and padded forms.
print("Sentence : {}".format(train_df['Title Campaign'][0]))
print("Text Sequences : {}".format(training_sequences[0]))
print("Text Padded : {}".format(training_padded[0]))

In [None]:
# Encode the validation sentences with the tokenizer fitted on the training
# sentences, then pad/truncate to max_length.
valid_sequences = tokenizer.texts_to_sequences(valid_sentences)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Show the first validation sentence next to its encoded and padded forms.
# The row is selected by position rather than by the hard-coded index label
# 80, so it always matches valid_sequences[0] even if training_size changes.
print("Sentence : \n{}".format(valid_sentences.iloc[[0]]))
print("\nText Sequences : {}".format(valid_sequences[0]))
print("Text Padded : {}".format(valid_padded[0]))

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> dense -> sigmoid.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
# Single sigmoid unit because this is a binary classification.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Optional wall-clock timing (disabled): uncomment the start_time line and
# the two final_time lines below to measure how long training takes.
# start_time = time.time()

# Train on the padded training titles and evaluate on the held-out
# validation rows after every epoch.
num_epochs = 40
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(valid_padded, valid_labels))

# final_time = (time.time()- start_time)/60
# print(f'The time in minutes: {final_time}')

In [None]:
# Tabulate the metrics recorded during training (presumably one row per
# epoch) and show the first rows.
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

In [None]:
# Plot training vs. validation accuracy; title and axis labels make the
# figure readable on its own, and the trailing semicolon hides the repr.
ax = model_loss[['accuracy','val_accuracy']].plot(title='Training vs. validation accuracy')
ax.set(xlabel='Epoch', ylabel='Accuracy');

In [None]:
# Model outputs for the validation set (values of the final sigmoid unit).
predictions = model.predict(valid_padded)
predictions

In [None]:
# Encode the test titles with the tokenizer fitted on the training sentences,
# then pad/truncate to max_length.
testing_sequences2 = tokenizer.texts_to_sequences(test_df["Title Campaign"])
testing_padded2 = pad_sequences(testing_sequences2, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Compare the first test title with its encoded and padded forms.
print(test_df['Title Campaign'][0])
print(testing_sequences2[0])
print(testing_padded2[0])

In [None]:
# Model outputs for the test set (values of the final sigmoid unit).
predictions = model.predict(testing_padded2)
predictions

# Saving the Model to an .h5 File

In [None]:
# NOTE(review): '/my_model.h5' is an absolute path at the filesystem root, so
# this needs write access to '/' — confirm that is intended for the target
# environment.
model.save('/my_model.h5')

In [None]:
# Reload the saved model from disk and print its architecture to check the
# round trip.
new_model = tf.keras.models.load_model('/my_model.h5')
new_model.summary()

In [None]:
# Run one ad-hoc title through the same preprocessing as the training data:
# clean_text first, then stemming.
text = ['Bersama Menjaga Lingkungan Asri']
text_test = pd.DataFrame(text, columns=['Campaign'])

text_clean = text_test['Campaign'].apply(clean_text)
# Stem the cleaned titles (not the raw column) so the result of clean_text,
# including stopword removal, is actually used.
text_final = text_clean.apply(steaming)

print(text_final)

In [None]:
# Encode and pad the ad-hoc title with the tokenizer fitted on the training
# sentences, then show both forms.
testing_text_sequence = tokenizer.texts_to_sequences(text_final)
testing_text_padded = pad_sequences(testing_text_sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(testing_text_sequence)
print(testing_text_padded)

In [None]:
# Score the ad-hoc title with the reloaded model.
predictions = new_model.predict(testing_text_padded)
predictions

# Exporting the Model to TensorFlow.js

In [None]:
# import time
# saved_model_path = "./{}.h5".format(int(time.time()))

# model.save(saved_model_path)

In [None]:
# import tensorflow as tf
# !pip install tensorflowjs

# !tensorflowjs_converter --input_format=keras {saved_model_path} ./