In [9]:
import pandas as pd
import tensorflow as tf
import numpy as np

import json
import zipfile
import os
import random
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
!gdown --id 1MhaG0zI4Zjt4ion5QXENwY8RK3YQfFOf

Downloading...
From: https://drive.google.com/uc?id=1MhaG0zI4Zjt4ion5QXENwY8RK3YQfFOf
To: /content/Dataset for Detection of Cyber-Trolls.json.zip
  0% 0.00/686k [00:00<?, ?B/s]100% 686k/686k [00:00<00:00, 98.1MB/s]


In [13]:
def fetching_information_from_resource():
  # menentukan target file zip
  zip_file = 'Dataset for Detection of Cyber-Trolls.json.zip'
  extracting = zipfile.ZipFile(zip_file, 'r')

  # membuat directory dataset
  os.mkdir('dataset')
  dataset_dir = 'dataset'

  # extracting file zip
  extracting.extractall(dataset_dir)
  extracting.close()

  # membuat variabel untuk menampung dataset yang bertipe json
  target_file = os.path.join(dataset_dir, 'Dataset for Detection of Cyber-Trolls.json')
  json_file = pd.read_json(target_file, lines = True)

  general_datas = json_file['content']
  general_labels = []

  for index in range(len(general_datas)):
    general_labels.append(json_file['annotation'][index]['label'][0])

  indexes = list(range(len(general_labels)))

  random.shuffle(indexes)

  randomized_training_data = []
  randomized_training_label = []

  for index in indexes:
    randomized_training_data.append(general_datas[index])
    randomized_training_label.append(general_labels[index])

  return (randomized_training_data, randomized_training_label)

In [2]:
def tokenizing(datas, labels):
  VOCAB_SIZE = 1000
  tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token = '<OOV>')
  tokenizer.fit_on_texts(datas)
  word_index = tokenizer.word_index

  sequences = tokenizer.texts_to_sequences(datas)

  padded_sequences = pad_sequences(sequences, padding = 'post', maxlen = 100)

  integer_labels = [int(x) for x in labels]
  arrayed_labels = np.array(integer_labels)

  with open ('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return (padded_sequences, arrayed_labels)

In [3]:
def splitting_dataset(datas, labels, TRAINING_PORTION):
  TRAINING_PORTION = 0.9

  training_datas = datas[:round(len(datas) * TRAINING_PORTION)]
  training_labels = labels[:round(len(datas) * TRAINING_PORTION)]
  validation_datas = datas[round(len(datas) * TRAINING_PORTION) :]
  validation_labels = labels[round(len(datas) * TRAINING_PORTION) :]

  return (training_datas, training_labels, validation_datas, validation_labels)

In [4]:
def generate_confusion_matrix(actual_label, predicted_label):
  matriks = tf.math.confusion_matrix(actual_label,predicted_label)
  print(matriks)

In [16]:
def training_model(training_data, training_label, validation_data, validation_label, model_filename, model_title):
  print("--- {} ---".format(model_title))

  model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        1000,
        500,
        input_length = 100
    ),
    tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(16)
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(25, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
  ])

  model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy',tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
  )

  model.fit(
    training_data, 
    training_label, 
    epochs = 1,
    # validation_data = (validation_data, validation_label)
  )

  raw_predicted_label = model.predict(validation_data)
  predicted_label = []
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
  for label in raw_predicted_label:
    if label > 0.5:
      result = 1
    else:
      result = 0
    predicted_label.append(result)

  generate_confusion_matrix(validation_label, predicted_label)

  saved_model_path = './model/'+model_filename
  model.save(saved_model_path)

  return model

In [17]:
def undersampling_dataset(training_datas, training_labels):
  # cari kelas mayoritas
  true_count = 0
  false_count = 0

  for index in range(len(training_datas)):
    if training_labels[index] == 0:
      false_count += 1
      training_labels[index] = 0
    elif training_labels[index] == 1:
      true_count += 1
      training_labels[index] = 1

    if true_count > false_count:
      majority_class = 1
    else:
      majority_class = 0

    # masukin index data yang kelas mayoritas ke majority_indexes, sekaligus inisialisasi balanced datas dan labels
  majority_indexes = []
  balanced_datas = []
  balanced_labels = []

  for index in range(len(training_datas)):
    if training_labels[index] == majority_class:
      majority_indexes.append(index)
    else:
      # balanced datas dan balanced labels akan diisi sama kelas minoritas dulu
      balanced_datas.append(training_datas[index])
      balanced_labels.append(training_labels[index])

  # randmon index dari kelas majoritas, agar yang di undersampling data acak (tidak berurutan)
  random.shuffle(majority_indexes)

  # memasukan data dari kelas mayoritas ke balanced_datas dan balanced_labels dengan banyak data setara dengan data minoritas
  balancing_point = len(balanced_datas)

  for index in range(balancing_point):
    majority_index = majority_indexes[index]

    balanced_datas.append(training_datas[majority_index])
    balanced_labels.append(training_labels[majority_index])

  # random data yang sudah balance
  indexes = list(range(len(balanced_datas)))

  random.shuffle(indexes)

  randomized_training_data = []
  randomized_training_label = []

  for index in indexes:
    randomized_training_data.append(balanced_datas[index])
    randomized_training_label.append(balanced_labels[index])

  return (np.array(randomized_training_data), np.array(randomized_training_label))

In [20]:
# (general_datas, general_labels) = fetching_information_from_resource()

(tokenized_datas, arrayed_labels) = tokenizing(general_datas, general_labels)

(
    training_datas, 
    training_labels, 
    validation_datas, 
    validation_labels
) = splitting_dataset(tokenized_datas, arrayed_labels, 0.9)
unbalanced_model = training_model(training_datas, training_labels, validation_datas, validation_labels, 'unbalanced_model.h5', 'UNBALANCED MODEL')

(balanced_datas, balanced_labels) = undersampling_dataset(tokenized_datas, arrayed_labels)

balanced_model = training_model(balanced_datas, balanced_labels, validation_datas, validation_labels, 'balanced_model.h5', 'BALANCED MODEL')

--- UNBALANCED MODEL ---
tf.Tensor(
[[906 319]
 [191 584]], shape=(2, 2), dtype=int32)
--- BALANCED MODEL ---
tf.Tensor(
[[720 505]
 [ 72 703]], shape=(2, 2), dtype=int32)
