<a href="https://colab.research.google.com/github/HighkalW/CapstoneProject/blob/toxicity_classifier/Toxicity_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Creation for Toxicity Classifier

## Downloading Datasets from Kaggle

In [1]:
# Mount Google Drive at /gdrive so files stored there (such as the Kaggle
# config directory set up in the next cell) are reachable from this session.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
# Import OS for navigation and environment setup
import os
# Show the current working directory; on Colab this is '/content'
print(os.getcwd())
# Enable the Kaggle environment: point KAGGLE_CONFIG_DIR at the directory that
# stores your Kaggle API JSON. Drive is mounted at '/gdrive', and an absolute
# path is used so the setting does not depend on the current working directory.
os.environ['KAGGLE_CONFIG_DIR'] = '/gdrive/MyDrive/Kaggle'

/content


In [3]:
!kaggle datasets download -d ilhamfp31/indonesian-abusive-and-hate-speech-twitter-text
!kaggle datasets download -d oswinrh/indonesian-stoplist

!unzip 'indonesian-abusive-and-hate-speech-twitter-text.zip' -d 'abusive_text_dataset'
!unzip 'indonesian-stoplist.zip' -d 'stoplist_dataset'

Downloading indonesian-abusive-and-hate-speech-twitter-text.zip to /content
  0% 0.00/814k [00:00<?, ?B/s]
100% 814k/814k [00:00<00:00, 63.2MB/s]
Downloading indonesian-stoplist.zip to /content
  0% 0.00/2.27k [00:00<?, ?B/s]
100% 2.27k/2.27k [00:00<00:00, 2.40MB/s]
Archive:  indonesian-abusive-and-hate-speech-twitter-text.zip
  inflating: abusive_text_dataset/README.md  
  inflating: abusive_text_dataset/abusive.csv  
  inflating: abusive_text_dataset/citation.bib  
  inflating: abusive_text_dataset/data.csv  
  inflating: abusive_text_dataset/new_kamusalay.csv  
Archive:  indonesian-stoplist.zip
  inflating: stoplist_dataset/stopwordbahasa.csv  


## Importing Datasets Using Pandas

Note: the abusive sentences dataset (`data.csv`) is the training data.

In [4]:
import pandas as pd

# Locations of the extracted CSV files.
stopwords_path = '/content/stoplist_dataset/stopwordbahasa.csv'
abusive_words_path = '/content/abusive_text_dataset/abusive.csv'
alay_words_path = '/content/abusive_text_dataset/new_kamusalay.csv'
abusive_sentences_path = '/content/abusive_text_dataset/data.csv'

# Stopword list: the file has no header row, so name its column explicitly.
df_stopwords = (
    pd.read_csv(stopwords_path, header=None)
    .rename(columns={0: 'stopword'})
)

# Abusive word list.
df_abusive = pd.read_csv(abusive_words_path, encoding='latin-1')

# Alay (slang) dictionary: headerless two-column file mapping a slang spelling
# to its replacement, kept both as a DataFrame and as a plain lookup dict.
df_alay = (
    pd.read_csv(alay_words_path, encoding='latin-1', header=None)
    .rename(columns={0: 'original', 1: 'replacement'})
)
df_alay_map = {
    slang: formal
    for slang, formal in zip(df_alay['original'], df_alay['replacement'])
}

# Labelled tweets.
df_abusive_sentences = pd.read_csv(abusive_sentences_path, encoding='latin-1')

### Checking the structure of each DataFrame

In [5]:
# Preview the first rows of the stopword list (single 'stopword' column).
df_stopwords.head()

Unnamed: 0,stopword
0,ada
1,adalah
2,adanya
3,adapun
4,agak


In [6]:
# Preview the first rows of the abusive word list.
df_abusive.head()

Unnamed: 0,ABUSIVE
0,alay
1,ampas
2,buta
3,keparat
4,anjing


In [7]:
# Preview the first rows of the alay dictionary ('original' -> 'replacement').
df_alay.head()

Unnamed: 0,original,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [8]:
# Preview the first rows of the labelled tweets: the 'Tweet' text plus the
# HS / Abusive label columns.
df_abusive_sentences.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


## Data Preprocessing

In [9]:
import re

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip Twitter noise from a raw tweet.

    Removes URLs, line breaks, literal ``\\xNN`` escape sequences, @-mentions
    and the standalone tokens ``RT`` / ``USER`` / ``URL`` (any letter case),
    then collapses runs of spaces and shortens any character repeated three
    or more times down to two.

    Parameters
    ----------
    text : str
        Raw tweet text. Case is not changed here.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    # URLs: anything starting with www. or http(s):// up to the next whitespace.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    # Line breaks become plain spaces.
    text = re.sub(r'[\n\r]', ' ', text)
    # Literal "\xNN" escape sequences (emoji bytes written out as text). Done
    # before the token removal so a token glued to one (e.g. "\x9fUSER") still
    # ends up on a word boundary.
    text = re.sub(r'\\x..', ' ', text)
    # @-mentions, including one at the very end of the text.
    text = re.sub(r'@\S+', ' ', text)
    # Retweet marker and anonymisation tokens. \b restricts the match to
    # whole tokens so ordinary words that merely contain "rt", "user" or
    # "url" (e.g. "partai", "pertama") are left intact.
    text = re.sub(r'(?i)\b(?:rt|user|url)\b', ' ', text)
    # Collapse runs of spaces left behind by the removals above.
    text = re.sub(r'  +', ' ', text)
    # Characters repeated three or more times are reduced to two.
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)

    return text
    
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z, A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

def normalize_alay(text, alay_map=None):
    """Replace alay (slang) words in ``text`` with their normalised form.

    The text is split on single spaces; every token found in the mapping is
    replaced by its mapped value, all other tokens are kept unchanged, and the
    tokens are joined back with single spaces.

    Parameters
    ----------
    text : str
        Text to normalise.
    alay_map : mapping, optional
        Slang word -> replacement. Defaults to the module-level
        ``df_alay_map``.

    Returns
    -------
    str
        The text with known slang tokens replaced.
    """
    if alay_map is None:
        alay_map = df_alay_map
    # Join once after all tokens are mapped, rather than once per token.
    return " ".join(alay_map.get(word, word) for word in text.split(' '))

def remove_stopword(text, stopwords=None):
    """Remove stopwords from ``text``.

    The text is split on single spaces, every token that is a stopword is
    blanked out, and the result is re-joined with runs of spaces collapsed
    and surrounding whitespace stripped.

    Parameters
    ----------
    text : str
        Text to filter.
    stopwords : iterable of str, optional
        Words to remove. Defaults to the ``stopword`` column of the
        module-level ``df_stopwords``. Pass a prebuilt ``set`` when calling
        this repeatedly to avoid rebuilding it on every call.

    Returns
    -------
    str
        The text without stopwords.
    """
    if stopwords is None:
        stopwords = df_stopwords.stopword.values
    # Hash lookup per word instead of scanning the whole stopword array.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    text = ' '.join('' if word in stopword_set else word for word in text.split(' '))
    text = re.sub('  +', ' ', text)  # Remove extra spaces left by removed words
    text = text.strip()

    return text

def preprocess(text):
    """Run the full cleaning pipeline on one piece of text.

    Applies, in order: noise removal, lowercasing, non-alphanumeric removal,
    alay normalisation and stopword removal, feeding each step's output into
    the next, and returns the final string.
    """
    pipeline = (
        remove_unnecessary_char,
        lowercase,
        remove_nonaplhanumeric,
        normalize_alay,
        remove_stopword,
    )
    for step in pipeline:
        text = step(text)
    return text


In [10]:
# Run the preprocessing pipeline on every tweet and display the result.
# NOTE(review): the cleaned Series is only displayed here, not assigned to a
# variable — confirm that later cells store or recompute it before training.
df_abusive_sentences['Tweet'].apply(preprocess)

0        cowok berusaha melacak perhatian gue lantas re...
1        telat tau edan sarap gue bergaul cigax jifla c...
2        41 kadang berpikir percaya tuhan jatuh berkali...
3                                      ku tau matamu sipit
4                   kaum cebong kafir dongoknya dungu haha
                               ...                        
13164                berbicara ndasmu congor sekata anjing
13165                                    kasur enak kunyuk
13166                             hati hati bisu bosan duh
13167    bom real mudah terdeteksi bom terkubur dahsyat...
13168                              situ foto ya kutil onta
Name: Tweet, Length: 13169, dtype: object

## Building the Model

In [11]:
# Import TensorFlow and print the installed version.
import tensorflow as tf
print(tf.__version__)

2.8.0


In [12]:
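# Run if the TensorFlow version printed above is older than 2.6 (the import of TextVectorization from tensorflow.keras.layers below needs 2.6 or newer)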
# !pip install --upgrade tensorflow

In [15]:
from tensorflow.keras.layers import TextVectorization

# New Section