<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/AlBerto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AlBERTo Hate Speech Classifier 

In [1]:
!pip install ekphrasis
!pip install bert-tensorflow
!pip install transformers==3.5


import pandas as pd
import numpy as np
import collections
import logging
import os
import re

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from transformers import BertTokenizer, WordpieceTokenizer
from transformers.tokenization_bert import load_vocab



In [18]:
import sys
!pip install tensorflow==1.15.0

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
# Make the cloned BERT sources importable (added only once per kernel).
if 'bert_repo' not in sys.path:
  sys.path.append('bert_repo')

# import python modules defined by BERT
from run_classifier import *
import modeling
import optimization
import tokenization



In [19]:
from google.colab import drive

# Location inside the Colab VM where Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'

# Mounting prompts for authorization the first time it runs.
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load the dataset 

In [20]:
# Project folder on the mounted Google Drive. Both paths below are derived
# from it so they always use the same spelling of the Drive root
# (previously one used 'My Drive' and the other 'MyDrive').
HLT_DIR = '/content/drive/MyDrive/HLT/'

# Directory holding the cleaned training dataset.
input_dir = HLT_DIR + 'clean_dataset_training/'
# Directory holding the AlBERTo checkpoint files.
AlBERTo_path = HLT_DIR + 'alberto_uncased_L-12_H-768_A-12_italian_ckpt/'

# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option("display.max_colwidth", None)

In [21]:
# Read the comma-separated training set into a DataFrame.
# The `with` block guarantees the file handle is closed once parsing is done
# (the handle keeps its historical name `tsv_file` although the data is CSV).
with open(input_dir+"training_dataset.csv") as tsv_file:
  dataset = pd.read_csv(tsv_file,sep=',')

## Configure AlBERTo classes 

In [30]:
# Configure the ekphrasis TextPreProcessor.

# terms that will be normalized
normalized_terms = ['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number']
# terms that will be annotated
annotated_terms = {"hashtag"}
# Tokenizer callable: takes a string as input and returns a list of tokens.
# SocialTokenizer is used here; any callable with that contract can be passed.
social_tokenize = SocialTokenizer(lowercase=True).tokenize

text_processor = TextPreProcessor(
    normalize=normalized_terms,
    annotate=annotated_terms,
    fix_html=True,          # fix HTML tokens
    unpack_hashtags=True,   # perform word segmentation on hashtags
    tokenizer=social_tokenize,
    dicts=[emoticons],
)

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


In [23]:
# Split the DataFrame into the text column and the 'hs' label column.
dataset_sentences, dataset_labels = dataset['text'], dataset['hs']

In [24]:
# Build one [label, sentence] row per sample.
# zip() pairs the two Series by position, so the pairing no longer relies on
# the Series index being exactly 0..n-1 (the previous manual counter looked
# labels up by index *label*, which breaks after any filtering/shuffling).
examples = np.array(
    [[label, sentence] for label, sentence in zip(dataset_labels, dataset_sentences)]
)

In [25]:
# The data must be turned into a format BERT understands. The first step is to
# wrap every sample in an InputExample, using the constructor provided by the
# BERT library:
#
#   text_a is the text we want to classify: the second element of each
#          `examples` row (the sentence).
#   text_b is only used for sentence-pair tasks (e.g. is text_b a translation
#          of text_a?). That does not apply to this task, so it stays None.
#   label  is the class of the example: the first element of each `examples`
#          row, converted to int.


def f(x):
  """Build an InputExample from one `[label, text]` row of `examples`."""
  # guid = Globally unique ID for bookkeeping, unused in this example
  return InputExample(guid=None, text_a=x[1], text_b=None, label=int(x[0]))


# NOTE: `examples` is rebound from the [label, text] array to a list of
# InputExample objects, so re-running this cell on its own would feed
# InputExample objects back into `f`; re-run the previous cell first.
examples = [f(row) for row in examples]

# Tokenizer 

In [33]:
# Vocabulary file shipped inside the AlBERTo checkpoint directory. Derived from
# AlBERTo_path so the checkpoint location is defined in a single place.
VOCAB_FILE = os.path.join(AlBERTo_path, 'vocab.txt')

# Initialize the tokenizer
tokenizer = tokenization.FullTokenizer(VOCAB_FILE, do_lower_case=True)

# Model prediction 

In [34]:
# Sequence length handed to the feature converter.
MAX_SEQ_LENGTH = 128
# The two class ids found in the label column.
label_list = [0, 1]

# Encode every InputExample into BERT input features
# (input_ids / input_mask / segment_ids, as shown in the log output).
input_features = convert_examples_to_features(
    examples,
    label_list,
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
)

INFO:tensorflow:Writing example 0 of 6837
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: None
INFO:tensorflow:tokens: [CLS] e terrorismo anche questo per mettere in uno stato di soggezione le persone e render ##le innocue mentre qualcuno [SEP]
INFO:tensorflow:input_ids: 2 13 4923 23 79 22 605 24 153 184 12 49535 40 234 13 20897 1041 90954 408 271 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0