In [None]:
import pandas as pd
import numpy as np
#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns

%matplotlib inline


#NLTK libraries
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
  

#Metrics libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Deep learning libraries
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') in a later cell reads it.
nltk.download('stopwords')

In [None]:
import string
# ASCII punctuation characters; remove_punctuation() filters against this string.
pucn = string.punctuation
# English stop-word list from NLTK. No later cell shown in this notebook references it.
stop = stopwords.words('english')

In [None]:
# remove urls from text
def remove_url(text):
    """Return ``text`` with every URL deleted.

    A URL here is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# remove numbers 
def remove_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit()`` is true."""
    kept_chars = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(kept_chars)

# remove every token that contains a digit (e.g. "abc123", "2nd", "42")
def remove_nonwords(text):
    """Delete every token that contains at least one digit.

    Ordinary words are kept: only maximal runs of word characters that
    include a digit (``abc123``, ``2nd``, ``42``) are removed. The whitespace
    around a removed token is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string: "\w" and "\d" are invalid escape sequences in a plain literal.
    return re.sub(r'\w*\d\w*', '', text)

In [None]:
# remove whitespaces
def remove_whitespaces(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# remove html
def remove_html(text):
    """Strip every ``<...>`` span (shortest match, single line) from ``text``."""
    return re.sub(r'<.*?>', r'', text)

# remove punctuation 
def remove_punctuation(text):
    """Remove ASCII punctuation from ``text`` and lower-case the result.

    Uses ``string.punctuation`` directly rather than a global defined in
    another cell, so the function works regardless of which cells have run.

    Args:
        text: the string to clean.

    Returns:
        ``text`` without any ``string.punctuation`` character, lower-cased.
    """
    # A translation table deletes all punctuation in one pass instead of
    # testing every character against the punctuation string.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).lower()

In [None]:
# replace every non-word character with a space
def remove_rege(text):
    """Replace every character that is not a word character with one space."""
    non_word_char = re.compile(r'[^\w]')
    return non_word_char.sub(' ', text)

In [None]:
# remove emojis
def remove_emojis(text):
    """Delete emoji and pictographic symbols from ``text``.

    The character class lists U+24C2 and U+1F251 as two single code points.
    Written as a range (U+24C2-U+1F251) it would span almost the whole Basic
    Multilingual Plane and silently delete CJK, kana and Hangul text as well.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with every run of matched characters removed.
    """
    emoji = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2"             # circled M
        u"\U0001F251"             # circled ideograph "accept"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every code point outside the BMP
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"  # wavy dash
        u"\u303d"  # part alternation mark
        u"\u3297"  # circled ideograph "congratulation"
        u"\u3299"  # circled ideograph "secret"
        "]+", re.UNICODE)
    return emoji.sub('', text)

# lemmatize text
def lemmatize_text(text):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Args:
        text: either a string, which is split on whitespace into tokens
            first, or an iterable of token strings.

    Returns:
        A list with the lemmatized form of every token, in input order.
    """
    # NOTE(review): the lemmatizer needs NLTK's 'wordnet' corpus; confirm it is
    # downloaded before this function is used.
    # Iterating a str yields single characters, so a raw string is tokenized first.
    tokens = text.split() if isinstance(text, str) else text
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
df = pd.read_csv('toxic comments/train.csv')

In [None]:
df.head()

In [None]:
# errors="ignore" keeps this cell re-runnable: once 'id' is gone, a second run is a no-op instead of a KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
df.head()

In [None]:
 # Shorten 'comment_text' to 'comment'; later cells read the column as df['comment'] / df.comment.
 df. rename(columns = {'comment_text':'comment'}, inplace = True)

In [None]:
df.tail()

In [None]:
testtxt = df.comment[0]
testtxt

In [None]:
comm = remove_punctuation(testtxt)

In [None]:
comm

In [None]:
clean = remove_nonwords(comm)

In [None]:
clean

In [None]:
url=remove_url(clean)
htm = remove_html(url)
whi = remove_whitespaces(htm)

In [None]:
url

In [None]:
reg = remove_rege(htm)


In [None]:
whi

In [None]:
emo = remove_emojis(whi)

In [None]:
emo

In [None]:
reg

In [None]:
# Order matters:
#  * URLs and HTML tags are stripped first, while their delimiters (":", "/", ".",
#    "<", ">") still exist; after punctuation removal the patterns cannot match.
#  * Whitespace is collapsed last, because remove_rege() and the token removals
#    leave runs of spaces behind.
cleaning_steps = [
    remove_url,
    remove_html,
    remove_emojis,
    remove_punctuation,  # also lower-cases the text
    remove_nonwords,
    remove_rege,
    remove_whitespaces,
]
for cleaning_step in cleaning_steps:
    df['comment'] = df['comment'].apply(cleaning_step)

In [None]:
df.head()

In [None]:
testtxt = df.comment[13000]
testtxt

In [None]:
df.shape

In [None]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

In [None]:
# Model input: the cleaned comment strings.
X = df['comment']
# Targets: every column after the first, as a NumPy array (one column per label).
y = df[df.columns[1:]].values

In [None]:
MAX_FEATURES = 200000 # number of words in the vocab

In [None]:
# Text -> integer-sequence layer:
#   max_tokens             - vocabulary capped at MAX_FEATURES entries
#   output_sequence_length - every comment padded or truncated to 1800 token ids
#   output_mode='int'      - one integer index per token
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [None]:
vectorizer.adapt(X.values)

In [None]:
vectorized_text = vectorizer(X.values)

In [None]:
# MCSHBAP - map, cache, shuffle, batch, prefetch  (from_tensor_slices, list_files)
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE into a fixed order. With the default reshuffle_each_iteration=True
# the order changes on every pass, so the take()/skip() partitions taken from this
# dataset would contain different samples each time they are iterated and the
# evaluation batches would overlap with batches already used for training.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [None]:
# 70 % / 20 % / 10 % split, counted in batches.
n_batches = len(dataset)
train = dataset.take(int(n_batches*.7))
val = dataset.skip(int(n_batches*.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

In [None]:
model = Sequential([
    # Embedding: one 32-dim vector per vocabulary index
    Embedding(MAX_FEATURES+1, 32),
    # Bidirectional LSTM reads the sequence in both directions
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output: 6 sigmoid units
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss with the Adam optimizer; no extra metrics are tracked during fit().
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
model.summary()

In [None]:
history = model.fit(train, epochs=1, validation_data=val)

In [None]:
# DataFrame.plot() opens its own figure, so the size is passed via `figsize`;
# a separate plt.figure() call would only leave an empty extra figure behind.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

### Make Predictions

In [None]:
input_text = vectorizer('You freaking suck! I am going to hit you Fuck I will kill you.')

In [None]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [None]:
res=(model.predict(batch_X) > 0.5).astype(int)

In [None]:
res.shape

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}')

In [None]:
!pip install gradio jinja2

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicomment.h5')

In [None]:
model = tf.keras.models.load_model('toxicomment.h5')

In [None]:
res = model.predict(np.expand_dims(input_text,0))

In [None]:
res

In [None]:
def score_comment(comment):
    """Classify one comment and report each label as True/False.

    Args:
        comment: raw comment text.

    Returns:
        A string with one ``"<label>: <bool>"`` line per label, where the bool
        is whether the model's score for that label exceeds 0.5.
    """
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)

    # The 'id' column was dropped, so column 0 is the comment text and the
    # labels are columns[1:], the same slice used to build `y`. Using [2:]
    # would shift every name by one and leave the last score unreported.
    label_names = df.columns[1:]
    return ''.join(
        '{}: {}\n'.format(label, score > 0.5)
        for label, score in zip(label_names, results[0])
    )

In [None]:
# gr.Textbox is the component class; the legacy gr.inputs namespace is deprecated
# and no longer exists in current Gradio releases, which the unpinned install pulls in.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Write comment'),
                         outputs='text')

In [None]:
interface.launch(share=True)