In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import pickle
from keras.models import load_model

In [2]:
def get_model(model_path,tokenizer_path):
    arabic_model = load_model(model_path)

    # loading
    with open(tokenizer_path, 'rb') as handle:
        arabic_tokenizer = pickle.load(handle)
    
    return arabic_model,arabic_tokenizer

In [3]:
arabic_model,arabic_tokenizer = get_model('arabic_model.h5','arabic_tokenizer.pickle')

In [4]:
def predict_arabic(inp,model,tokenizer):
    text_token = tokenizer.texts_to_sequences(inp)
    maxlen = 13 # choosed by the avg
    text_token_pad = pad_sequences(text_token, maxlen=maxlen)

    o = model.predict(text_token_pad)
    return o

In [5]:
def predict_english(inp,model,tokenizer):
    text_token = tokenizer.texts_to_sequences(inp)
    maxlen = 200 # choosed by the avg
    text_token_pad = pad_sequences(text_token, maxlen=maxlen)

    o = model.predict(text_token_pad)
    return o

In [24]:
from spacy_langdetect import LanguageDetector
import spacy
from spacy.language import Language
@Language.factory('language_detector')
def language_detector(nlp, name):
    return LanguageDetector()

ValueError: [E004] Can't set up pipeline component: a factory for 'language_detector' already exists. Existing factory: <function language_detector at 0x0000025B789F2160>. New factory: <function language_detector at 0x0000025B0AD46A60>

In [25]:
def get_langauge_model():
    nlp = spacy.load('en_core_web_sm')
    nlp.max_length = 2000000
    nlp.add_pipe('language_detector', last=True)
    return nlp

In [8]:
def predict_langauge(text,nlp): 
    doc = nlp(text)
    detect_language = doc._.language
    return detect_language

In [26]:
arabic_model,arabic_tokenizer = get_model('arabic_model.h5','arabic_tokenizer.pickle')
english_model,english_tokenizer = get_model('english_model.h5','english_tokenizer.pickle')
nlp = get_langauge_model()

In [37]:
def predict_toxicity(text,arabic_model,arabic_tokenizer,english_model,english_tokenizer,nlp):
    if predict_langauge(text,nlp)['language'] == 'en':
        return 'en',predict_english([text],english_model,english_tokenizer)[0][0]
    elif predict_langauge(text,nlp)['language'] in ['ar','fa','ur']:
        return 'ar',predict_arabic([text],arabic_model,arabic_tokenizer)[0][0] 
    else:
        return -1,None

In [40]:
def display_toxicity(text):
    lang,pred = predict_toxicity(text,arabic_model,arabic_tokenizer,english_model,english_tokenizer,nlp)
    if(lang==-1):
        print("model does not take other lang than arabic,english")
        return None
    if(lang=='en'):
        print("text is english")
    elif (lang=='ar'):
        print('text is arabic')
    if(pred>=0.5):
        print("text is TOXIC")
    else:
        print("text is not TOXIC")

In [41]:
display_toxicity('يا غبي')

text is arabic
text is TOXIC
