## 1. Installing needed libraries

In [1]:
!pip install openpyxl
!pip install PyArabic
!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null
!pip install emoji 
!pip install Arabic-Stopwords
!pip install tkseem
!pip install tnkeeh
!pip3 install fr-word-segment
!pip install pyspellchecker

## 2. Imports

In [2]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
from keras.preprocessing.text import Tokenizer

import nltk
import string
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from fastai.text.all import *

import sklearn
import regex as re
from unicodedata import normalize

import torch
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
from torch.autograd import Variable
from torch.optim.lr_scheduler import CyclicLR
from torchvision import models

from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score
import os
import gensim


# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GRU,MaxPooling1D,GlobalMaxPooling1D,Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from keras import callbacks
from keras.utils.vis_utils import plot_model

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
from sklearn.metrics import roc_auc_score, accuracy_score,roc_curve, auc, plot_confusion_matrix, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from sklearn.manifold import TSNE
from sklearn.naive_bayes import MultinomialNB
from keras.preprocessing.text import Tokenizer
import emoji
from keras.models import Model
import seaborn as sn
import pyarabic.araby as ar
import tkseem as tk
import tnkeeh as tn
from nltk.stem.isri import ISRIStemmer
from spellchecker import SpellChecker
from wordsegment import load,segment
from keras.layers.merge import Concatenate
import tensorflow as tf
load()

## 3. Loading Data

In [163]:
# French: main dataset plus the secondary tweets file
df_fr = pd.read_csv('/kaggle/input/twitter/fr_dataset.csv')
df_fr2 = pd.read_csv('/kaggle/input/twitter/french_tweets.csv')

# Arabic: main dataset plus the secondary Excel sheet
df_ar = pd.read_csv('/kaggle/input/twitter/ar_dataset.csv')
df_ar2 = pd.read_excel('/kaggle/input/twitter/arr.xlsx')

## 4. Exploring data

### 4.1 French Dataset

#### 4.1.1 Main dataset

In [164]:
# Preview the first rows of the main French dataset
df_fr.head()

In [165]:
# Summary statistics of the main French dataset
df_fr.describe()

In [166]:
# Number of rows in the main French dataset (shown as the cell output)
print('Size of the dataset:')
len(df_fr)

In [167]:
# Count the missing values in each column of the main French dataset
print('Nan value',df_fr.isnull().sum())

In [168]:
# List the column names of the dataframe to see which features are available
df_fr.columns

In [169]:
# Class distribution of the `target` column in the main French dataset.
# The count table is built with explicit column names rather than relying on
# the column name produced by `value_counts().to_frame()`, which differs
# between pandas versions (pandas >= 2.0 names it 'count', so y='target'
# would no longer resolve).
cum = (
    df_fr['target']
    .value_counts()
    .rename_axis('HITId')        # class labels, plotted on the x axis
    .reset_index(name='target')  # rows per class, plotted on the y axis
)
cumfig, ax = plt.subplots(figsize=(5,5))
sn.barplot(data=cum,x='HITId',y='target',ax=ax)

#### 4.1.2 Asset dataset

In [170]:
# Preview the first rows of the secondary French dataset
df_fr2.head()

In [171]:
# Number of rows in the secondary French dataset (shown as the cell output)
print('Size of the dataset:')
len(df_fr2)

In [172]:
# Count the missing values in each column of the secondary French dataset
print('Nan value',df_fr2.isnull().sum())

### 4.2 Arabic dataset

#### 4.2.1 Main dataset

In [173]:
# Preview the first rows of the main Arabic dataset
df_ar.head()

In [174]:
# Summary statistics of the main Arabic dataset
df_ar.describe()

In [175]:
# Number of rows in the main Arabic dataset (shown as the cell output)
print('Size of the dataset:')
len(df_ar)

In [176]:
# Count the missing values in each column of the main Arabic dataset
print('Nan value',df_ar.isnull().sum())

In [177]:
# Class distribution of the `target` column in the main Arabic dataset.
# The count table is built with explicit column names rather than relying on
# the column name produced by `value_counts().to_frame()`, which differs
# between pandas versions (pandas >= 2.0 names it 'count', so y='target'
# would no longer resolve).
cum = (
    df_ar['target']
    .value_counts()
    .rename_axis('HITId')        # class labels, plotted on the x axis
    .reset_index(name='target')  # rows per class, plotted on the y axis
)
cumfig, ax = plt.subplots(figsize=(5,5))
sn.barplot(data=cum,x='HITId',y='target',ax=ax)

#### 4.2.2 Asset dataset

In [178]:
# Preview the first rows of the secondary Arabic dataset
df_ar2.head()

In [179]:
# Number of rows in the secondary Arabic dataset (shown as the cell output)
print('Size of the dataset:')
len(df_ar2)

In [180]:
# Count the missing values in each column of the secondary Arabic dataset
print('Nan value',df_ar2.isnull().sum())

## 5. Preprocessing on the datasets

### 5.1 French Dataset

#### 5.1.1 Hateful tweets

In [181]:
# Negative class: every row of the main French dataset whose target is not
# 'normal' gets the label 0. The assignment modifies df_fr in place.
# (Written as a plain statement; the former chained form `neg = ... = 0`
# first bound `neg` to the scalar 0.)
df_fr.loc[df_fr['target']!='normal','target'] = 0
# `neg` is another name for df_fr itself; no copy is made
neg = df_fr

#### 5.1.2 Non-hateful tweets

In [182]:
# Positive class: rows of the secondary French dataset with label 1, with the
# columns renamed to match the main dataset
pos = df_fr2.loc[df_fr2['label']==1].rename(columns={'label':'target','text':'tweet'})
# The secondary dataset is large, so only its first 1/100 is kept
n = len(pos)
pos = pos[0:int(n/100)]

#### 5.1.3 Merging 

In [183]:
# Stack the negative (neg) and positive (pos) frames and renumber the rows
data_fr = pd.concat(
    [neg, pos],
    sort=False,
    ignore_index=True,
)
data_fr.head()

In [184]:
# Replace missing annotation values with a default category per column.
# NOTE(review): presumably the rows taken from `pos` are the ones with NaN
# here, so they all receive the same four constants, which would make these
# columns a proxy for the label. Confirm before using them as features.
data_fr = data_fr.fillna({
    'sentiment': 'normal',
    'directness': 'direct',
    'group': 'nothing',
    'annotator_sentiment': 'indifference',
})
# Shuffle the rows (no seed is given, so the order differs between runs)
data_fr = data_fr.sample(frac = 1)
# Drop the HITId column
data_fr = data_fr.drop(columns=['HITId'])
data_fr.head()

#### 5.1.4 Distribution of data 

In [185]:
# Distribution of the classes (0, 1) in the merged French data.
# The count table is built with explicit column names rather than relying on
# the column name produced by `value_counts().to_frame()`, which differs
# between pandas versions (pandas >= 2.0 names it 'count', so y='target'
# would no longer resolve).
cum = (
    data_fr['target']
    .value_counts()
    .rename_axis('tweet')        # class labels, plotted on the x axis
    .reset_index(name='target')  # rows per class, plotted on the y axis
)
cumfig, ax = plt.subplots(figsize=(5,5))
sn.barplot(data=cum,x='tweet',y='target',ax=ax)

In [186]:
# Share of each class in data_fr as a pie chart, saved to disk and displayed
class_counts = data_fr["target"].value_counts()
plt.figure(figsize=(5,5))
plt.pie(class_counts,
        labels=class_counts.index,
        autopct=lambda p:f'{p:.2f}%',
        shadow=True,
        colors=['mediumvioletred','darkturquoise'],
        labeldistance = 1.1,
        textprops={'fontsize': 14})

plt.savefig("distribution des données dans les différentes classes.png")
plt.show()

#### 5.1.5 Encoding columns

In [187]:
# Integer-encode the categorical annotation columns. One LabelEncoder is refit
# for each column in turn, so afterwards `le` only holds the mapping of the
# last column ('group').
le = preprocessing.LabelEncoder()
for col in ['sentiment', 'directness', 'annotator_sentiment', 'group']:
    data_fr[col] = le.fit_transform(data_fr[col])
# Shuffle the rows again and display the frame
data_fr = data_fr.sample(frac = 1)
data_fr

### 5.2 Arabic dataset

#### 5.2.1 Hateful tweets

In [188]:
# Negative class: every row of the main Arabic dataset whose target is not
# 'normal' gets the label 0. The assignment modifies df_ar in place.
# (Written as a plain statement; the former chained form `neg = ... = 0`
# first bound `neg` to the scalar 0.)
df_ar.loc[df_ar['target']!='normal','target'] = 0
# `neg` is another name for df_ar itself; no copy is made
neg = df_ar
neg.head()

#### 5.2.2 Non-hateful tweets

In [189]:
# Positive class: rows of the secondary Arabic dataset whose Sentiment is
# 'Positive'. They are relabelled 1 in place in df_ar2 and then selected.
# (A bare `df_ar2.loc[...]` expression that stood here had no effect and was
# removed.)
df_ar2.loc[df_ar2['Sentiment']=='Positive','Sentiment'] = 1
pos = df_ar2.loc[df_ar2["Sentiment"]==1]
# Align the column names with the main dataset and drop the ID column
pos = pos.rename(columns={'Sentiment':'target','Feed':'tweet'})
pos = pos.drop(columns=['ID'])

#### 5.2.3 Merging

In [190]:
# Stack the two frames, keeping at most len(pos)/1.5 rows of the negative one,
# and renumber the rows
neg_limit = int(len(pos)/1.5)
data_ar = pd.concat(
    [neg[0:neg_limit], pos],
    sort=False,
    ignore_index=True,
)
data_ar.head()

In [191]:
# Replace missing annotation values with a default category per column.
# NOTE(review): presumably the rows taken from `pos` are the ones with NaN
# here, so they all receive the same four constants, which would make these
# columns a proxy for the label. Confirm before using them as features.
data_ar = data_ar.fillna({
    'sentiment': 'normal',
    'directness': 'direct',
    'group': 'nothing',
    'annotator_sentiment': 'indifference',
})
# Shuffle the rows once (no seed is given, so the order differs between runs)
data_ar = data_ar.sample(frac = 1)
# Display only: the result is not assigned, so data_ar still has HITId here
data_ar.drop(columns='HITId')

#### 5.2.4 Data distribution before resampling

In [192]:
# Distribution of the classes (0, 1) in the merged Arabic data.
# The count table is built with explicit column names rather than relying on
# the column name produced by `value_counts().to_frame()`, which differs
# between pandas versions (pandas >= 2.0 names it 'count', so y='target'
# would no longer resolve).
cum = (
    data_ar['target']
    .value_counts()
    .rename_axis('tweet')        # class labels, plotted on the x axis
    .reset_index(name='target')  # rows per class, plotted on the y axis
)
cumfig, ax = plt.subplots(figsize=(5,5))
sn.barplot(data=cum,x='tweet',y='target',ax=ax)

#### 5.2.5 Resampling

In [193]:
def resample(df, random_state=None):
    """Resample `df` with replacement: class 0 keeps its size, class 1 is tripled.

    The class sizes are counted per label. The previous implementation
    unpacked `value_counts()`, whose order follows frequency rather than label
    value, so the two counts were swapped whenever class 1 was the larger class.

    Args:
        df: DataFrame with a 'target' column holding the labels 0 and 1.
        random_state: optional seed forwarded to every `sample` call so the
            result can be reproduced; None keeps the draws random.

    Returns:
        A shuffled DataFrame with len(class 0) rows drawn from class 0 and
        3 * len(class 1) rows drawn from class 1. Rows keep their original
        index labels, so index values repeat.
    """
    lab0 = df[df['target'] == 0]
    lab1 = df[df['target'] == 1]
    c0, c1 = len(lab0), len(lab1)

    lab0_sampled = lab0.sample(c0, replace=True, random_state=random_state)
    lab1_sampled = lab1.sample(3*c1, replace=True, random_state=random_state)

    setnew_resampled = pd.concat([lab0_sampled, lab1_sampled], axis=0)
    # Shuffle so the two classes are interleaved
    return setnew_resampled.sample(frac=1, random_state=random_state)

In [194]:
# Resample the Arabic frame, drop the HITId column and preview the result
data_ar = resample(data_ar).drop(columns={'HITId'})
data_ar.head()

#### 5.2.6 Data distribution after resampling

In [195]:
# Distribution of the classes (0, 1) in the resampled Arabic data.
# The count table is built with explicit column names rather than relying on
# the column name produced by `value_counts().to_frame()`, which differs
# between pandas versions (pandas >= 2.0 names it 'count', so y='target'
# would no longer resolve).
cum = (
    data_ar['target']
    .value_counts()
    .rename_axis('tweet')        # class labels, plotted on the x axis
    .reset_index(name='target')  # rows per class, plotted on the y axis
)
cumfig, ax = plt.subplots(figsize=(5,5))
sn.barplot(data=cum,x='tweet',y='target',ax=ax)

In [196]:
# Share of each class in data_ar as a pie chart, saved to disk and displayed
class_counts = data_ar["target"].value_counts()
plt.figure(figsize=(5,5))
plt.pie(class_counts,
        labels=class_counts.index,
        autopct=lambda p:f'{p:.2f}%',
        shadow=True,
        colors=['mediumvioletred','darkturquoise'],
        labeldistance = 1.1,
        textprops={'fontsize': 14})

plt.savefig(" ara distribution des données dans les différentes classes.png")
plt.show()

#### 5.2.7 Encoding columns


In [197]:
# Integer-encode the categorical annotation columns with the `le` encoder
# created in the French encoding cell. It is refit for each column in turn, so
# afterwards it only holds the mapping of the last column ('group').
for col in ['sentiment', 'directness', 'annotator_sentiment', 'group']:
    data_ar[col] = le.fit_transform(data_ar[col])
# Shuffle the rows again and preview
data_ar = data_ar.sample(frac = 1)
data_ar.head()

## 6. Preprocessing/Cleaning on tweets

### 6.1 Help functions

In [198]:
def hash_fix(h):
    """Strip digits and '#' from `h`, split the rest with `segment`, and
    return the resulting pieces joined by single spaces.

    The whole string is processed, not only the hashtag it contains.
    """
    without_digits = re.sub(r'[0-9]+', '', h)
    without_hash = re.sub(r'#', '', without_digits)
    pieces = segment(str(without_hash))
    return ' '.join(str(piece) for piece in pieces)

In [199]:
# Train a tkseem word tokenizer on the raw Arabic CSV.
# NOTE(review): `tok_ar` is rebound to a Keras Tokenizer in section 8.1 and is
# not used in between anywhere in this notebook, so this training step looks
# unused. Confirm before removing.
tok_ar = tk.WordTokenizer()
tok_ar.train('/kaggle/input/twitter/ar_dataset.csv')
def prepro_ar(tweet):
    """Clean, normalise, tokenise and stem one Arabic tweet.

    Steps, in order: remove the '@user' / 'user' and 'url' placeholders and any
    URL, remove punctuation and diacritics, delete runs of repeated characters,
    unify letter variants, tokenise with NLTK and strip suffixes with the ISRI
    stemmer.

    Args:
        tweet: raw tweet text (str).

    Returns:
        list of str: the processed tokens.
    """
    arabic_diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

    # Remove the full '@user' placeholder before the bare word; in the reverse
    # order the '@user' replacement could never match.
    tweet = tweet.replace('@user', '')
    tweet = tweet.replace('user', '')
    tweet = re.sub( r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",'',tweet)

    tweet = tweet.replace('url', '')
    # Drop every Unicode punctuation character; this already covers '@' and
    # '_', so no separate replacement is needed for them.
    tweet = re.sub(r"\p{P}", lambda m: "", tweet)
    tweet = re.sub(arabic_diacritics, '', str(tweet))
    # NOTE(review): this deletes every run of two or more identical characters
    # entirely rather than collapsing it to one (e.g. the doubled lam of
    # 'الله', leaving 'اه', which the patch at the end of this function
    # restores). Runs of spaces are deleted too, which joins the neighbouring
    # words. Confirm this is intended.
    tweet = re.sub(r'(.)\1+', "", tweet)
    tweet = ar.strip_tashkeel(tweet)
    tweet = ar.strip_tatweel(tweet)
    # Unify letter variants. 'ؤ' and 'ئ' are mapped to 'ء' here, so no later
    # replacement of those two letters can apply.
    tweet = re.sub("ى", "ي", tweet)
    tweet = re.sub("ؤ", "ء", tweet)
    tweet = re.sub("ئ", "ء", tweet)
    tweet = re.sub("ة", "ه", tweet)
    tweet = re.sub("گ", "ك", tweet)
    tweet = tweet.replace("آ", "ا")
    tweet = tweet.replace("إ", "ا")
    tweet = tweet.replace("أ", "ا")
    tweet = nltk.tokenize.word_tokenize(tweet)
    # One stemmer per call instead of one per word
    stemmer = ISRIStemmer()
    tweet = [stemmer.suf32(w) for w in tweet]
    # Restore the word that the repeated-character removal reduced to 'اه'
    tweet = ['الله' if w == 'اه' else w for w in tweet]
    return tweet

In [200]:
# Maps a French full form to its abbreviated / elided form. Callers look the
# table up in reverse (abbreviation -> full form).
#
# A dict holds one value per key, so the literal no longer repeats keys. The
# entries below are exactly what the former literal evaluated to (the last
# value given for a repeated key won). The abbreviations that were overwritten
# that way are recorded here for reference only; they are NOT in the table:
#   'lorsque': 'lsq'   'parce que': 'pcq'   'que': 'q'   'quelque': 'qq'
#   'rire': 'ptdr', 'lol'
contractions_fr = {
        'administration':'admin',
        'avec':'ac',
        'beaucoup':'bp',
        'c’est-à-dire':'cad',
        'cependant':'cpd',
        'chose':'ch',
        'conclusion':'ccl',
        'confer ':'cf',
        'court terme':'ct',
        'dans':'ds',
        'dedans':'dd',
        'définition':'déf',
        'et cetera':'etc',
        'être':'ê',
        'exemple':'ex',
        'extérieur':'ext',
        'font':'ft',
        'général':'gal',
        'gouvernement':'gouv',
        'grand':'gd',
        'groupe':'gp',
        'identique':'idel',
        'introduction':'intro',
        'jour':'jr',
        'long terme':'lt',
        'lorsque':'lorsqu\'',
        'mais':'ms',
        'même':'^m',
        'moyen terme':'mt',
        'nombre':'nb',
        'nombreux':'nbx',
        'observation':'obs',
        'ordre du jour':'oj',
        'page':'p',
        'parce que':'parce qu\'',
        'pendant':'pdt',
        'personne':'pers',
        'point':'pt',
        'peut-être':'pê',
        'pour':'pr',
        'pourtant':'prtt',
        'quand':'qd',
        'quantité':'qté',
        'que':'qu\'',
        'quelqu’un':'qqn',
        'quelque chose':'qqch',
        'quelque':'quelqu\'',
        'quelquefois':'qqf',
        'question':'quest',
        'rendez-vous':'rdv',
        'responsabilité':'respité',
        'seulement':'slt',
        'solution':'sol',
        'sont':'st',
        'sous':'ss',
        'souvent':'svt',
        'temps':'tps',
        'toujours':'tjrs',
        'tous':'ts',
        'tout':'tt',
        'toute':'tte',
        'toutes':'ttes',
        'vous':'vs',
        'le':'l\'',
        'me':'m\'',
        'de':'d\'',
        'te':'t\'',
        'se':'s\'',
        'ce':'c\'',
        'ne':'n\'',
        'jusque':'jusqu\'',
        'puisque':'puisqu\'',
        'quoique':'quoiqu\'',
        'tel que':'tel qu\'',
        'telle que':'telle qu\'',
        'faculte':'fac',
        'bien':'bin',
        'attend':'att',
        'je': 'j\'',
        'rire':'lmfao',
        'putin':'ptn',
}

In [201]:
# Shared French resources, created once and reused for every tweet
lemmatizer = FrenchLefffLemmatizer()                          # lemmatiser
french_stopwords = nltk.corpus.stopwords.words('french')     # NLTK stop-word list
spell = SpellChecker(language='fr')                           # spell checker

In [202]:
def prepro_fr(tweet):
    """Clean, normalise, tokenise and lemmatise one French tweet.

    Steps, in order: fold the text to ASCII, split hashtags with `hash_fix`,
    remove the '@user' / 'user' / 'url' placeholders and any URL, lower-case,
    remove punctuation except '-', tokenise with NLTK, expand abbreviations
    found in `contractions_fr`, drop stop words, remove non-printable
    characters and lemmatise.

    Args:
        tweet: raw tweet text (str).

    Returns:
        list of str: the processed tokens.
    """
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # contractions_fr maps full form -> abbreviation; invert it once per call.
    # If two full forms share an abbreviation, the first one listed is kept.
    expansions = {}
    for full_form, short_form in contractions_fr.items():
        expansions.setdefault(short_form, full_form)
    stop_words = set(french_stopwords)

    # normalize unicode characters (accents and every other non-ASCII
    # character are discarded)
    tweet = normalize('NFD', tweet).encode('ascii','ignore')
    tweet = tweet.decode('UTF-8')
    # NOTE(review): the ASCII folding above has already discarded emoji
    # characters, so this call presumably finds nothing to convert. Confirm the
    # intended order.
    tweet = emoji.demojize(tweet)
    if "#" in tweet:
        tweet = hash_fix(tweet)
    # Remove the full '@user' placeholder before the bare word; in the reverse
    # order the '@user' replacement could never match.
    tweet = tweet.replace('@user', '')
    tweet = tweet.replace('user', '')
    tweet = tweet.replace('url', '')
    tweet = re.sub( r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",'',tweet)
    # convert to lower case
    tweet = tweet.lower()
    # put a space after each apostrophe so elided forms become separate tokens
    tweet = tweet.replace('\'', '\' ')
    # remove punctuation, keeping hyphens
    tweet = re.sub(r"\p{P}", lambda m: "-" if m.group(0) == "-" else "", tweet)
    # tokenization
    tweet = nltk.tokenize.word_tokenize(tweet)
    # contractions: replace a known abbreviation by its full form
    tweet = [expansions.get(word, word) for word in tweet]
    # stop words
    tweet = [w for w in tweet if w not in stop_words]
    # A loop that rewrote words starting with 'j' stood here. It only rebound
    # its loop variable and never changed the token list, so it was removed.
    # remove non-printable chars from each token
    tweet = [re_print.sub('', w) for w in tweet]
    # lemmatization
    tweet = [lemmatizer.lemmatize(w) for w in tweet]
    return tweet

### 6.2 Arabic

In [203]:
# Replace each raw Arabic tweet by its list of processed tokens.
# Run this cell once: a second run would pass token lists to prepro_ar.
data_ar.tweet = data_ar.tweet.apply(prepro_ar)

In [204]:
# Preview the Arabic data after preprocessing
data_ar.head()

### 6.3 French

In [205]:
# Replace each raw French tweet by its list of processed tokens.
# Run this cell once: a second run would pass token lists to prepro_fr.
data_fr.tweet = data_fr.tweet.apply(prepro_fr)

In [206]:
# Preview the French data after preprocessing
data_fr.head()

## 7. Splitting Data

### 7.1 Arabic

In [207]:
# Separate the labels from the features (data_ar no longer has 'target' afterwards)
labels_ar = data_ar['target'].values
data_ar = data_ar.drop(columns={'target'})

In [208]:
# Hold out 10% of the rows for testing. The fixed random_state makes the split
# reproducible across runs.
# NOTE(review): data_ar was drawn with replacement by resample(), so copies of
# the same row can land on both sides of this split. Confirm that this is
# acceptable for the reported test metrics.
X_train_ar,X_test_ar, y_train_ar,y_test_ar = train_test_split(data_ar,
                                                              labels_ar,
                                                              test_size=0.1,
                                                              random_state=42)


### 7.2 French

In [209]:
# Separate the labels from the features (data_fr no longer has 'target' afterwards)
labels_fr = data_fr['target'].values
data_fr = data_fr.drop(columns={'target'})

In [210]:
# Hold out 10% of the rows for testing. The fixed random_state makes the split
# reproducible across runs.
X_train_fr,X_test_fr, y_train_fr,y_test_fr = train_test_split(data_fr,
                                                              labels_fr,
                                                              test_size=0.1,
                                                              random_state=42)

## 8. Word Embedding

In [211]:
# Width of each word vector; used for the embedding matrices and Embedding layers below
embed_dim = 300 

### 8.1 Arabic

In [52]:
# Download the gzip-compressed pretrained fastText vectors for Arabic
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz

In [53]:
# Decompress the archive to cc.ar.300.vec
!gunzip cc.ar.300.vec.gz

In [212]:
# Open the decompressed Arabic vectors; the handle is read and closed in the next cell
f_ar = open('cc.ar.300.vec', encoding='utf-8')

In [213]:
# Load the pretrained vectors into a {word: vector} dictionary
embeddings_index_ar = {}
try:
    for line in tqdm(f_ar):
        # Split on plain spaces only: a token may itself contain other
        # whitespace characters, which a bare split() would break apart.
        values = line.rstrip().split(' ')
        # Skip the leading "<vocabulary size> <dimension>" header line and any
        # row that does not carry exactly embed_dim numbers.
        if len(values) != embed_dim + 1:
            continue
        word = values[0]
        vector = np.asarray(values[1:],'float32')
        embeddings_index_ar[word]=vector
finally:
    # Release the file handle even if parsing fails
    f_ar.close()

print('found %s word vectors' % len(embeddings_index_ar))

In [214]:
# Fit a Keras tokenizer on every Arabic tweet in data_ar (train and test rows alike)
tok_ar = Tokenizer()
tok_ar.fit_on_texts(data_ar.tweet.values)
word_index_ar = tok_ar.word_index

# Turn each tweet into its sequence of word indices
X_train_ar1 = tok_ar.texts_to_sequences(list(X_train_ar.tweet.values))
X_test_ar1 = tok_ar.texts_to_sequences(list(X_test_ar.tweet.values))

In [218]:
# Length of every training sequence; the longest one sets the padded length
tweets_length_ar = [len(seq) for seq in X_train_ar1]
max_seq_len_ar = max(tweets_length_ar)
print(max_seq_len_ar)

In [219]:
# Pad every sequence to max_seq_len_ar, adding the padding at the end (padding='post')
X_train_ar1 = pad_sequences(X_train_ar1, padding='post', maxlen=max_seq_len_ar)
X_test_ar1 = pad_sequences(X_test_ar1, padding='post', maxlen=max_seq_len_ar)

In [220]:
# Second model input: the four encoded annotation columns as a 2-D array
X_train_ar2 = X_train_ar[['sentiment', 'directness', 'annotator_sentiment', 'group']].values
X_test_ar2 = X_test_ar[['sentiment', 'directness', 'annotator_sentiment', 'group']].values

In [223]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# with index i. Row 0 and rows of words without a pretrained vector stay zero.
print('preparing embedding matrix...')
nb_words_ar = len(word_index_ar)+1
embedding_matrix_ar = np.zeros((nb_words_ar, embed_dim))
words_not_found_ar = []
for word, i in word_index_ar.items():
    embedding_vector = embeddings_index_ar.get(word)
    if embedding_vector is None:
        # no pretrained vector: remember the word, leave its row at zero
        words_not_found_ar.append(word)
        continue
    embedding_matrix_ar[i] = embedding_vector
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_ar, axis=1) == 0))
print(len(embedding_matrix_ar))

### 8.2 French

In [61]:
# Download the gzip-compressed pretrained fastText vectors for French
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz

In [62]:
# Decompress the archive to cc.fr.300.vec
!gunzip cc.fr.300.vec.gz

In [63]:
# Open the decompressed French vectors; the handle is read and closed in the next cell
f_fr = open('cc.fr.300.vec', encoding='utf-8')

In [64]:
# Load the pretrained vectors into a {word: vector} dictionary
embeddings_index_fr = {}
try:
    for line in tqdm(f_fr):
        # Split on plain spaces only: a token may itself contain other
        # whitespace characters, which a bare split() would break apart.
        values = line.rstrip().split(' ')
        # Skip the leading "<vocabulary size> <dimension>" header line and any
        # row that does not carry exactly embed_dim numbers.
        if len(values) != embed_dim + 1:
            continue
        word = values[0]
        vector = np.asarray(values[1:],'float32')
        embeddings_index_fr[word]=vector
finally:
    # Release the file handle even if parsing fails
    f_fr.close()

print('found %s word vectors' % len(embeddings_index_fr))

In [65]:
# Fit a Keras tokenizer on every French tweet in data_fr (train and test rows alike)
tok_fr = Tokenizer()
tok_fr.fit_on_texts(data_fr.tweet.values)

# Turn each tweet into its sequence of word indices
X_train_fr1 = tok_fr.texts_to_sequences(list(X_train_fr.tweet.values))
X_test_fr1 = tok_fr.texts_to_sequences(list(X_test_fr.tweet.values))

In [66]:
# Length of every training sequence; the longest one sets the padded length
tweets_length_fr = [len(seq) for seq in X_train_fr1]
max_seq_len_fr = max(tweets_length_fr)
print(max_seq_len_fr)

In [67]:
# Pad every sequence to max_seq_len_fr. No `padding` argument is passed here,
# whereas the Arabic cell passes padding='post'.
X_train_fr1 = pad_sequences(X_train_fr1, maxlen=max_seq_len_fr)
X_test_fr1 = pad_sequences(X_test_fr1, maxlen=max_seq_len_fr)

In [68]:
# Second model input: the four encoded annotation columns as a 2-D array
X_train_fr2 = X_train_fr[['sentiment', 'directness', 'annotator_sentiment', 'group']].values
X_test_fr2 = X_test_fr[['sentiment', 'directness', 'annotator_sentiment', 'group']].values

In [69]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# with index i. Row 0 and rows of words without a pretrained vector stay zero.
print('preparing embedding matrix...')
word_index_fr = tok_fr.word_index
nb_words_fr = len(word_index_fr)+1
embedding_matrix_fr = np.zeros((nb_words_fr, embed_dim))
words_not_found_fr = []
for word, i in word_index_fr.items():
    embedding_vector = embeddings_index_fr.get(word)
    if embedding_vector is None:
        # no pretrained vector: remember the word, leave its row at zero
        words_not_found_fr.append(word)
        continue
    embedding_matrix_fr[i] = embedding_vector
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_fr, axis=1) == 0))
print(len(embedding_matrix_fr))

## 9. Classification Model

### 9.1 Building model

In [224]:
# Model inputs: one padded index sequence per language, plus the four
# annotation features (input_2 is shared by both models)
input_fr_1 = Input(shape=(max_seq_len_fr,))
input_ar_1 = Input(shape=(max_seq_len_ar,))
input_2 = Input(shape=(4,))

In [225]:
# Frozen embedding initialised from embedding_matrix_fr. The layer is applied
# to input_fr_1 right away, so this name holds the layer's output, not the layer.
embedding_layer_fr = Embedding(
                            nb_words_fr, 
                            embed_dim,  
                            weights=[embedding_matrix_fr],
                            trainable=False,
                            input_length=max_seq_len_fr,
                    )(input_fr_1)

In [226]:
# Frozen embedding initialised from embedding_matrix_ar. The layer is applied
# to input_ar_1 right away, so this name holds the layer's output, not the layer.
embedding_layer_ar = Embedding(
                            nb_words_ar, 
                            embed_dim,  
                            weights=[embedding_matrix_ar],
                            trainable=False,
                            input_length=max_seq_len_ar,
                    )(input_ar_1)

In [227]:
def NN(lang,input_1,input_2):
    """Build the two-input classifier.

    An LSTM reads the pre-built embedding output for `lang`; two small dense
    layers read `input_2`; both branches are concatenated and passed through a
    dense layer to a single sigmoid unit.

    Args:
        lang: 'AR' selects embedding_layer_ar; any other value selects
            embedding_layer_fr.
        input_1: the sequence Input. The embedding outputs were built on
            input_ar_1 / input_fr_1, so this should be the matching one.
        input_2: the Input for the extra features.

    Returns:
        The uncompiled Keras Model. Its diagram is also written to 'model.png'.
    """
    embedded = embedding_layer_ar if lang=='AR' else embedding_layer_fr

    # text branch
    text_branch = LSTM(64)(embedded)

    # feature branch: two stacked dense layers
    feature_branch = input_2
    for _ in range(2):
        feature_branch = Dense(10, activation='relu')(feature_branch)

    # merge both branches and classify
    merged = Concatenate()([text_branch, feature_branch])
    hidden = Dense(10, activation='relu')(merged)
    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[input_1, input_2], outputs=prediction)

    plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
    return model

### 9.2 Training Model

In [228]:
# Training hyperparameters shared by both models
batch_size = 32
num_epochs = 10
# Stop when val_loss has not improved for 3 consecutive epochs
early = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=3,
    verbose=1,
)

#### 9.2.1 French

In [75]:
# Build and compile the French model
model_fr = NN('FR',input_fr_1,input_2)
model_fr.compile(loss = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics = ['accuracy'])

print('French:')
# Train on both inputs; 20% of the training rows are held out for validation
Xfr = [X_train_fr1,X_train_fr2]
history_fr = model_fr.fit(
    Xfr,
    y_train_fr.astype(np.float32),
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks = [early],
    verbose=1,
)

#### 9.2.2 Arabic

In [229]:
# Build and compile the Arabic model
model_ar = NN('AR',input_ar_1,input_2)
model_ar.compile(loss = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics = ['accuracy'])

print('Arabic:')
# Train on both inputs; 20% of the training rows are held out for validation
Xar = [np.asarray(X_train_ar1), np.asarray(X_train_ar2)]
history_ar = model_ar.fit(
    Xar,
    y_train_ar.astype(np.float32),
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks = [early],
    verbose=1,
)

### 9.3 Evaluating & Testing model

#### 9.3.1 Help functions

In [231]:
def plot_acc_loss(history):
    """
    Plot the training and validation accuracy and loss curves of a model
    @params:
            - history: object returned by model.fit; its `.history` dict is read
    @return:
            None; draws two side-by-side plots (accuracy, loss)
    """
    hist = history.history
    l = list(hist.keys())
    print(l)

    def pick(names, position):
        # Prefer an explicit key name. Fall back to the positional order
        # (loss, accuracy, val_loss, val_accuracy) the function used to assume.
        for name in names:
            if name in hist:
                return name
        return l[position]

    acc_key = pick(('accuracy', 'acc'), 1)
    val_acc_key = pick(('val_' + acc_key,), 3)
    loss_key = pick(('loss',), 0)
    val_loss_key = pick(('val_loss',), 2)

    fig,ax = plt.subplots(1,2,figsize=(10,5))
    # accuracy plot
    ax[0].plot(hist[acc_key])
    ax[0].plot(hist[val_acc_key])
    ax[0].set_title('model accuracy')
    ax[0].set_ylabel('accuracy')
    ax[0].set_xlabel('epoch')
    # the second curve is the val_* series, i.e. validation data, not the test set
    ax[0].legend(['train', 'validation'], loc='upper left')
    # loss plot
    ax[1].plot(hist[loss_key])
    ax[1].plot(hist[val_loss_key])
    ax[1].set_title('model loss')
    ax[1].set_ylabel('loss')
    ax[1].set_xlabel('epoch')
    ax[1].legend(['train', 'validation'], loc='upper left')

In [232]:
def predicted_label(model,x):
    """Return a list of 0/1 labels for `x`.

    A row is labelled 1 when the first value of its `model.predict(x)` output
    is at least 0.5, otherwise 0.
    """
    pred = model.predict(x)
    return [1 if pred[i][0]>=0.5 else 0 for i in range(len(pred))]

In [233]:
def plot_cm(model,x,y):
    """Draw the confusion matrix of `model` on inputs `x` against true labels `y`.

    The labels come from `predicted_label`, so the model is run once (an extra
    `model.predict` call whose result was never used has been removed).
    """
    y_pred = predicted_label(model,x)
    cm = confusion_matrix(list(y),y_pred)
    # fmt='d' prints the integer counts as they are; the default annotation
    # format switches to scientific notation for larger counts
    sn.heatmap(cm, annot=True, fmt='d')

#### 9.3.2 Arabic

In [234]:
# Accuracy and loss curves of the Arabic model
plot_acc_loss(history_ar)

In [235]:
# Confusion matrix of the Arabic model on the held-out test rows
print("Confusion Matrix for arabic dataset")
Xtestar = [np.asarray(part) for part in (X_test_ar1, X_test_ar2)]
plot_cm(model_ar,Xtestar,y_test_ar)

In [236]:
# Loss and accuracy of the Arabic model on the test rows, as reported by Keras
test_loss_ar, test_acc_ar = model_ar.evaluate(Xtestar,y_test_ar.astype(np.float32))
print('Test accuracy:', test_acc_ar)

In [237]:
# Test-set metrics of the Arabic model, computed from the thresholded predictions
pred_ar = predicted_label(model_ar,Xtestar)
y_true_ar = list(y_test_ar)

# accuracy: (tp + tn) / (p + n)
accuracy_ar = accuracy_score(y_true_ar, pred_ar)
# precision: tp / (tp + fp)
precision_ar = precision_score(y_true_ar, pred_ar)
# recall: tp / (tp + fn)
recall_ar = recall_score(y_true_ar, pred_ar)
# f1: 2 tp / (2 tp + fp + fn)
f1_ar = f1_score(y_true_ar, pred_ar)

print('Accuracy: %f' % accuracy_ar)
print('Precision: %f' % precision_ar)
print('Recall: %f' % recall_ar)
print('F1 score: %f' % f1_ar)

#### 9.3.3 French

In [None]:
# Accuracy and loss curves of the French model
plot_acc_loss(history_fr)

In [None]:
# Confusion matrix of the French model on the held-out test rows
print("Confusion Matrix for french dataset")
Xtestfr = [np.asarray(part) for part in (X_test_fr1, X_test_fr2)]
plot_cm(model_fr,Xtestfr,y_test_fr)

In [None]:
# Loss and accuracy of the French model on the test rows, as reported by Keras
test_loss, test_acc = model_fr.evaluate(Xtestfr,y_test_fr.astype(np.float32))
print('Test accuracy:', test_acc)

In [None]:
# Test-set metrics of the French model, computed from the thresholded predictions
pred_fr = predicted_label(model_fr,Xtestfr)
y_true_fr = list(y_test_fr)

# accuracy: (tp + tn) / (p + n)
accuracy_fr = accuracy_score(y_true_fr, pred_fr)
# precision: tp / (tp + fp)
precision_fr = precision_score(y_true_fr, pred_fr)
# recall: tp / (tp + fn)
recall_fr = recall_score(y_true_fr, pred_fr)
# f1: 2 tp / (2 tp + fp + fn)
f1_fr = f1_score(y_true_fr, pred_fr)

print('Accuracy: %f' % accuracy_fr)
print('Precision: %f' % precision_fr)
print('Recall: %f' % recall_fr)
print('F1 score: %f' % f1_fr)
