# **Important Installations & Dependencies**


In [None]:
from keras.preprocessing import sequence, text
from keras.models import Model
from gensim.models.keyedvectors import KeyedVectors
from sklearn import preprocessing
from time import time
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.metrics import (
    classification_report as creport
)
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score,recall_score
from sklearn import preprocessing

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and the embedding
# archive used below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Arabic Word Embeddings (AraVec or fastText)


In [None]:
# Extract the tweets_sg_300 embedding archive from Drive into the current directory.
! unzip '/content/drive/My Drive/New- test/tweets_sg_300.zip'  

Archive:  /content/drive/My Drive/New- test/tweets_sg_300.zip
  inflating: tweets_sg_300           
  inflating: tweets_sg_300.trainables.syn1neg.npy  
  inflating: tweets_sg_300.wv.vectors.npy  


In [None]:
# Download the fastText Arabic vectors (cc.ar.300) and decompress them in place.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz
!gunzip cc.ar.300.bin.gz

In [None]:
# Word_embedding_path
# Defaults to the unzipped tweets_sg_300 model.
# NOTE(review): get_init_parameters loads this path with KeyedVectors.load(...).wv —
# confirm that call can read the fastText .bin file before switching paths.
embedding_path = '/content/tweets_sg_300'           # set this to '/content/cc.ar.300.bin' to use the fastText vectors instead

In [None]:
# Load the training split: column 0 holds the tweet text (tokenised here on
# whitespace) and column 1 holds its label. The first row is a header.
X = []
y = []
for data_path in ["/content/drive/MyDrive/DatasetCleaned/data_train.csv"]:
    # Explicit UTF-8 so the Arabic text decodes the same way on every host;
    # newline="" is what the csv module expects from the file object.
    with open(data_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)  # handles quoted fields that contain commas
        next(reader, None)      # skip the header row
        for row in reader:
            if not row:
                continue        # tolerate blank lines (e.g. a trailing newline)
            X.append(row[0].split())
            y.append(row[1])
# Token lists have different lengths, so the array must be built with
# dtype=object; recent NumPy raises ValueError on ragged input otherwise.
X, y = np.array(X, dtype=object), np.array(y)


In [None]:
# Load the test split: column 0 holds the tweet text (tokenised here on
# whitespace) and column 1 holds its label. The first row is a header.
X_test = []
y_test = []
for data_path in ["/content/drive/MyDrive/DatasetCleaned/data_test.csv"]:
    # Explicit UTF-8 so the Arabic text decodes the same way on every host;
    # newline="" is what the csv module expects from the file object.
    with open(data_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)  # handles quoted fields that contain commas
        next(reader, None)      # skip the header row
        for row in reader:
            if not row:
                continue        # tolerate blank lines (e.g. a trailing newline)
            X_test.append(row[0].split())
            y_test.append(row[1])
# Token lists have different lengths, so the array must be built with
# dtype=object; recent NumPy raises ValueError on ragged input otherwise.
X_test, y_test = np.array(X_test, dtype=object), np.array(y_test)


## **Get Word2Vec**

In [None]:
def get_init_parameters(path, ext=None):
    """Load a saved embedding model and derive its vocabulary parameters.

    Args:
        path: Location of the saved model, handed to ``KeyedVectors.load``.
        ext: Not used by this function; kept for existing callers.

    Returns:
        A tuple ``(word_model, index_dict, n_words, vocab_dim)``:
        ``word_model`` is the ``.wv`` attribute of the loaded object,
        ``index_dict`` maps each vocabulary word to its 1-based position,
        ``n_words`` is the vocabulary size and ``vocab_dim`` is the length
        of a single word vector.
    """
    word_model = KeyedVectors.load(path).wv
    vocabulary = word_model.index2word
    n_words = len(word_model.vocab)
    # The dimensionality is read off the first vocabulary entry's vector.
    vocab_dim = word_model[vocabulary[0]].shape[0]
    # Positions start at 1, leaving index 0 unused.
    index_dict = {
        token: position
        for position, token in enumerate(vocabulary[:n_words], start=1)
    }
    print('Number of words in the word embedding',n_words)
    return word_model, index_dict, n_words, vocab_dim

In [None]:
# Load the embedding once. MAX_FEATURES is the embedding vocabulary size and
# EMBED_SIZE the vector dimensionality (see get_init_parameters' return order).
WORD_MODEL, index_dict, MAX_FEATURES, EMBED_SIZE = get_init_parameters(embedding_path) 

In [None]:
# Display the embedding vector dimensionality.
EMBED_SIZE

In [None]:
# Display how many words the embedding vocabulary index contains.
len(index_dict)

In [None]:
def get_word_index(train_raw_text, test_raw_text, n_words):
    """Fit a Keras tokenizer on the training text and return its word index.

    Args:
        train_raw_text: Iterable of training texts; it is materialised into a
            list before being passed to ``Tokenizer.fit_on_texts``.
        test_raw_text: Accepted for call compatibility but not used; the
            tokenizer is fitted on the training text only.
        n_words: Forwarded to the tokenizer as ``num_words``.

    Returns:
        The fitted tokenizer's ``word_index`` mapping.
    """
    fitted = text.Tokenizer(num_words=n_words)
    fitted.fit_on_texts(list(train_raw_text))
    return fitted.word_index

In [None]:
# Build the word -> integer index. X_test is passed, but get_word_index fits
# the tokenizer on the training text only.
word_index  = get_word_index(X,X_test,MAX_FEATURES)

In [None]:
def w2v(word_index, embedding_index, vocab_dim):
    """Build a word -> embedding-vector lookup for every word in ``word_index``.

    Args:
        word_index: Mapping of word to a positive integer row index.
        embedding_index: Object exposing ``get_vector(word)``; it is expected
            to raise ``KeyError`` for words it does not know.
        vocab_dim: Length of each embedding vector.

    Returns:
        Dict mapping every word in ``word_index`` to a vector of length
        ``vocab_dim``. Words missing from ``embedding_index`` map to an
        all-zero vector. The values are rows of one shared matrix.

    Raises:
        Any error other than ``KeyError`` raised by ``get_vector`` or by the
        row assignment (for example a vector whose length differs from
        ``vocab_dim``) propagates instead of being silently swallowed.
    """
    print('Building embedding matrix...')
    dicc = {}
    # Row 0 is left unused because the word indices start at 1.
    embedding_matrix = np.zeros((len(word_index) + 1, vocab_dim))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: keep its row as zeros.
            pass
        dicc[word] = embedding_matrix[i]

    print('Embedding matrix built.')
    return dicc

In [None]:
# Word -> vector lookup for every word in word_index; words that the embedding
# does not contain map to all-zero vectors.
dicc= w2v(word_index, WORD_MODEL, EMBED_SIZE)

In [None]:
# Display how many words have an entry in the lookup.
len(dicc)

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised tweet by the mean of its word embeddings.

    To use the word embeddings with the classical machine learning models,
    the average vector of all the embeddings of the tweet words is computed.

    Args:
        dicc: Mapping of word to embedding vector. All vectors are expected
            to share one length.

    Attributes are set in ``__init__``: ``dicc`` is the mapping itself and
    ``dim`` is the vector length (0 for an empty mapping).
    """

    def __init__(self, dicc):
        self.dicc = dicc
        # Infer the dimensionality from the vectors instead of assuming 300,
        # so the zero-vector fallback always matches the real embedding size.
        self.dim = len(next(iter(dicc.values()))) if dicc else 0

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return one mean vector per token sequence in ``X``.

        Words absent from ``dicc`` are ignored. A sequence with no known
        words yields a zero vector of length ``dim``.
        """
        lookup = self.dicc
        rows = []
        for words in X:
            known = [lookup[w] for w in words if w in lookup]
            rows.append(np.mean(known or [np.zeros(self.dim)], axis=0))
        return np.array(rows)

# **Training & Testing**

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

# Two-step pipeline: average the word embeddings of each tweet, then classify
# the resulting vector with Bernoulli naive Bayes. BernoulliNB is referenced by
# the name imported above; no `naive_bayes` module name is bound in this file.
NB_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("NB_w2v", BernoulliNB()),
])

In [None]:
# Train the pipeline on the training split, then label the held-out tweets.
NB_w2v = NB_w2v.fit(X, y)
predictions_NB = NB_w2v.predict(X_test)

In [None]:
# Macro-averaged scores (unweighted mean over classes). The labels now say
# "macro" to match average="macro". pos_label is dropped because scikit-learn
# ignores it for non-binary averaging and warns about it; the numbers are unchanged.
print("NB macro-averaged precision-score -> ", precision_score(y_test, predictions_NB, average="macro"))
print("NB macro-averaged recall-score -> ", recall_score(y_test, predictions_NB, average="macro"))
print("NB macro-averaged F1-score -> ", f1_score(y_test, predictions_NB, average="macro"))

In [None]:
# Per-class precision / recall / F1 with four decimals.
# NOTE(review): target_names is applied in label order — confirm that the
# first label value really corresponds to 'Offensive'.
print(
    creport(
        y_test,
        predictions_NB,
        target_names=['Offensive', 'Benign'],
        digits=4,
    )
)
# For the multi-class dataset, pass one name per class instead, e.g.
# target_names=['Literature', 'Sports', 'Judiciary', 'Politics', 'Art', 'Business']
# (order to be confirmed against the dataset's label values).