In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install arabert
!pip install transformers

In [None]:
import pandas as pd
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer,AutoModel
import torch
from tqdm import tqdm
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Dropout,Conv1D,MaxPooling1D,Reshape
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
# Switch the working directory to the dataset folder on the mounted Drive so
# that relative paths such as 'labeled_data.csv' resolve inside it.
%cd /content/drive/MyDrive/Gp\ Dataset

In [None]:
# Load the labelled dataset, shuffle every row, then keep a 10% subsample.
# Each sampling step gets its own freshly seeded RandomState, so the result is
# the same on every run.
labeledData = (
    pd.read_csv('labeled_data.csv')
    .sample(frac=1, random_state=np.random.RandomState(42))
    .sample(frac=0.1, random_state=np.random.RandomState(42))
    .reset_index(drop=True)
)

# Encode the textual classes as integers: 'not credible' -> 0, 'credible' -> 1.
labeledData['label'] = labeledData['label'].replace(
    to_replace=['not credible', 'credible'],
    value=[0, 1],
)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
train_data, val_data = train_test_split(
    labeledData,
    test_size=0.2,
    random_state=42,
)

# Per-class row counts of each split (plotted in the following cells).
train_value_counts, val_value_counts = (
    split['label'].value_counts() for split in (train_data, val_data)
)

In [None]:
# Class balance of the training split.
fig, ax = plt.subplots()
ax.pie(
    train_value_counts,
    labels=train_value_counts.index,
    autopct='%1.1f%%',
)
ax.set_title('Pie Chart of Train 0s and 1s')
plt.show()

In [None]:
# Class balance of the validation split.
fig, ax = plt.subplots()
ax.pie(
    val_value_counts,
    labels=val_value_counts.index,
    autopct='%1.1f%%',
)
ax.set_title('Pie Chart of Val 0s and 1s')
plt.show()

In [None]:
# Checkpoint used for text preprocessing, tokenization and embedding extraction.
model_name = "aubmindlab/bert-base-arabertv2"

# NOTE: an earlier revision built an ArabertPreprocessor(model_name="", ...) here
# and threw the instance away, so none of its options were ever applied.
# TODO(review): if custom preprocessing flags (keep_emojis, strip_tashkeel, ...)
# are wanted, pass them to the constructors below instead.

# Training-side preprocessing / tokenization / encoder.
train_arabert_prep = ArabertPreprocessor(model_name=model_name)
train_arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)
train_arabert_model = AutoModel.from_pretrained(model_name)

# Validation keeps its own preprocessor and tokenizer instances, but reuses the
# encoder: data_generator only runs it under torch.no_grad() for inference, so
# loading a second copy of the same weights would only cost extra memory.
val_arabert_prep = ArabertPreprocessor(model_name=model_name)
val_arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)
val_arabert_model = train_arabert_model

In [None]:
def create_model1(numoffeatures):
    """Build and compile a small Embedding -> LSTM -> sigmoid binary classifier.

    Args:
        numoffeatures: Passed to the Embedding layer as ``input_dim``.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        binary cross-entropy loss and an accuracy metric. The model summary
        is printed before compilation.
    """
    stack = [
        Embedding(input_dim=numoffeatures, output_dim=50),
        LSTM(units=64),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.summary()
    net.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net

def create_model2(numoffeatures):
    """Build and compile a larger, dropout-regularised LSTM binary classifier.

    Layer order: Embedding(256) -> SpatialDropout1D(0.3) -> LSTM(256, with
    input and recurrent dropout of 0.3) -> Dense(256, relu) -> Dropout(0.3)
    -> Dense(1, sigmoid).

    Args:
        numoffeatures: Passed to the Embedding layer as ``input_dim``.

    Returns:
        The compiled Sequential model; its summary is printed after compiling.
    """
    stack = [
        Embedding(input_dim=numoffeatures, output_dim=256),
        SpatialDropout1D(0.3),
        LSTM(256, dropout=0.3, recurrent_dropout=0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    net.summary()
    return net

def create_model3(numoffeatures):
    """Build and compile a Conv1D + stacked-LSTM binary classifier.

    Layer order: Embedding(100) -> Dropout(0.2) -> Conv1D(64 filters, kernel 5,
    relu) -> MaxPooling1D(4) -> LSTM(20, returning sequences) -> LSTM(20)
    -> Dropout(0.2) -> Dense(512) -> Dropout(0.3) -> Dense(256)
    -> Dense(1, sigmoid).

    Args:
        numoffeatures: Passed to the Embedding layer as its first positional
            argument.

    Returns:
        The compiled Sequential model; its summary is printed after compiling.
    """
    stack = [
        Embedding(numoffeatures, 100),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(20, return_sequences=True),
        LSTM(20),
        Dropout(0.2),
        Dense(512),
        Dropout(0.3),
        Dense(256),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    net.summary()
    return net

def create_model4(numoffeatures, max_len=64):
    """Build and compile a dense binary classifier over flattened embeddings.

    Each sample of shape ``(max_len, numoffeatures)`` is flattened into a
    single vector and passed through Dense(64, relu) -> Dense(32, relu)
    -> Dense(1, sigmoid).

    Args:
        numoffeatures: Size of the last axis of each input sample.
        max_len: Size of the first (sequence) axis of each input sample.
            Defaults to 64, the value that used to be hard-coded, so
            ``create_model4(768)`` builds exactly the same network as before.

    Returns:
        The Sequential model, compiled with the 'adam' optimizer,
        binary cross-entropy loss and an accuracy metric. The model summary
        is printed before compilation.
    """
    model=Sequential()
    # The input shape is derived from the arguments instead of a fixed 64x768,
    # so the network always matches the feature size the caller asked for.
    model.add(Reshape((max_len*numoffeatures,), input_shape=(max_len, numoffeatures)))
    # No input_dim here: this is not the first layer, so its input size is
    # inferred from the Reshape output above.
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1,activation='sigmoid'))
    model.summary()
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [None]:
def data_generator(data,preprocesser,tokenizer,model,max_len,batch_size):
  """Endlessly yield ``(x, y)`` batches of encoder embeddings and labels.

  The rows of ``data`` are walked in order, ``batch_size`` at a time, and the
  walk restarts from the first row after the last batch, so the generator
  never terminates by itself.

  Args:
    data: DataFrame with a 'title' column (text) and a 'label' column.
    preprocesser: Object exposing ``preprocess(text)``.
    tokenizer: Callable tokenizer accepting a list of strings together with
      ``padding``, ``truncation``, ``max_length`` and ``return_tensors``; the
      result must provide 'input_ids' and 'attention_mask'.
    model: Callable encoder taking ``input_ids`` and ``attention_mask``; the
      first element of its output is used as the embeddings.
    max_len: Length every sequence is padded / truncated to.
    batch_size: Maximum number of rows per yielded batch (the final batch of
      a pass may be smaller).

  Yields:
    Tuple ``(x, y)``: ``x`` is the encoder output for the batch as a NumPy
    array (presumably ``(rows, max_len, hidden_size)`` for a BERT-style
    encoder -- confirm for other models); ``y`` holds the matching 'label'
    values.

  Raises:
    ValueError: If ``data`` is empty or ``batch_size`` is not positive. Either
      case would otherwise make the loop spin forever without yielding.
  """
  num_samples = len(data)
  if num_samples == 0:
    raise ValueError("data_generator received an empty dataset")
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  while True:
    for offset in range(0, num_samples, batch_size):
      batch = data.iloc[offset:offset + batch_size]
      texts = [preprocesser.preprocess(title) for title in batch['title']]
      # Encode the preprocessed text directly. Tokenizing first, joining the
      # pieces with spaces and encoding again would re-split the '##'
      # word-piece markers and corrupt the input ids.
      encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
      )
      # One forward pass per batch; the attention mask keeps the padding
      # positions from influencing the embeddings of the real tokens.
      with torch.no_grad():
        hidden = model(
          input_ids=encoded['input_ids'],
          attention_mask=encoded['attention_mask'],
        )[0]
      x = hidden.cpu().numpy()
      y = batch['label'].values
      yield x, y

In [None]:
# Endless batch generators feeding model.fit: sequences are padded/truncated
# to 64 tokens and served 64 rows at a time.
train_gen = data_generator(
    data=train_data,
    preprocesser=train_arabert_prep,
    tokenizer=train_arabert_tokenizer,
    model=train_arabert_model,
    max_len=64,
    batch_size=64,
)
val_gen = data_generator(
    data=val_data,
    preprocesser=val_arabert_prep,
    tokenizer=val_arabert_tokenizer,
    model=val_arabert_model,
    max_len=64,
    batch_size=64,
)

In [None]:
batch_size = 64

# Number of batches needed to see every row once: ceil(rows / batch_size),
# written as negated floor division so a final partial batch counts as a step.
train_steps_per_epoch = -(-len(train_data) // batch_size)
print(train_steps_per_epoch)

val_steps_per_epoch = -(-len(val_data) // batch_size)
print(val_steps_per_epoch)

In [None]:
# Dense classifier over the flattened 768-dimensional token embeddings.
model = create_model4(768)

# Save the weights after every epoch; the epoch number goes into the file name.
checkpoint_callback = ModelCheckpoint(
    filepath='model_weights_{epoch:02d}.h5',
    save_weights_only=True,
)

# The generators never end, so the step counts tell Keras where an epoch stops.
model.fit(
    train_gen,
    steps_per_epoch=train_steps_per_epoch,
    validation_data=val_gen,
    validation_steps=val_steps_per_epoch,
    epochs=5,
    verbose=1,
    callbacks=[checkpoint_callback],
)