# Mounting drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing libraries

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn import metrics
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold
from collections import Counter
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Reading dataset from drive

In [3]:
dataSet = pd.read_csv('/content/drive/MyDrive/Medical Misinformation Detection Model/Datasets/Created Dataset/tweets_without_cleaning.csv', sep = ",")

# Hyperparameter for keras model an tensorflow tokenizer and cross validation

In [4]:
vocab_size = 45000
embedding_dim = 18
max_length = 250
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# Experiment

In [5]:
dataSet_text = dataSet['text']
dataSet_label = dataSet['label']
accuracy  = 0
recall    = 0
f1_score  = 0
precision = 0
print('------------------------------------------------------------')
print('!!!!!  dataset used is : tweets without cleaning  !!!!!')
print('------------------------------------------------------------')
#cross validation counter
split = 1
kfold = KFold(n_splits=10, shuffle=True)
#cross validation loop
for train, test in kfold.split(dataSet_text, dataSet_label):
  #convert the vectors to np array
  training_labels = np.array(dataSet_label[train])
  testing_labels  = np.array(dataSet_label[test])
  training_padded = np.array(dataSet_text[train])
  testing_padded  = np.array(dataSet_text[test])
  
  #some output
  print('             !!!!!  split number : '+ str(split) + '  !!!!!')
  print('     ' + f'In training: {Counter(training_labels)}')
  print('     ' + f'In testing: {Counter(testing_labels)}')


########################### SVM model ###########################

  #creat SVM model 
  svm_model = Pipeline([('tfidf',TfidfVectorizer()), ('clf', SVC(probability=True)),])
  #fit the model
  svm_model.fit(training_padded, training_labels)
  #get the predicted lable
  svm_lable_predicted = svm_model.predict(testing_padded)

########################### end of SVM model ###########################


########################### Naive-Bayes model ###########################

  #creat Naive-Bayes model
  Naive_Bayes_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
  #fit the model
  Naive_Bayes_model.fit(training_padded, training_labels)
  #get the predicted lable  
  Naive_Bayes_lable_predicted = Naive_Bayes_model.predict(testing_padded)

########################### end of Naive-Bayes model ###########################


########################### Tensorflow tokenizer and Keras model ###########################

  #create the tokenizer and fit the tokenizer dictionary
  tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
  tokenizer.fit_on_texts(dataSet_text[train])
  #convert texts to sequences
  training_sequences = tokenizer.texts_to_sequences(dataSet_text[train])
  testing_sequences = tokenizer.texts_to_sequences(dataSet_text[test])
  #make the sequences in the same length
  training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)    
  testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  
  #convert the vectors to np array
  training_padded = np.array(training_padded)
  testing_padded = np.array(testing_padded)
  
  #create the keras model with 4 layers
  keras_model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(24, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  #create the metrics array
  metrics_array = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
  ]
  #compile the model
  keras_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=metrics_array)

  #model training with 15 epochs
  history = keras_model.fit(training_padded, training_labels, epochs=15, validation_data=(testing_padded, testing_labels), verbose=2)

  # testing the model
  keras_lable_predicted = keras_model.predict(testing_padded)
  keras_lable_predicted = keras_lable_predicted.flatten()
  keras_lable_predicted = np.where(keras_lable_predicted > 0.5, True, False)

########################### end of Tensorflow tokenizer and Keras model ###########################


  # create final predicted result
  lable_predicted = []
  for i in range (0, len(testing_labels)):
    if  (keras_lable_predicted[i] and  Naive_Bayes_lable_predicted[i]) or ( keras_lable_predicted[i] and svm_lable_predicted[i]) or ( Naive_Bayes_lable_predicted[i] and svm_lable_predicted[i]):
      lable_predicted.append(True)
    else:
      lable_predicted.append(False)

  #create and show the confusion matrix
  cm = metrics.confusion_matrix(testing_labels, lable_predicted)
  sn.heatmap(cm, annot=True, fmt='d')
  plt.xlabel('Predicted')
  plt.ylabel('Truth')
  plt.title('confusion matrix')
  plt.show()

  #show the classification report 
  print(metrics.classification_report(testing_labels, lable_predicted))

  #add the new metrics result from the new cross val. to the old  metrics result from the old cross val.
  accuracy  = accuracy + metrics.accuracy_score(testing_labels, lable_predicted)
  recall    = recall + metrics.recall_score(testing_labels, lable_predicted)
  f1_score  = f1_score + metrics.f1_score(testing_labels, lable_predicted)
  precision = precision + metrics.precision_score(testing_labels, lable_predicted)
  #indexing
  split = split + 1

#compute the mean of the using metrics in the cross val.
accuracy  = accuracy / 10
recall    = recall / 10
f1_score  = f1_score / 10
precision = precision / 10
#some output
print("mean accuracy for this experiment is " + str(round(accuracy, 2)))
print("mean recall for this experiment is " + str(round(recall, 2)))
print("mean precision for this experiment is " + str(round(precision, 2)))
print("mean f1_score for this experiment is " + str(round(f1_score, 2)))


------------------------------------------------------------
!!!!!  dataset used is : tweets without cleaning  !!!!!
------------------------------------------------------------
             !!!!!  split number : 1  !!!!!
     In training: Counter({True: 8939, False: 8927})
     In testing: Counter({False: 999, True: 987})


ValueError: ignored