# Preparation

In [79]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score,\
 accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier as Knn

In [80]:
# Spreadsheets holding the train and test splits; later cells read their
# 'Text', 'Fake' and 'Truthful' columns.
train_data_path = "/content/drive/MyDrive/FE_models/Data/Train_data.xlsx"
test_data_path = "/content/drive/MyDrive/FE_models/Data/Test_data.xlsx"

# Read both splits with the same default settings (index becomes 0..n-1).
train_data, test_data = (
    pd.read_excel(path) for path in (train_data_path, test_data_path)
)

# Names of the two label columns (not referenced by the cells visible below).
target_list = ['Fake', 'Truthful']


In [81]:
# Token length every text is padded or truncated to by the tokenizer call in emmbding().
MAX_LEN = 256

In [82]:
# Load the tokenizer and the encoder from the paths below.
# NOTE(review): the path names suggest an AraBERT v2 tokenizer and a fine-tuned
# BERT checkpoint stored on a mounted Drive — confirm.
tokenizer_source = "/content/drive/MyDrive/model-bert-base-arabertv2/Models/tokenizers-bert-base-arabertv2"
model_source = "/content/drive/MyDrive/FE_models/Fine-tuned_BERT/FineTunedBERT"

b_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_source)
b_model = transformers.AutoModel.from_pretrained(model_source, return_dict=True)

# Data

In [83]:
def emmbding(text):
  """Embed one text with the loaded BERT model and return its pooled output.

  The text is tokenized with ``b_tokenizer`` (special tokens added, padded or
  truncated to ``MAX_LEN`` tokens) and passed through ``b_model``; the model's
  ``pooler_output`` is converted to a NumPy array and returned.

  Args:
    text: The text to embed. Callers in this notebook pass a single string.

  Returns:
    numpy.ndarray containing ``pooler_output`` for the encoded text. Callers
    stack these arrays row-wise with ``np.vstack`` or pass one directly to a
    classifier's ``predict``.
  """
  # Named `encoded` rather than `input` so the built-in input() is not shadowed.
  encoded = b_tokenizer.encode_plus(text,
                                    None,  # no second sequence
                                    add_special_tokens=True,
                                    max_length=MAX_LEN,
                                    padding="max_length",
                                    truncation=True,
                                    return_attention_mask=True,
                                    return_tensors="pt"
                                    )
  # Inference only: disabling autograd avoids building a graph for every
  # text, and the result needs no detach() before conversion to NumPy.
  with torch.no_grad():
    output = b_model(input_ids=encoded["input_ids"],
                     attention_mask=encoded["attention_mask"],
                     token_type_ids=encoded["token_type_ids"]
                     )
  return output.pooler_output.numpy()

In [84]:
def _embed_texts(texts):
  """Embed every text in `texts` and return the embeddings as a DataFrame.

  Each text is passed to emmbding(); the resulting arrays are stacked once at
  the end (instead of re-stacking the growing array on every iteration), so
  the cost stays linear in the number of texts. Iterating the column directly
  also avoids mixing a label lookup (`[0]`) with a positional slice (`[1:]`).

  Args:
    texts: Iterable of texts, e.g. a DataFrame column.

  Returns:
    pandas.DataFrame with one row per text, in input order, with a default
    0..n-1 index and integer column labels.
  """
  return pd.DataFrame(np.vstack([emmbding(text) for text in texts]))


# Embed the Text column of the train set
train_embd_texts = _embed_texts(train_data['Text'])

# Embed the Text column of the test set
test_embd_texts = _embed_texts(test_data['Text'])

In [85]:
# Replace the raw text with its embedding columns.
# 'Text' and 'Truthful' are removed from both frames; the remaining columns
# (including the 'Fake' label used later) are kept.
dropped_columns = ['Text', 'Truthful']
train_data.drop(columns=dropped_columns, inplace=True)
test_data.drop(columns=dropped_columns, inplace=True)

# Column-wise concat matches rows by index label; the embedding frames carry
# the default 0..n-1 index, the same as frames read by pd.read_excel defaults.
train_data = pd.concat([train_data, train_embd_texts], axis=1)
test_data = pd.concat([test_data, test_embd_texts], axis=1)


In [86]:
# Labels of the embedding feature columns: the integers 0..767.
N_EMBEDDING_FEATURES = 768
col = list(range(N_EMBEDDING_FEATURES))

# The Model

In [87]:
# k-nearest-neighbours classifier voting over the 5 closest training points;
# weights='distance' lets closer neighbours count more than distant ones.
knn = Knn(n_neighbors=5, weights='distance')

# Training

In [88]:
# Display the training frame: the 'Fake' label column followed by the embedding columns.
train_data

Unnamed: 0,Fake,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,1,-0.473601,-0.323795,-0.284004,0.065375,0.168722,0.603801,0.120702,0.351045,0.612958,...,0.198654,0.205065,-0.546238,-0.336551,-0.376475,-0.503849,0.532969,-0.481151,0.027728,-0.438514
1,0,-0.010770,0.140767,0.054278,-0.414129,-0.325656,-0.341838,-0.687630,-0.484328,-0.519255,...,-0.149488,-0.385988,0.463939,0.043626,0.191616,0.059563,-0.074667,-0.031275,-0.301401,0.371909
2,0,0.147585,0.168619,0.402191,-0.365630,-0.393448,-0.115475,-0.601188,-0.405202,-0.601286,...,-0.096004,-0.285114,0.696214,0.045502,0.209882,0.226134,-0.147225,0.045279,-0.167611,0.318341
3,1,-0.514645,-0.155438,-0.036774,0.014530,0.341492,0.697985,0.061258,0.331818,0.526819,...,0.116317,0.246974,-0.567998,-0.305817,-0.283029,-0.567555,0.507843,-0.220122,0.108277,-0.373246
4,1,-0.480899,-0.242695,-0.109058,0.060867,0.195914,0.559748,-0.002612,0.452169,0.503243,...,0.075119,0.133486,-0.650744,-0.284958,-0.400682,-0.358581,0.621589,-0.357767,0.200112,-0.258069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,0,0.167432,0.115553,0.148834,-0.223361,-0.173407,-0.274314,-0.489178,-0.421409,-0.519275,...,-0.132602,-0.333933,0.209602,0.039792,0.119313,0.510803,-0.039563,-0.001553,-0.060495,0.093963
1378,0,-0.325388,-0.151153,-0.207359,-0.222646,0.071686,-0.126223,-0.437132,-0.490343,-0.309853,...,0.217991,-0.226875,-0.065070,-0.197321,-0.366978,-0.229876,0.223377,0.258430,0.099050,0.279263
1379,1,-0.379335,-0.206486,-0.162637,-0.016917,0.276696,0.560107,0.186683,0.387232,0.552602,...,0.008179,0.146767,-0.524878,-0.240922,-0.380066,-0.510718,0.538389,-0.247118,0.087509,-0.476187
1380,0,0.362520,0.116820,0.042251,-0.104040,-0.241352,-0.545494,-0.415508,-0.557952,-0.563832,...,-0.091419,-0.473848,0.602896,0.171411,0.301621,0.399550,-0.528472,0.361194,0.089855,0.468990


In [89]:
# Fit on all training rows: features are the embedding columns, the target is
# the 'Fake' label. Slicing both with [:-1] here would silently discard the
# last labelled sample.
knn.fit(train_data[col], train_data["Fake"])

# Testing  

In [90]:
# Evaluate on the test split: embedding columns in, 'Fake' labels as ground truth.
X_test = test_data[col]
y_test = test_data['Fake']
pred_y_test = knn.predict(X_test)

In [91]:
# Summary scores on the test split. y_test is the 'Fake' column, so with
# scikit-learn's default positive label (1) precision/recall/F1 describe the
# 'Fake' class.
metrics = {
    "accuracy_score": accuracy_score(y_test, pred_y_test),
    "precision_score": precision_score(y_test, pred_y_test),
    "recall_score": recall_score(y_test, pred_y_test),
    "f1_score": f1_score(y_test, pred_y_test),
}

print(f"metrics: {metrics}")
# target_names follow the sorted label order: 0 -> 'Truthful', 1 -> 'Fake'.
print(classification_report(y_test, pred_y_test, target_names=['Truthful', 'Fake']))


metrics: {'accuracy_score': 0.9450867052023122, 'precision_score': 0.946524064171123, 'recall_score': 0.9516129032258065, 'f1_score': 0.9490616621983914}
              precision    recall  f1-score   support

    Truthful       0.94      0.94      0.94       160
        Fake       0.95      0.95      0.95       186

    accuracy                           0.95       346
   macro avg       0.94      0.94      0.94       346
weighted avg       0.95      0.95      0.95       346



# Interface for testing

In [92]:
def interface(model, text=None):
  """Classify a single review with `model` and print the verdict.

  Args:
    model: Fitted classifier exposing ``predict``; it is called on the array
      returned by ``emmbding``.
    text: Review text to classify. When None (the default) the text is read
      interactively with input(), so existing ``interface(model)`` calls
      behave as before.

  Returns:
    None. Prints 'The review is Fake' for a predicted label of 1 and
    'The review is Truthful' for 0; a blank text or any other label is
    reported explicitly instead of producing no output.
  """
  if text is None:
    text = input("Enter an Arabic text:                              \n ")

  # A blank text carries nothing to classify; skip the model call.
  if not text.strip():
    print('No text was entered')
    return

  embd_text = emmbding(text)
  pred_value = model.predict(embd_text).item()

  if pred_value == 1:
    print('The review is Fake')
  elif pred_value == 0:
    print('The review is Truthful')
  else:
    # Make an unexpected label visible rather than printing nothing.
    print(f'Unexpected prediction value: {pred_value!r}')

In [93]:
# Interactive check: prompts for a review and prints the KNN verdict.
interface(knn)

Enter an Arabic text:                              
 اكره كل هذه الفنادق الكريهة والقذرة اتمنا ان يموت جميع من في العالم لهذا السبب 
The review is Fake


In [94]:
# Second interactive check with a different review.
interface(knn)

Enter an Arabic text:                              
 كان الفندق ذو جودة متوسطة كما هو متوقع ولكن الموظفون كانوا لطيفين جدا 
The review is Truthful


In [95]:
# اكره كل هذه الفنادق الكريهة والقذرة اتمنا ان يموت جميع من في العالم لهذا السبب => FAKE
# كان الفندق ذو جودة متوسطة كما هو متوقع ولكن الموظفون كانوا لطيفين جدا => Truthful

In [100]:
# Save the training frame (label + embedding columns) to Excel, without the index column.
train_data.to_excel('/content/drive/MyDrive/FE_models/Data/train_data_w_emmbd.xlsx', index=False)

In [98]:
# Save the test frame (label + embedding columns) to Excel, without the index column.
test_data.to_excel('/content/drive/MyDrive/FE_models/Data/test_data_w_emmbd.xlsx', index=False)

# Exit

In [None]:
# End the session.
# NOTE(review): in a notebook this presumably stops the kernel, so no cell can run after it — confirm this is intended.
exit()