<a href="https://colab.research.google.com/github/MoncefDj/AI-DS-Masters/blob/main/NLP_Arabic_fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Fake News Detection
**By:**
- Djezza Moncef
- Sadoudi Abdessamad

## Install necessary libraries

In [None]:
!pip install pandas numpy scikit-learn nltk --quiet

## Import libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below: the tokenizer data behind word_tokenize
# ('punkt' and 'punkt_tab') and the stopword corpus used as the Arabic fallback list.
# The earlier nltk.download('tokenizers/punkt') call was removed: that string is a
# data path rather than a package id, and the downloader rejected it with
# "Package 'tokenizers/punkt' not found in index".
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading tokenizers/punkt: Package 'tokenizers/punkt'
[nltk_data]     not found in index
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Define Arabic stopwords


In [None]:
# Build the Arabic stopword set consulted by preprocess_text.
# The optional `arabicstopwords` package is used when it is installed; otherwise
# NLTK's Arabic stopword corpus is the fallback.
try:
    from arabicstopwords.arabicstopwords import stopwords_list as _arabic_stopwords
except ImportError:
    stopwords_list = set(stopwords.words('arabic'))
else:
    # Per the package README, `stopwords_list` is a function returning the words,
    # so binding it directly would make `word not in stopwords_list` raise a
    # TypeError. Call it when callable and store a set for fast membership tests.
    stopwords_list = set(
        _arabic_stopwords() if callable(_arabic_stopwords) else _arabic_stopwords
    )

## Mount Google Drive


In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced below can
# be opened with ordinary file paths (the google.colab module requires a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the Arabic Fake News Dataset (AFND) on the mounted Drive
dataset_path = '/content/drive/MyDrive/study/Data/NLP Arabic fake news detection dataset/Dataset'
sources_path = '/content/drive/MyDrive/study/Data/NLP Arabic fake news detection dataset/sources.json'

# sources.json is iterated below as {source name: credibility label}
with open(sources_path, 'r', encoding='utf-8') as f:
    sources = json.load(f)

# One record per article, tagged with the label of the source it was scraped from
data = []

for source, label in sources.items():
    source_path = os.path.join(dataset_path, source, 'scraped_articles.json')
    # Sources that have no scraped_articles.json file are skipped
    if not os.path.exists(source_path):
        continue
    with open(source_path, 'r', encoding='utf-8') as f:
        articles = json.load(f)['articles']
    data.extend(
        {
            'title': article['title'],
            'text': article['text'],
            'published_date': article['published date'],
            'label': label,
        }
        for article in articles
    )

# Collect every record into a single frame
df = pd.DataFrame(data)

# Preview the first rows
df.head()

Unnamed: 0,title,text,published_date,label
0,المنتخب الوطني المغربي لأقل من 20 سنة يخوض تجم...,يخوض المنتخب الوطني المغربي لكرة القدم لأقل من...,2021-05-23T00:00:00,credible
1,وزير النقل اعمارة:السرعة عامل مسبب لحوادث السي...,ترأس عبد القادر اعمارة، وزير التجهيز و النقل و...,2021-05-22T00:00:00,credible
2,"ميسي يؤكد أن الفوز بكأس اسبانيا كان ""نقطة تحول""",أكد النجم الارجنتيني ليونيل ميسي أن فوز فريقه ...,2021-05-22T00:00:00,credible
3,دبلوماسي مغربي سابق بإسبانيا لـ2M.ma: خطأ مدري...,أكّد الدبلوماسي المغربي السابق بإسبانيا، عبد ا...,2021-05-22T00:00:00,credible
4,شركتان، فرنسية وبريطانية تعلنان عن نتائج إيجاب...,أعلنت شركتا سانوفي الفرنسية العملاقة في تصنيع ...,2021-05-17T00:00:00,credible


## Data Preprocessing

In [None]:
# Combine title and text for better context
df['content'] = df['title'] + " " + df['text']

# Keep only the model input and the target
df = df[['content', 'label']]

# Map labels to numeric values; a label that is missing from this mapping becomes NaN
label_mapping = {'credible': 1, 'not credible': 0, 'undecided': -1}
df['label'] = df['label'].map(label_mapping)

# Drop rows that cannot be used for training:
#  - NaN content (title or text was missing), which the tokenizer cannot process
#  - NaN labels (source label not in label_mapping); NaN != -1 is True, so the
#    undecided filter alone would let them through
df = df.dropna(subset=['content', 'label'])

# Drop rows with undecided labels and keep the target as an integer column
df = df[df['label'] != -1].astype({'label': int})


In [None]:
# Show the frame after label mapping and filtering (the notebook renders a truncated preview)
df

Unnamed: 0,content,label
0,المنتخب الوطني المغربي لأقل من 20 سنة يخوض تجم...,1
1,وزير النقل اعمارة:السرعة عامل مسبب لحوادث السي...,1
2,"ميسي يؤكد أن الفوز بكأس اسبانيا كان ""نقطة تحول...",1
3,دبلوماسي مغربي سابق بإسبانيا لـ2M.ma: خطأ مدري...,1
4,شركتان، فرنسية وبريطانية تعلنان عن نتائج إيجاب...,1
...,...,...
606907,إجراءات إيطالية صارمة ضد الجماهير الإنقليزية ش...,1
606908,الأزهر يحذّر من لعبة ''فورتنايت'' حذّر مركز ال...,1
606909,يورو 2020: بوتين يدافع عن استضافة مباريات في س...,1
606910,ميركل تشعر بخيبة أمل بعد خروج منتخبها الوطني م...,1


## Text Cleaning and Tokenization

In [None]:
# Arabic diacritics (tashkeel U+064B-U+0652 and superscript alef U+0670) mapped to
# None so that str.translate deletes them.
_ARABIC_DIACRITICS = dict.fromkeys([*range(0x064B, 0x0653), 0x0670])


def preprocess_text(text):
    """Clean one document and return it as a space-separated token string.

    Steps: strip Arabic diacritics, tokenize with ``word_tokenize``, then keep
    only alphanumeric tokens that are not in ``stopwords_list``.

    Diacritics are removed first because they are combining marks, for which
    ``str.isalnum()`` is False; without this step a word such as "استعداداً"
    would be discarded entirely instead of being kept as "استعدادا".

    Args:
        text: The raw document. Non-string values (None, NaN) are treated as
            an empty document.

    Returns:
        The retained tokens joined by single spaces; ``''`` if nothing is left.
    """
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.translate(_ARABIC_DIACRITICS))
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stopwords_list
    )

# Clean every document in place: each entry of 'content' is replaced by its
# preprocessed, space-joined token string
df['content'] = df['content'].map(preprocess_text)


## Splitting Data and Vectorization

In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training split only,
# then apply that same fitted vectorizer to the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

## Model Training and Evaluation

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out split
y_pred = model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7097278030677222

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.56      0.63     33539
           1       0.70      0.83      0.76     41370

    accuracy                           0.71     74909
   macro avg       0.71      0.70      0.70     74909
weighted avg       0.71      0.71      0.70     74909



## Save the Model and Vectorizer

In [None]:
import pickle

# Persist the trained classifier first, then the fitted vectorizer; both are
# needed to score new text later
for filename, artifact in (
    ('fake_news_model.pkl', model),
    ('tfidf_vectorizer.pkl', vectorizer),
):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


## Load and Use the Model

In [None]:
# Restore the classifier written by the save step.
# NOTE: pickle.load executes code embedded in the file, so only load files
# produced by this notebook.
with open('fake_news_model.pkl', mode='rb') as f:
    loaded_model = pickle.load(f)

# Restore the matching TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', mode='rb') as f:
    loaded_vectorizer = pickle.load(f)

In [None]:
# Example test cases
test_cases = [
    "المنتخب الوطني المغربي يفوز في مباراة ودية استعداداً للتصفيات المؤهلة لكأس العالم.",
    "مخلوقات فضائية شوهدت في سماء الدار البيضاء، حسب شهود عيان.",
    "وزارة الصحة المغربية تنشر تقريراً عن انخفاض معدلات الإصابة بفيروس كورونا.",
    "دراسة تؤكد أن شرب الماء في وقت متأخر يؤدي إلى اختفاء الوزن في أسبوع."
]

# Numeric class -> readable label; built once instead of on every loop iteration
label_reverse_mapping = {1: 'credible', 0: 'not credible'}

# Apply the same cleaning used for the training data, then vectorize and classify
# all sentences in one batch rather than one transform/predict call per sentence
processed_texts = [preprocess_text(text) for text in test_cases]
vec = loaded_vectorizer.transform(processed_texts)
predictions = loaded_model.predict(vec)

# Test the model
for i, (text, prediction) in enumerate(zip(test_cases, predictions), start=1):
    print(f"Test Case {i}: {text}")
    print(f"Prediction: {label_reverse_mapping[prediction]}\n")


Test Case 1: المنتخب الوطني المغربي يفوز في مباراة ودية استعداداً للتصفيات المؤهلة لكأس العالم.
Prediction: not credible

Test Case 2: مخلوقات فضائية شوهدت في سماء الدار البيضاء، حسب شهود عيان.
Prediction: credible

Test Case 3: وزارة الصحة المغربية تنشر تقريراً عن انخفاض معدلات الإصابة بفيروس كورونا.
Prediction: credible

Test Case 4: دراسة تؤكد أن شرب الماء في وقت متأخر يؤدي إلى اختفاء الوزن في أسبوع.
Prediction: not credible



# Model Comparison: Logistic Regression, Random Forest, SVM, XGBoost, and MLP

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn nltk xgboost --quiet

# Import libraries
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the tokenizer data and the stopword corpus used in this section.
# 'punkt_tab' is downloaded here as well, matching the first part of the notebook,
# so that this section does not depend on that earlier cell having been run.
# NOTE(review): confirm which of 'punkt' / 'punkt_tab' the installed NLTK needs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so the dataset paths defined below resolve
# (the google.colab module requires a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# Locations of the dataset folder and of the source -> label index on the mounted Drive
dataset_path = '/content/drive/MyDrive/study/Data/NLP Arabic fake news detection dataset/Dataset'
sources_path = '/content/drive/MyDrive/study/Data/NLP Arabic fake news detection dataset/sources.json'

# sources.json is iterated below as {source name: credibility label}
with open(sources_path, 'r', encoding='utf-8') as f:
    sources = json.load(f)

# One record per article, tagged with the label of the source it was scraped from
data = []

for source, label in sources.items():
    source_path = os.path.join(dataset_path, source, 'scraped_articles.json')
    # Sources that have no scraped_articles.json file are skipped
    if not os.path.exists(source_path):
        continue
    with open(source_path, 'r', encoding='utf-8') as f:
        articles = json.load(f)['articles']
    data.extend(
        {
            'title': article['title'],
            'text': article['text'],
            'published_date': article['published date'],
            'label': label,
        }
        for article in articles
    )

# Collect every record into a single frame
df = pd.DataFrame(data)

# Combine title and text for better context
df['content'] = df['title'] + " " + df['text']

# Keep only the model input and the target
df = df[['content', 'label']]

# Map labels to numeric values; a label that is missing from this mapping becomes NaN
label_mapping = {'credible': 1, 'not credible': 0, 'undecided': -1}
df['label'] = df['label'].map(label_mapping)

# Drop rows that cannot be used for training:
#  - NaN content (title or text was missing), which the tokenizer cannot process
#  - NaN labels (source label not in label_mapping); NaN != -1 is True, so the
#    undecided filter alone would let them through
df = df.dropna(subset=['content', 'label'])

# Drop rows with undecided labels and keep the target as an integer column
df = df[df['label'] != -1].astype({'label': int})

# Check the distribution of labels
df['label'].value_counts()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,207310
0,167233


In [None]:
# Define Arabic stopwords.
# The optional `arabicstopwords` package is used when it is installed; otherwise
# NLTK's Arabic stopword corpus is the fallback.
try:
    from arabicstopwords.arabicstopwords import stopwords_list as _arabic_stopwords
except ImportError:
    stopwords_list = set(stopwords.words('arabic'))
else:
    # Per the package README, `stopwords_list` is a function returning the words,
    # so binding it directly would make `word not in stopwords_list` raise a
    # TypeError. Call it when callable and store a set for fast membership tests.
    stopwords_list = set(
        _arabic_stopwords() if callable(_arabic_stopwords) else _arabic_stopwords
    )

# Arabic diacritics (tashkeel U+064B-U+0652 and superscript alef U+0670) mapped to
# None so that str.translate deletes them.
_ARABIC_DIACRITICS = dict.fromkeys([*range(0x064B, 0x0653), 0x0670])


def preprocess_text(text):
    """Clean one document and return it as a space-separated token string.

    Steps: strip Arabic diacritics, tokenize with ``word_tokenize``, then keep
    only alphanumeric tokens that are not in ``stopwords_list``.

    Diacritics are removed first because they are combining marks, for which
    ``str.isalnum()`` is False; without this step any word written with
    tashkeel would be discarded entirely instead of being kept without its marks.

    Args:
        text: The raw document. Non-string values (None, NaN) are treated as
            an empty document.

    Returns:
        The retained tokens joined by single spaces; ``''`` if nothing is left.
    """
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.translate(_ARABIC_DIACRITICS))
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stopwords_list
    )

# Clean every document in place: each entry of 'content' is replaced by its
# preprocessed, space-joined token string
df['content'] = df['content'].map(preprocess_text)

# Preview the cleaned text
df.head()


Unnamed: 0,content,label
0,المنتخب الوطني المغربي لأقل 20 سنة يخوض تجمعا ...,1
1,وزير النقل اعمارة السرعة عامل مسبب لحوادث السي...,1
2,ميسي يؤكد الفوز بكأس اسبانيا نقطة تحول أكد الن...,1
3,دبلوماسي مغربي سابق بإسبانيا خطأ مدريد الرباط ...,1
4,فرنسية وبريطانية تعلنان نتائج إيجابية لقاح ضد ...,1


In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training split only,
# then apply that same fitted vectorizer to the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Logistic regression on the TF-IDF features: seeded, with the solver allowed
# up to 1000 iterations; construction and fitting are chained
logistic_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vec, y_train)

# Predict labels for the held-out split
logistic_y_pred = logistic_model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1
print("Logistic Regression Accuracy:", accuracy_score(y_test, logistic_y_pred))
print("\nClassification Report:\n", classification_report(y_test, logistic_y_pred))


Logistic Regression Accuracy: 0.7750470570959431

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.71      0.74     33539
           1       0.78      0.83      0.80     41370

    accuracy                           0.78     74909
   macro avg       0.77      0.77      0.77     74909
weighted avg       0.77      0.78      0.77     74909



In [None]:
# Random forest of 100 seeded trees on the TF-IDF features; construction and
# fitting are chained
random_forest_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_vec, y_train)

# Predict labels for the held-out split
rf_y_pred = random_forest_model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1
print("Random Forest Accuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))


Random Forest Accuracy: 0.821490074623877

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.78      0.80     33539
           1       0.83      0.86      0.84     41370

    accuracy                           0.82     74909
   macro avg       0.82      0.82      0.82     74909
weighted avg       0.82      0.82      0.82     74909



In [None]:
# Linear support-vector classifier (seeded) on the TF-IDF features;
# construction and fitting are chained
svm_model = LinearSVC(random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out split
svm_y_pred = svm_model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1
print("SVM Accuracy:", accuracy_score(y_test, svm_y_pred))
print("\nClassification Report:\n", classification_report(y_test, svm_y_pred))


SVM Accuracy: 0.7780640510486057

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.72      0.74     33539
           1       0.78      0.83      0.80     41370

    accuracy                           0.78     74909
   macro avg       0.78      0.77      0.77     74909
weighted avg       0.78      0.78      0.78     74909



In [None]:
# Initialize the XGBoost model.
# `use_label_encoder` is no longer passed: the installed XGBoost reported it as an
# unused parameter ('Parameters: { "use_label_encoder" } are not used.'), so it
# only produced a warning.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Train the model
xgb_model.fit(X_train_vec, y_train)

# Make predictions
xgb_y_pred = xgb_model.predict(X_test_vec)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_y_pred))
print("\nClassification Report:\n", classification_report(y_test, xgb_y_pred))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7952448971418654

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.69      0.75     33539
           1       0.78      0.88      0.83     41370

    accuracy                           0.80     74909
   macro avg       0.80      0.79      0.79     74909
weighted avg       0.80      0.80      0.79     74909



In [None]:
# Multi-layer perceptron with a single hidden layer of 100 units, seeded and
# limited to 500 training iterations; construction and fitting are chained
mlp_model = MLPClassifier(
    random_state=42,
    max_iter=500,
    hidden_layer_sizes=(100,),
).fit(X_train_vec, y_train)

# Predict labels for the held-out split
mlp_y_pred = mlp_model.predict(X_test_vec)

# Report overall accuracy, then per-class precision / recall / F1
print("MLP Accuracy:", accuracy_score(y_test, mlp_y_pred))
print("\nClassification Report:\n", classification_report(y_test, mlp_y_pred))


MLP Accuracy: 0.8064451534528562

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78     33539
           1       0.82      0.84      0.83     41370

    accuracy                           0.81     74909
   macro avg       0.80      0.80      0.80     74909
weighted avg       0.81      0.81      0.81     74909

