In [4]:
!pip install pandas scikit-learn nltk



In [5]:
!pip install xgboost



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb


# OLD
# from sklearn.model_selection import train_test_split
# from sklearn import svm
# import random


#  NEW
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV


## Train-Test split for TFIDF vectorizer

In [13]:
# Load the dataset
df = pd.read_csv('../final_datasets/articles_dataset.csv')

# Encode the labels (real/fake)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # Assuming 'label' column has 'real' and 'fake' values

# Split the data into training and testing sets (80% train, 20% test)
X_train_texts, X_test_texts, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=42)

# Fit the TF-IDF vectorizer on the training data only, then transform both sets
# tfidf_vectorizer = TfidfVectorizer(max_features=50) #old
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # new: Increased features, added n-grams
X_train = tfidf_vectorizer.fit_transform(X_train_texts)
X_test = tfidf_vectorizer.transform(X_test_texts)

# Check the shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5096, 5000)
(1274, 5000)
(5096,)
(1274,)


### SVM
- **Code source:** Support Vector Machine (SVM): https://www.kaggle.com/code/mehmetlaudatekman/text-classification-svm-explained

Support Vector Machine (SVM) is a supervised machine learning technique commonly applied to classification problems, like fake news detection. In this context, SVM works by separating real and fake news articles using a decision boundary based on the features extracted from Arabic text data. For instance, these features might include word frequencies, linguistic patterns, or even word embeddings tailored for Arabic, which capture contextual relationships in the text.

In [14]:
from sklearn.model_selection import cross_val_score


# Train the SVM classifier

# OLD
# svm_classifier = svm.SVC(kernel='linear', random_state=42) 

# # Perform cross-validation to confirm that the number of tfidf features was enough by checking consistency of accuracy among folds
# cv_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5) 

# # Print cross-validation results
# print(f'Cross-Validation Scores: {cv_scores}')
# print(f'Mean Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')


# NEW
svm_model = SVC(random_state=42) 
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],           # Regularization strength
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernels to try
    'gamma': ['scale', 'auto'],       # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [2, 3, 4]               # Degree for polynomial kernel
}



In [None]:
# OLD
# svm_classifier.fit(X_train, y_train)

# # Predict on the test set
# y_pred = svm_classifier.predict(X_test)


# NEW
# Perform Grid Search
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Evaluate on the test set with the best model
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)




# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation results
print("SVM Results:")
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Fitting 5 folds for each of 96 candidates, totalling 480 fits


### XGBOOST
- **Code source:** eXtreme Gradient Boosting (XGBoost): https://www.kaggle.com/code/iamarjunchandra/text-classification-with-rnn-xgboost

In [None]:
# Split the data into training and testing sets (80% train, 20% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', random_state=42)

# OLD
# # Step 2: Train the classifier
# xgb_classifier.fit(X_train, y_train)

# # Step 3: Predict on the test set
# y_pred_xgb = xgb_classifier.predict(X_test)


# NEW
# Step 2: Define the hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage
    'max_depth': [3, 5, 7],                 # Maximum depth of a tree
    'n_estimators': [50, 100, 200],         # Number of boosted trees to fit
    'subsample': [0.8, 1.0],                # Fraction of samples used per tree
    'colsample_bytree': [0.8, 1.0],         # Fraction of features used per tree
    'gamma': [0, 1, 5],                     # Minimum loss reduction required for a split
    'reg_alpha': [0, 0.1, 1],               # L1 regularization term
    'reg_lambda': [1, 10, 100]              # L2 regularization term
}

# Step 3: Perform Grid Search
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train, y_train)

# Step 4: Best parameters and score
print("Best Parameters:", grid_search_xgb.best_params_)
print("Best Cross-Validation Accuracy:", grid_search_xgb.best_score_)

# Step 5: Evaluate on the test set with the best model
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)






# Step 4: Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

# Step 5: Print the evaluation results for XGBoost
print("XGBoost Results:")
print(f'Accuracy: {accuracy_xgb:.4f}')
print(f'Precision: {precision_xgb:.4f}')
print(f'Recall: {recall_xgb:.4f}')
print(f'F1 Score: {f1_xgb:.4f}')

# Comparison with SVM

Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Accuracy: 0.9757
Precision: 0.9924
Recall: 0.9618
F1 Score: 0.9769
