In [1]:
import pandas as pd
import numpy as np
import nltk
import textstat
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

# Fetch the NLTK resources requested by this notebook
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Shared NLP helpers: the spaCy English pipeline and the VADER sentiment scorer
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# Read the dataset, discard every row that has a missing value,
# and make sure the `text` column holds plain strings
data = (
    pd.read_csv("WelFake_Dataset.csv")
    .dropna()
    .assign(text=lambda frame: frame['text'].astype(str))
)

# Show the frame's schema followed by a preview of its first rows
print("Dataset Info:")
data.info()
print("\nFirst few rows:")
data.head()

[nltk_data] Downloading package punkt to C:\Users\Zul
[nltk_data]     Zaki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Zul
[nltk_data]     Zaki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 71537 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  71537 non-null  int64 
 1   title       71537 non-null  object
 2   text        71537 non-null  object
 3   label       71537 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB

First few rows:


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [2]:
from tqdm.notebook import tqdm

def get_readability(text):
    """Return the Flesch reading-ease score of ``text``.

    The score is computed by ``textstat.flesch_reading_ease``. If that call
    raises any ``Exception``, a fixed fallback score of 50 is returned
    instead, so a single problematic document does not abort a bulk apply.
    """
    fallback_score = 50
    try:
        score = textstat.flesch_reading_ease(text)
    except Exception:
        score = fallback_score
    return score

def get_vader_sentiment(text):
    """Return the VADER ``'compound'`` score of ``text``.

    Uses the notebook-level ``sia`` analyzer and picks the ``'compound'``
    entry out of the dictionary returned by ``polarity_scores``.
    """
    polarity = sia.polarity_scores(text)
    return polarity['compound']

def get_lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens are the whitespace-separated pieces produced by ``str.split()``;
    they are compared exactly as written (case and punctuation included).
    Text without any token yields 0.
    """
    tokens = text.split()
    token_count = len(tokens)
    return len(set(tokens)) / token_count if token_count else 0

In [None]:
# Cell 3: Apply Feature Engineering
# Register `progress_apply` on pandas objects so each pass shows a progress bar
tqdm.pandas()

# Output column -> function that derives it from a single document's text.
# The dict order is the order in which the columns are added to `data`.
feature_extractors = {
    'readability': get_readability,
    'vader_sentiment': get_vader_sentiment,
    'lexical_diversity': get_lexical_diversity,
}

for column_name, extractor in feature_extractors.items():
    data[column_name] = data['text'].progress_apply(extractor)

  0%|          | 0/71537 [00:00<?, ?it/s]

  0%|          | 0/71537 [00:00<?, ?it/s]

In [None]:
# Engineered numeric columns used next to the raw text
selected_features = ['readability', 'vader_sentiment', 'lexical_diversity']

# Model input = raw text plus the engineered columns; target = `label`
X, y = data[['text', *selected_features]], data['label']

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts the ``'text'`` entry of its input."""

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X['text']``."""
        text_column = X['text']
        return text_column

class MetaSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns of its input."""

    def __init__(self, feature_names):
        # Stored unmodified under the same name as the constructor argument
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.feature_names``."""
        wanted_columns = self.feature_names
        return X[wanted_columns]

In [None]:
# Meta branch: pick the engineered columns, then standardise them
meta_features = Pipeline(steps=[
    ('select', MetaSelector(selected_features)),
    ('scale', StandardScaler()),
])

# Random Forest fed by the union of two branches:
#   'tfidf' - TF-IDF over the raw text (unigrams and bigrams, capped at 5000 terms)
#   'meta'  - the scaled engineered columns
rf_pipeline = Pipeline(steps=[
    ('features', FeatureUnion(transformer_list=[
        ('tfidf', Pipeline(steps=[
            ('get_text', TextSelector()),
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
        ])),
        ('meta', meta_features),
    ])),
    ('clf', RandomForestClassifier(random_state=42, n_estimators=100)),
])

from tqdm.notebook import tqdm
# One-step bar: it only marks that the fit has started and finished
with tqdm(desc="Fitting Random Forest", total=1) as pbar:
    rf_pipeline.fit(X_train, y_train)
    pbar.update(1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree over the same feature layout as the Random Forest pipeline:
# TF-IDF of the raw text plus the scaled engineered columns.
#
# The meta branch is built fresh here rather than reusing `meta_features`.
# FeatureUnion/Pipeline fit the transformer objects they are handed in place
# (no cloning), so sharing one instance would make this fit overwrite the
# scaler that `rf_pipeline` holds. Separate instances keep the two models
# independent if either is ever refitted on different data.
dt_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', Pipeline([
            ('get_text', TextSelector()),
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2)))
        ])),
        ('meta', Pipeline([
            ('select', MetaSelector(selected_features)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', DecisionTreeClassifier(random_state=42))
])

# One-step bar: it only marks that the fit has started and finished
with tqdm(total=1, desc="Fitting Decision Tree") as pbar:
    dt_pipeline.fit(X_train, y_train)
    pbar.update(1)

In [None]:
# Held-out predictions of both fitted pipelines
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_dt = dt_pipeline.predict(X_test)

# Same report for each model: heading, accuracy, then per-class metrics
model_reports = (
    ("🔍 Random Forest Model", y_pred_rf),
    ("🔍 Decision Tree Model", y_pred_dt),
)
for heading, predictions in model_reports:
    print(heading)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

In [None]:
import joblib
# Persist both fitted pipelines (feature extraction + classifier) to disk.
# The final call stays a bare expression so the cell still displays its result.
joblib.dump(value=rf_pipeline, filename='rf_pipeline.pkl')

joblib.dump(value=dt_pipeline, filename='dt_pipeline.pkl')

In [None]:
import shap
from sklearn.preprocessing import StandardScaler

# Explain the Random Forest on the same kind of matrix it was trained on.
# The classifier inside `rf_pipeline` consumes the FeatureUnion output (TF-IDF
# columns followed by the scaled meta columns), so SHAP must be given that
# matrix. Passing only the three meta columns does not match the model's input
# width, and refitting a new scaler on the test set would not reproduce the
# scaling learned during training.
SHAP_SAMPLE_SIZE = 200  # rows explained; bounds the dense matrix size and SHAP runtime

rf_features = rf_pipeline.named_steps['features']
rf_model = rf_pipeline.named_steps['clf']

# Fixed-seed sample of the held-out rows so the plot is reproducible
X_explain = X_test.sample(n=min(SHAP_SAMPLE_SIZE, len(X_test)), random_state=42)

# Reuse the already-fitted transformers (transform only, no refit on test data).
# The union yields a sparse matrix because of the TF-IDF branch; densify it.
X_explain_matrix = rf_features.transform(X_explain)
if hasattr(X_explain_matrix, "toarray"):
    X_explain_matrix = X_explain_matrix.toarray()

# 'meta' is the last branch of the union, so its columns are the trailing ones
n_meta = len(selected_features)
meta_X_test = X_explain_matrix[:, -n_meta:]  # scaled meta columns as the model saw them

explainer = shap.Explainer(rf_model)
shap_values = explainer(X_explain_matrix)

# For a classifier the attributions may carry a trailing per-class axis;
# in that case keep the attributions towards the class at index 1.
meta_shap_values = shap_values.values
if meta_shap_values.ndim == 3:
    meta_shap_values = meta_shap_values[:, :, 1]
meta_shap_values = meta_shap_values[:, -n_meta:]

# Mean |SHAP| of the engineered features, computed against the full model input
shap.summary_plot(meta_shap_values, X_explain[selected_features], plot_type="bar")