# Load Dataset:

In [1]:
import pandas as pd

# Load fake and real news
fake_df = pd.read_csv('Fake.csv')
real_df = pd.read_csv('True.csv')

# Label the data
fake_df['label'] = 0
real_df['label'] = 1

# Combine datasets
df = pd.concat([fake_df, real_df]).sample(frac=1).reset_index(drop=True)


# Preprocess Text Data

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raheel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Raheel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Feature Extraction (Vectorization)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']


# Train Classifier

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4748
           1       0.98      0.99      0.99      4232

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



# Explainability with LIME

In [5]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(vectorizer, model)


In [7]:
import lime
import lime.lime_text

explainer = lime.lime_text.LimeTextExplainer(class_names=['Fake', 'Real'])

# Make sure you're using the raw text (not preprocessed one)
idx = 10
text_instance = df['text'].iloc[idx]  # use original 'text' column, not 'clean_text'

exp = explainer.explain_instance(text_instance, pipeline.predict_proba, num_features=10)
exp


<lime.explanation.Explanation at 0x278834aa7b0>

In [8]:
exp.save_to_file('lime_explanation.html')


In [9]:
import joblib
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']