In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import string



In [2]:
# Step 2: Load Dataset
# low_memory=False turns off chunked parsing, so column dtypes are inferred
# from the file as a whole rather than chunk by chunk.
df = pd.read_csv(filepath_or_buffer='Fake.csv', low_memory=False)

In [3]:
# Drop unnecessary columns
# Build a boolean mask of column names starting with 'Unnamed', then keep
# only the columns that are NOT flagged.
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]


In [4]:
# Debug: Check column names
# Show which columns survived the previous filtering step.
column_index = df.columns
print("Columns in dataset:", column_index)



Columns in dataset: Index(['title', 'text', 'subject', 'date'], dtype='object')


In [5]:
# Step 3: Clean Text Function
def clean_text(text):
    """Normalise a raw article string before vectorisation.

    The steps run in this order: lower-case the text, remove
    square-bracketed spans, remove URLs, remove angle-bracket tags, remove
    ASCII punctuation, turn line breaks into spaces, and finally remove
    every token that contains a digit.

    Args:
        text: Value to clean. Anything that is not already a string is
            converted with ``str()`` first.

    Returns:
        str: The cleaned, lower-cased text. Runs of spaces left behind by
        removed tokens are not collapsed.
    """
    text = str(text).lower()
    # Non-greedy so each "[...]" span is removed separately.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space rather than deleting them: deleting
    # them glued the last word of one line onto the first word of the next
    # ("hello\nworld" -> "helloworld"), producing bogus tokens.
    text = re.sub(r'\n', ' ', text)
    # Drop any token containing at least one digit (e.g. "covid19", "2017").
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Validate the schema BEFORE touching any column, so a malformed CSV fails
# with one clear message instead of an opaque KeyError part-way through.
required_columns = ['title', 'text', 'subject']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which throws away every variable, whereas an
    # exception just stops this cell (and "Run All") with a traceback.
    raise KeyError(
        f"Error: required column(s) not found in the dataset: {missing_columns}"
    )

# Combine 'title' and 'text' columns
# NOTE: this overwrites df['text'], so re-running the cell prepends the
# title a second time; reload the data before re-running.
combined_text = df['title'] + ' ' + df['text']

# na_action='ignore' leaves missing values as NaN instead of passing them to
# clean_text(), which would turn them into the literal string 'nan' and hide
# them from the dropna() performed in the label-preparation step.
df['text'] = combined_text.map(clean_text, na_action='ignore')



In [6]:
# Step 4: Prepare Labels and Features
df = df.dropna()  # Drop rows with any missing values

# Debug: Check unique values in 'subject' column
print("Unique values in 'subject':", df['subject'].unique())

# Define label mapping (subject -> binary class used as the target).
# NOTE(review): the REAL/FAKE classes are derived purely from 'subject';
# nothing in this notebook shows a ground-truth real/fake column -- confirm
# against the dataset's documentation that this mapping is meaningful.
label_mapping = {
    'left-news': 1,  # REAL
    'US_News': 1,    # REAL
    'politics': 1,   # REAL
    'News': 0,       # FAKE
    'Government News': 0,  # FAKE
    # Add other mappings as needed
}
# Filter rows with valid 'subject' values. This also discards rows whose
# 'subject' holds something other than one of the keys above.
df = df[df['subject'].isin(label_mapping.keys())]

# Map 'subject' to numerical labels
y = df['subject'].map(label_mapping)

# Defensive check: after the isin() filter every remaining subject is a key
# of label_mapping, so NaN here would mean the two steps drifted apart.
if y.isnull().any():
    # Raise instead of exit(): exit() would shut the notebook kernel down.
    raise ValueError("Error: 'y' contains NaN values.")

X = df['text']

# Debug: Check dataset size
print(f"Dataset size after filtering: {len(df)} rows")




Unique values in 'subject': ['News' 'politics'
 ' of which Soros is a major financier.Mercy Corps: Vis a vis the Arab-Israeli conflict'
 ' high taxes'
 ' Politics According to the Bible and (with Barry Asmus) The Poverty of Nations: A Sustainable Solution.Via: Townhall"'
 'Government News' 'left-news'
 ' claimed that hundreds of alternative media websites were producing  fake news  and  conspiracy  stories and therefore were unreliable as information sources. It wasn t long before the establishment began referencing these politicized lists'
 'US_News' ' and so is Mr. Katzenbach'
 ' fell 5.6 percent Monday. Wynn Resortsslipped 1.2 percent. Las Vegas Sands fell as much as 2.1 percent before closing higher.   LIVE DRILL    Las Vegas has been at the forefront of active shooter training. ( Image Source: sinclairstoryline)Las Vegas Active Shooter Drills Back in 2014'
 ' state systems with outsized pretensions to power have reacted to their environments in two ways. The first strategy'
 ' dec

In [7]:
# Step 5: Split Dataset
# Fail fast with a readable message if filtering removed every row.
if len(df) == 0:
    # Raise instead of exit(): in a notebook exit() asks the kernel to shut
    # down (losing all state); an exception only aborts this cell.
    raise ValueError("Error: No valid rows in the dataset after filtering.")

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [8]:
# Step 6: TF-IDF Vectorization
# The vectorizer is fitted on the training texts only; the held-out texts
# are then transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [9]:
# Step 7: Train Random Forest Model
# 100 trees with a fixed seed; fit() is left as the cell's last expression.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train_tfidf, y=y_train)


In [10]:
# Step 8: Predict and Evaluate
# Score the held-out split once, compute each metric, then report them.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Accuracy: 0.8792419568091671

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.85      0.87      2155
           1       0.87      0.91      0.89      2383

    accuracy                           0.88      4538
   macro avg       0.88      0.88      0.88      4538
weighted avg       0.88      0.88      0.88      4538


Confusion Matrix:
 [[1833  322]
 [ 226 2157]]
