## Load The Dataset

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('fake.csv')
df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
sum=0
for i in df['subject']:
    if (i!='News'):
        sum+=1
print(sum)

14431


## Combine and Shuffle the Dataset

In [6]:

# Load the datasets
fake_df = pd.read_csv('fake.csv')
true_df = pd.read_csv('true.csv')

# Add a label column to each dataset
fake_df['label'] = 0  # Fake news
true_df['label'] = 1  # Real news

# Combine the datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle the combined dataset
df = df.sample(frac=1).reset_index(drop=True)

print(df.head())


                                               title  \
0  BENGHAZI WIDOW Hits Back At Hillary’s Heartles...   
1  Mali's regional elections delayed by security ...   
2  BLACK REPUBLICAN AND BRILLIANT NEUROSURGEON AN...   
3  Trump picks Boeing executive Shanahan to becom...   
4  HYPOCRISY ON STEROIDS: Check Out HATEFUL Trump...   

                                                text       subject  \
0  Dorothy Woods, the wife of an ex-Navy SEAL kil...      politics   
1  BAMAKO (Reuters) - Mali s government said it h...     worldnews   
2                                                         politics   
3  WASHINGTON (Reuters) - Senior Boeing (BA.N) ex...  politicsNews   
4  Lady Gaga, who protested in front of Trump Tow...     left-news   

                 date  label  
0        Jun 30, 2016      0  
1  November 27, 2017       1  
2         May 5, 2015      0  
3     March 16, 2017       1  
4        Jan 14, 2017      0  


## Preprocess The Dataset

In [7]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values (if any)
df = df.dropna()


title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [8]:
## lets clean the dataset to remove any special characters
## extra space and also convert all text into lower case.

import re

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r"\'", "'", text)  # Handle special characters
    return text

# Apply text cleaning
df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)

# Combine title and text
df['content'] = df['title'] + ' ' + df['text']

# Display the first few rows of the combined dataset with cleaned text
df.head()


Unnamed: 0,title,text,subject,date,label,content
0,benghazi widow hits back at hillary s heartles...,dorothy woods the wife of an ex navy seal kill...,politics,"Jun 30, 2016",0,benghazi widow hits back at hillary s heartles...
1,mali s regional elections delayed by security ...,bamako reuters mali s government said it has d...,worldnews,"November 27, 2017",1,mali s regional elections delayed by security ...
2,black republican and brilliant neurosurgeon an...,,politics,"May 5, 2015",0,black republican and brilliant neurosurgeon an...
3,trump picks boeing executive shanahan to becom...,washington reuters senior boeing ba n executiv...,politicsNews,"March 16, 2017",1,trump picks boeing executive shanahan to becom...
4,hypocrisy on steroids check out hateful trump ...,lady gaga who protested in front of trump towe...,left-news,"Jan 14, 2017",0,hypocrisy on steroids check out hateful trump ...


## Data Analysis

## Feature Extraction

In [9]:
## split the data into train and test.
## using tfdfvectorizer for converting the text into numerical features

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X = df['content']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


## Model Training and Evaluation

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train the model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)


# Predict on the test set
y_pred = lr_model.predict(X_test_tfidf)

# Calculate and print the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 0.9906
Precision: 0.9875
Recall: 0.9927
F1-score: 0.9901

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4758
           1       0.99      0.99      0.99      4222

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



## Model Testing with user input 

In [11]:
from joblib import dump

# Assuming rf_model is your RandomForestClassifier model
dump(lr_model, 'fakenews.pkl')

['fakenews.pkl']

In [16]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r"\'", "'", text)  # Handle special characters
    return text

# Function to preprocess input for prediction
def preprocess_input(text):
    cleaned_text = clean_text(text)
    combined_text = cleaned_text  # Assuming input is already combined title and text
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    tfidf_text = tfidf_vectorizer.fit_transform([combined_text])
    return tfidf_text

# Function to predict using the loaded model
def predict_fake_news(input_text, model):
    processed_text = preprocess_input(input_text)
    prediction = model.predict(processed_text)[0]
    if prediction == 0:
        return "Fake News"
    else:
        return "Real News"

# Example user input
user_input = """
benghazi widow hits back at hillary s heartles...
"""


# Predict using the model
prediction = predict_fake_news(user_input, lr_model)
print(f'Prediction: {prediction}')


ValueError: X has 13 features, but LogisticRegression is expecting 5000 features as input.