In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack

In [3]:
# import opendatasets as od
# dataset_url = "https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets"

# od.download(dataset_url)

In [35]:
# Load the real-news articles; "True.csv" is resolved relative to the notebook's working directory.
dataTrue = pd.read_csv("True.csv")

In [36]:
# Load the fake-news articles; "Fake.csv" is resolved relative to the notebook's working directory.
dataFake = pd.read_csv("Fake.csv")

In [37]:
# 1 - Fake
# 0 - Real

In [38]:
# Label every row from True.csv as real (isFake = 0).
dataTrue["isFake"] = 0

In [39]:
# Label every row from Fake.csv as fake (isFake = 1).
dataFake["isFake"] = 1

In [40]:
# Stack the real rows on top of the fake rows; ignore_index=True renumbers the
# result 0..n-1 instead of keeping each file's own row index.
df = pd.concat([dataTrue, dataFake], ignore_index=True)

In [41]:
# Preview the first rows of the combined frame (these come from True.csv, so isFake is 0).
df.head()

Unnamed: 0,title,text,subject,date,isFake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [42]:
# Preview the last rows of the combined frame (these come from Fake.csv, so isFake is 1).
df.tail()

Unnamed: 0,title,text,subject,date,isFake
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1
44897,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",1


In [43]:
# (rows, columns) of the combined frame.
df.shape

(44898, 5)

In [44]:
# Column dtypes and non-null counts for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   isFake   44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [45]:
# Count missing values per column; all zeros means no imputation or row dropping is needed.
df.isnull().sum()

title      0
text       0
subject    0
date       0
isFake     0
dtype: int64

In [46]:
# A LabelEncoder is useful when there are multiple categories and we want to avoid assigning labels manually. For this problem statement, the labels were assigned manually in the steps above.

In [47]:
# Drop the date column: it is not used as a feature by the models below.
# (Dates are not unique per article -- several rows in the preview above share the same date.)
df = df.drop("date", axis=1)

In [48]:
# NOTE(review): the frame has no 'label' column at this point (its columns are
# title, text, subject, isFake), and DataFrame.rename ignores missing keys by
# default, so this line leaves df unchanged. Kept only as a guard for data
# sources that name the target 'label' -- confirm whether it is still needed.
df = df.rename(columns={'label': 'isFake'})

In [49]:
# Label convention for the isFake column:
# 0 - Real
# 1 - Fake
# Preview the frame after the date column was dropped.
df.head()

Unnamed: 0,title,text,subject,isFake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,0


In [50]:
# Separate the features (X) from the target variable (y).
# Only the title and the body text are used as features; `subject` is left out.
X = df[['title', 'text']]  # Features
y = df['isFake']  # Target variable: 1 = fake, 0 = real

In [51]:
# Number of articles per subject category (subject is not used as a model feature).
df["subject"].value_counts()

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64

In [52]:
# Split data into training and testing sets: 25% of the rows are held out for
# testing, and random_state=42 makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [25]:
# Inspect the held-out test features (row labels are the original df indices, shuffled by the split).
X_test

Unnamed: 0,title,text
22216,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th..."
27917,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...
25007,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...
1377,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...
32476,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...
...,...,...
15578,Saudi banks freeze accounts of suspects detain...,RIYADH/DUBAI (Reuters) - Saudi Arabian banks h...
29394,"Trump Ratchets Up Islamophobic Lie, Gleefully...",Donald Trump is an unapologetic Islamophobic b...
3120,Travel ban challengers urge U.S. top court to ...,WASHINGTON (Reuters) - Opponents of President ...
25388,Obama Just OBLITERATED The GOP’s Threat To Ki...,"This election is all about the Supreme Court, ..."


TF-IDF: Term Frequency - Inverse Document Frequency

A metric that indicates how important a word is to a document in a collection. It weighs the importance of each word in a document based on how often it appears in that document and how often it appears across all documents in the collection.

TF: the number of times a term t appears in a document d
IDF: the logarithm of (total number of documents / number of documents that contain t)
TF-IDF: TF * IDF

Basically allows us to find the most relevant and distinctive words per document.

In [54]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# The stop-word set and the lemmatizer are created once, on first use, and then
# reused: rebuilding them on every call is wasteful when the function is applied
# to tens of thousands of rows.
_TEXT_PREPROCESSING_RESOURCES = {}


def _get_text_preprocessing_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _TEXT_PREPROCESSING_RESOURCES:
        _TEXT_PREPROCESSING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_PREPROCESSING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _TEXT_PREPROCESSING_RESOURCES['stop_words'],
        _TEXT_PREPROCESSING_RESOURCES['lemmatizer'],
    )


# Define a function for text pre-processing
def text_preprocessing(text):
    """Normalize raw text for TF-IDF vectorization.

    Steps: tokenize, lowercase, drop English stop words, lemmatize, and join
    the remaining tokens back into one space-separated string.

    Parameters
    ----------
    text : str
        Raw title or article body.

    Returns
    -------
    str
        Lowercased, space-separated tokens with stop words removed.
    """
    stop_words, lemmatizer = _get_text_preprocessing_resources()

    # Tokenize on the original casing, then lowercase each token *before*
    # filtering: the stop-word list is lowercase, so a case-sensitive check
    # would let capitalised stop words such as "The" or "And" slip through.
    tokens = [token.lower() for token in word_tokenize(text)]

    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a string; the final lower() keeps the
    # "output is always lowercase" guarantee of the original implementation.
    return ' '.join(tokens).lower()

def _with_clean_text(frame):
    """Return a copy of `frame` with pre-processed 'title'/'text' and a 'combined' column.

    Working on a new frame (via DataFrame.assign) instead of assigning columns
    in place avoids pandas' SettingWithCopyWarning, which the in-place version
    triggers because the frames returned by train_test_split are slices of X.
    """
    cleaned = frame.assign(
        title=frame['title'].apply(text_preprocessing),
        text=frame['text'].apply(text_preprocessing),
    )
    # Combine the title and text columns into a single column
    return cleaned.assign(combined=cleaned['title'] + ' ' + cleaned['text'])


# Apply text pre-processing to the title and text columns of both splits
X_train = _with_clean_text(X_train)
X_test = _with_clean_text(X_test)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data only, then transform both splits with
# it, so that no vocabulary/IDF information leaks from the test set.
X_train_tfidf = vectorizer.fit_transform(X_train['combined'])
X_test_tfidf = vectorizer.transform(X_test['combined'])

### Now trying out various algorithms

In [None]:
# # Define a list of classification algorithms to apply
# algorithms = [
#     LogisticRegression(),
#     MultinomialNB(),
#     SVC(),
#     RandomForestClassifier()
# ]

# # Loop through each algorithm and apply it to the data
# for algorithm in algorithms:
#     algorithm.fit(X_train_tfidf, y_train)
#     y_pred = algorithm.predict(X_test_tfidf)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f'Algorithm: {algorithm.__class__.__name__}, Accuracy: {accuracy:.3f}')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features.
# fit() returns the fitted estimator itself, so construction and training can be chained.
log_reg = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = log_reg.predict(X_test_tfidf)
# Fraction of test articles whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Algorithm: Logistic Regression, Accuracy: {accuracy:.3f}')

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the TF-IDF features.
# fit() returns the fitted estimator itself, so construction and training can be chained.
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_test_tfidf)
# Fraction of test articles whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Algorithm: Multinomial Naive Bayes, Accuracy: {accuracy:.3f}')

In [None]:
from sklearn.svm import SVC

# Support vector classifier (default settings) on the TF-IDF features.
# NOTE(review): kernel SVC training scales poorly with the number of samples and
# may take a long time on this dataset; LinearSVC is already imported in the
# first cell and may be a faster alternative -- confirm before switching.
svm = SVC().fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
# Fraction of test articles whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Algorithm: Support Vector Machine, Accuracy: {accuracy:.3f}')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
# The forest is seeded (same seed as the train/test split) because bootstrap
# sampling and feature sub-sampling are random: without random_state the
# reported accuracy changes from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
# Fraction of test articles whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Algorithm: Random Forest Classifier, Accuracy: {accuracy:.3f}')

In [None]:
def pre_process_new_article(title, text):
    """Turn a single article into a TF-IDF row using the already-fitted vectorizer.

    The title and the body go through the same `text_preprocessing` step as the
    training data, are joined with a single space, and are then transformed
    (not re-fitted) by the module-level `vectorizer`.

    Parameters
    ----------
    title : str
        Article headline.
    text : str
        Article body.

    Returns
    -------
    The result of ``vectorizer.transform`` for the one combined document.
    """
    # Clean the title first, then the body -- same order as the training pipeline.
    cleaned_parts = [text_preprocessing(part) for part in (title, text)]

    # One document in, one row out.
    return vectorizer.transform([' '.join(cleaned_parts)])

In [None]:
# Read a new article from the user.
user_title = input("Enter article title: ")
user_text = input("Enter article text: ")

# Apply the same cleaning + TF-IDF transform that was used for training.
user_article_tfidf = pre_process_new_article(user_title, user_text)

# Classify with one of the models fitted above. The previous `your_ml_model`
# was an undefined placeholder and raised NameError on a fresh kernel; the
# logistic-regression model is used here -- swap in `nb`, `svm` or `rf` to compare.
prediction = log_reg.predict(user_article_tfidf)

In [116]:
def get_label(prediction):
    """Map a model prediction to a human-readable label.

    Only the first element of `prediction` is inspected:
    0 gives "real!!", any other value gives "fake!!".
    """
    return "real!!" if prediction[0] == 0 else "fake!!"

In [117]:
# Report the verdict for the article entered above (uses `prediction` from the previous cell).
print(f"The news is {get_label(prediction)}")

The news is fake!!
