In [1]:
# imporitng all the dependencies

# Importing libraries for numerical and data handling tasks
import numpy as np  # For mathematical operations and working with arrays
import pandas as pd  # For handling and analyzing tabular data

# Importing libraries for text processing
import re  # For finding patterns in text (e.g., matching words, emails, etc.)
from nltk.corpus import stopwords  # For removing common words like "the", "is"
from nltk.stem import PorterStemmer  # For reducing words to their root form (e.g., "running" → "run")

# Importing libraries for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer # For converting text into numbers (TF-IDF method)
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression  # For building a classification model (e.g., spam detection)
from sklearn.metrics import accuracy_score  # For calculating the accuracy of the model's predictions
import string

from sklearn.preprocessing import LabelEncoder

In [2]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Dubai
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
news_dataset = pd.read_csv("news.csv")

In [5]:
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
# data preprocessing 
news_dataset.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [7]:
news_dataset.shape

(6335, 4)

In [8]:
print(news_dataset["label"].dtypes)

object


In [9]:
print(news_dataset["label"].unique())

['FAKE' 'REAL']


In [10]:
# news_dataset["label"] = news_dataset["label"].map({
#     "FAKE" : 0,
#     "REAL" : 1
# })

In [11]:
# conversion of categorical to numerical
# 
le = LabelEncoder()
le.fit_transform(news_dataset["label"])
news_dataset["label"] = le.fit_transform(news_dataset["label"])
print(news_dataset["label"].unique())

[0 1]


In [12]:
news_dataset.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [13]:
# separating the data and label

x = news_dataset["text"]
y = news_dataset["label"]

In [14]:
# Streaming :
# streaming is the process of reducing a word to its root word 

# Initialize the stemmer
port_stem = PorterStemmer()

def stemming(content):
     # Remove any non-alphabetic characters and convert to lowercase
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()

    # split the content into words
    stemmed_content = stemmed_content.split()

    # get english stopwords
    stop_words = stopwords.words("english")

    # stem the word, and remove the sstopwords
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words]

    # join back all the content 
    stemmed_content = " ".join(stemmed_content)

    return stemmed_content

In [15]:
# apply this on the original dataset
news_dataset["text"] = news_dataset["text"].apply(stemming)

In [16]:
# separating variables
x = news_dataset["text"]
y = news_dataset["label"]

In [17]:
print(x)

0       daniel greenfield shillman journal fellow free...
1       googl pinterest digg linkedin reddit stumbleup...
2       u secretari state john f kerri said monday sto...
3       kayde king kaydeek novemb lesson tonight dem l...
4       primari day new york front runner hillari clin...
                              ...                        
6330    state depart told republican nation committe c...
6331    p pb stand plutocrat pentagon post oct wikimed...
6332    anti trump protest tool oligarchi reform alway...
6333    addi ababa ethiopia presid obama conven meet l...
6334    jeb bush suddenli attack trump matter jeb bush...
Name: text, Length: 6335, dtype: object


In [18]:
print(y)

0       0
1       0
2       1
3       0
4       1
       ..
6330    1
6331    0
6332    0
6333    1
6334    1
Name: label, Length: 6335, dtype: int64


In [19]:
# convert categorical to numerical
vectorizer = TfidfVectorizer()
vectorizer.fit(x)

x = vectorizer.transform(x)

In [20]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1598597 stored elements and shape (6335, 43733)>
  Coords	Values
  (0, 106)	0.02761351227547292
  (0, 212)	0.023325671822305483
  (0, 271)	0.054392547007153
  (0, 328)	0.060358339691341766
  (0, 357)	0.013921754610837712
  (0, 451)	0.020578335403474236
  (0, 578)	0.07906058794712593
  (0, 617)	0.019850889079147057
  (0, 620)	0.01867769280554434
  (0, 622)	0.044611051373187424
  (0, 648)	0.015493583340569357
  (0, 680)	0.019224695171899866
  (0, 978)	0.0267316264615063
  (0, 999)	0.020738114075644642
  (0, 1022)	0.0371485992478585
  (0, 1033)	0.014808690907292291
  (0, 1085)	0.028815013247480384
  (0, 1089)	0.018102417756284926
  (0, 1197)	0.022505695388032227
  (0, 1206)	0.010675511636980443
  (0, 1549)	0.02507693986049084
  (0, 1665)	0.025967702221880915
  (0, 1713)	0.041128098634750046
  (0, 1744)	0.027618563562476325
  (0, 1783)	0.018872240981032625
  :	:
  (6334, 39650)	0.02398512016423436
  (6334, 39971)	0.0407182753213

In [21]:
print(y)

0       0
1       0
2       1
3       0
4       1
       ..
6330    1
6331    0
6332    0
6333    1
6334    1
Name: label, Length: 6335, dtype: int64


In [22]:
# train test split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [23]:
# apply the model 
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

In [24]:
# accuracy score on the training data
x_train_pred = lr_model.predict(x_train)
train_data_accuracy = accuracy_score(x_train_pred, y_train)

In [25]:
# accuracy score on the testing data
x_test_pred = lr_model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_pred, y_test)

In [26]:
print("Accuracy score for Training Data : ",train_data_accuracy)
print("Accuracy score for Testing Data : ", test_data_accuracy)

Accuracy score for Training Data :  0.9518547750591949
Accuracy score for Testing Data :  0.9163378058405682


In [27]:
import joblib

joblib.dump(lr_model,"lr_model_jani.pkl")

['lr_model_jani.pkl']

In [28]:
new_model = joblib.load("lr_model_jani.pkl")

In [29]:
new_model.predict(x_test)

array([1, 0, 0, ..., 1, 1, 1], shape=(1267,))

In [30]:
# now we here should the predictive system
x_new = x_test[1]

prediction = lr_model.predict(x_new)
print(prediction)

if(prediction[0] == 0):
    print("The news is Real ")
else:
    print("The news is Fake ")

[0]
The news is Real 
