In [1]:
import pandas as pd
import numpy as np

In [2]:
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer # Helps converting text to vectors(numbers)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [3]:
#!pip install matplotlib

In [4]:
import matplotlib.pyplot as plt

In [5]:
#Loading the dataset
df=pd.read_csv("news.csv")
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [6]:
dataset= df.drop("Unnamed: 0", axis=1) # columnwise deletion
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [7]:
x= dataset['text']
y=dataset['label']

In [8]:
x[0]



In [9]:
y[1]

'FAKE'

In [10]:
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [11]:
#TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=53)

In [12]:
X_train

2576                                                     
1539    Report Copyright Violation Do you think there ...
5163    The election in 232 photos, 43 numbers and 131...
2615    Email Ever wonder what’s on the mind of today’...
4270    Wells Fargo is Rotting from the Top Down Wells...
                              ...                        
662     —Debby Borza stood before a wall of photos of ...
3261    Presumptive Republican nominee Donald Trump ha...
5883    December's job growth numbers are in, and they...
2933    In a wide-ranging discussion, Trump also said ...
797     Top officials of the Cruz campaign are convinc...
Name: text, Length: 4244, dtype: object

In [13]:
X_train.shape

(4244,)

In [14]:
y_train.shape

(4244,)

In [15]:
y_test.shape

(2091,)

In [16]:
# You can do counter vectorization either before or after train test split
# Stop words are (is this was in of to the) which dont carry more meaning

In [17]:
Tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [18]:
Tfidf_vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'


The difference between fit() and fit_transform() (especially in the context of CountVectorizer or similar scikit-learn classes) can be summarized as follows:

fit()
This method learns or "fits" the model parameters from the input data. For CountVectorizer, fit() means it goes through the input documents and builds (learns) the vocabulary: a mapping of unique tokens (words or n-grams) to feature indices. It does not transform the data into any vector representation yet. You call fit() on the training data to learn the vocabulary only.

transform()
This method applies the learned transformation to data. Using the vocabulary created by fit(), it converts the input text documents into a feature matrix (e.g., a sparse matrix of word counts). You use transform() on data after vocabulary has been learned, typically on the test set or any new/unseen data, ensuring that features correspond exactly to those learned from training data.

fit_transform()
This is a convenience method that combines fit() and transform() into a single step: it learns the vocabulary from the input data and then transforms that same data into the feature matrix in one call. This is usually used on the training data for efficiency and convenience.

In [20]:
Tfidf_train = Tfidf_vectorizer.fit_transform(X_train)
print(Tfidf_train)
Tfidf_test = Tfidf_vectorizer.transform(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1119820 stored elements and shape (4244, 56922)>
  Coords	Values
  (1, 42470)	0.07711040274149526
  (1, 12105)	0.15008066461476866
  (1, 54177)	0.13782629144711137
  (1, 50628)	0.061296988343109586
  (1, 15924)	0.3479045460649079
  (1, 44520)	0.4973826512693341
  (1, 51896)	0.11596517664605868
  (1, 35783)	0.30902690818827977
  (1, 35256)	0.12628385718450857
  (1, 21881)	0.21271688045815978
  (1, 42534)	0.06081715886809217
  (1, 8399)	0.08729542880625335
  (1, 29531)	0.1454406205718245
  (1, 15927)	0.4973826512693341
  (1, 25686)	0.13550453594288983
  (1, 49203)	0.1672740861784377
  (1, 16814)	0.10404977746548139
  (1, 36087)	0.12648679854389897
  (1, 21568)	0.1007920919566398
  (1, 25684)	0.1030420922189754
  (1, 38823)	0.06048803110658644
  (1, 47506)	0.14539060877460044
  (1, 36831)	0.10772488937433067
  (2, 16972)	0.1606296088662543
  (2, 762)	0.48803966069171073
  :	:
  (4243, 41435)	0.02969665315895183
  (4243, 53607)	

In [21]:
Tfidf_train.shape
# Here all the texts are converted to rows and uniqyue text columns and shows only values that are non zero
# There are 4244 rows and 56922 unique words in count train

(4244, 56922)

In [22]:
# You can find the exact unioque word by passing its index as input
feature_names = Tfidf_vectorizer.get_feature_names_out()
word = feature_names[36087]
print(word)

opposite


In [24]:
dir(Tfidf_train)

['T',
 '__abs__',
 '__add__',
 '__array_priority__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__idiv__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmatmul__',
 '__rmul__',
 '__round__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '_add_dense',
 '_add_sparse',
 '_allow_nd',
 '_arg_min_or_max',
 '_arg_min_or_max_axis',
 '_ascontainer',
 '_asfptype',
 '_asindices',
 '_binopt',
 '_broadcast_to',
 '_bsr_container',
 '_container',
 '_coo_container',
 '_csc_contai

In [25]:
Tfidf_vectorizer.get_feature_names_out(). shape

(56922,)

In [26]:
len(Tfidf_vectorizer.get_feature_names_out())

56922

In [27]:
print(Tfidf_train.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
Tfidf_train[1]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (1, 56922)>

In [29]:
clf = MultinomialNB()

clf.fit(Tfidf_train, y_train)
pred = clf.predict(Tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.857


In [30]:
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
cm

array([[ 739,  269],
       [  31, 1052]])

In [31]:
from sklearn.metrics import classification_report

report=classification_report(y_test, pred)

In [32]:
print(report)

              precision    recall  f1-score   support

        FAKE       0.96      0.73      0.83      1008
        REAL       0.80      0.97      0.88      1083

    accuracy                           0.86      2091
   macro avg       0.88      0.85      0.85      2091
weighted avg       0.88      0.86      0.85      2091



In [33]:
print(f"accuracy:   {score:.3f}")

accuracy:   0.857


# DEPLOYMENT

In [39]:
dataset["text"][540]

"(Before It's News)\nCompare and contrast New Jersey and Florida voting protocols, In Florida the information on your voter registration card and ID have to match, you are issued a 12″ printout (similar to a cash register receipt) showing your name, date of birth and address which you then must confirm, and that is placed on a clear […]"

In [40]:
Tfidf_train[[540]]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 134 stored elements and shape (1, 56922)>

In [36]:
#count_vectorizer = CountVectorizer(stop_words='english')
#count_train = count_vectorizer.fit_transform(X_train)
#print(count_train)
#count_test = count_vectorizer.transform(X_test)


In [41]:
#count_input=count_vectorizer.transform([[dataset["text"][0]]])
X_train[[540]]

540    (Before It's News)\nCompare and contrast New J...
Name: text, dtype: object

In [42]:
clf.predict(Tfidf_train[[540]])

array(['FAKE'], dtype='<U4')

In [43]:
y_train[540]

'FAKE'