In [4]:
# import library

import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

In [5]:
# Load dataset

df=pd.read_csv('news.csv', index_col=None)
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [6]:
# drop dataset
# column wise - axis=1

dataset=df.drop("Unnamed: 0",axis=1)
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [8]:
# output 

y=dataset["label"]

In [9]:
# input
# taken text as input

X_train, X_test, y_train, y_test = train_test_split(dataset['text'], y, test_size=0.33, random_state=53)

# Hashing Vector

In [32]:
# Training set
# fit = model creation
# fit_transfrom -  model creation + prediction
# Testing set
# transform - with trainset, predict test set

# Examples of stop words in English are “a,” “the,” “is,” “are,” etc.
# to eliminate words that are so widely used, that they carry very little useful information.

# binary=True eliminates all negative values

hashing_vector = HashingVectorizer(stop_words='english',n_features=2**4, binary= True)
count_train = hashing_vector.fit_transform(X_train)
print(count_train)
print(count_train.shape)
count_test = hashing_vector.transform(X_test)

  (1, 0)	0.2886751345948129
  (1, 1)	0.2886751345948129
  (1, 2)	0.2886751345948129
  (1, 3)	0.2886751345948129
  (1, 6)	0.2886751345948129
  (1, 7)	0.2886751345948129
  (1, 8)	0.2886751345948129
  (1, 9)	0.2886751345948129
  (1, 10)	0.2886751345948129
  (1, 12)	0.2886751345948129
  (1, 13)	0.2886751345948129
  (1, 14)	0.2886751345948129
  (2, 0)	0.35355339059327373
  (2, 3)	0.35355339059327373
  (2, 5)	0.35355339059327373
  (2, 6)	0.35355339059327373
  (2, 7)	0.35355339059327373
  (2, 9)	0.35355339059327373
  (2, 11)	0.35355339059327373
  (2, 13)	0.35355339059327373
  (3, 1)	0.2672612419124244
  (3, 2)	0.2672612419124244
  (3, 3)	0.2672612419124244
  (3, 4)	0.2672612419124244
  (3, 5)	0.2672612419124244
  :	:
  (4242, 7)	0.25
  (4242, 8)	0.25
  (4242, 9)	0.25
  (4242, 10)	0.25
  (4242, 11)	0.25
  (4242, 12)	0.25
  (4242, 13)	0.25
  (4242, 14)	0.25
  (4242, 15)	0.25
  (4243, 0)	0.25
  (4243, 1)	0.25
  (4243, 2)	0.25
  (4243, 3)	0.25
  (4243, 4)	0.25
  (4243, 5)	0.25
  (4243, 6)	0.25
  

In [21]:
 # get_feature_names_out([input_features]) - Get output feature names for transformation.

# len(hashing_vector.get_feature_names_out())

In [33]:
# Numbers - vector
# Text pre processing Technique
# Now, Text are converted into numbers

print(count_train.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.28867513 0.28867513 0.28867513 ... 0.28867513 0.28867513 0.        ]
 [0.35355339 0.         0.         ... 0.35355339 0.         0.        ]
 ...
 [0.25       0.25       0.25       ... 0.25       0.25       0.25      ]
 [0.25       0.25       0.25       ... 0.25       0.25       0.25      ]
 [0.25       0.25       0.25       ... 0.25       0.25       0.25      ]]


# MultinomialNB() - Navie Bayes - ML Algorithm

In [34]:
# Model Creation

clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)

# Confusion Matrix
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])

accuracy:   0.497


In [28]:
# classification report

from sklearn.metrics import classification_report

report=classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

        FAKE       0.49      1.00      0.66      1008
        REAL       1.00      0.03      0.06      1083

    accuracy                           0.50      2091
   macro avg       0.74      0.51      0.36      2091
weighted avg       0.75      0.50      0.35      2091



In [35]:
dataset["text"]

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [36]:
dataset["text"][0]



In [37]:
count_train[[0]]

<1x16 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [38]:
# Deployment - Predict whether it is fake or real
# Use counter vector to predict

hashing_vector = HashingVectorizer(stop_words='english')
count_train = hashing_vector.fit_transform(X_train)
#print(count_train)
#count_test = tf_idf.transform(X_test)

In [39]:
X_train[[0]]

0    Daniel Greenfield, a Shillman Journalism Fello...
Name: text, dtype: object

In [40]:
# Predict count_train[0] -  Predict as FAKE - row 0 is FAKE

clf.predict(count_train[[0]])

ValueError: X has 1048576 features, but MultinomialNB is expecting 16 features as input.