In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

In [2]:
df=pd.read_csv('news.csv')

In [3]:
df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [5]:
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

In [6]:
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [8]:
pac=PassiveAggressiveClassifier(max_iter=50)
new_pac = pac.fit(tfidf_train,y_train)

y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.98%


In [11]:
pickle.dump(new_pac, open('fake_news.pkl', 'wb'))
pickle.dump(tfidf_vectorizer, open('fake_news_tfidf_vectorizer.pkl', 'wb'))

In [9]:
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

array([[592,  46],
       [ 43, 586]], dtype=int64)

In [12]:
inp = '''Sri Lanka will not allow other countries to achieve their geopolitical needs by introducing “separatism under the guise of power devolution” in the island nation, President Gotabaya Rajapaksa has said.

“The government does not wish to be associated with the power struggles in the Indian Ocean region by the global giants,” Mr. Rajapaksa said, adding that the sovereignty of Sri Lanka would not be betrayed, a front-page report in the state-run Daily News said on Monday.'''

In [13]:
type(tfidf_test)

scipy.sparse.csr.csr_matrix

In [14]:
inp_ser = pd.Series(data = inp)

In [15]:
tfidf_inp_ser=tfidf_vectorizer.transform(inp_ser)
ans=pac.predict(tfidf_inp_ser)

In [16]:
print(ans)

['REAL']


In [17]:
y_pred=pac.predict(tfidf_test)
print(y_pred)

['REAL' 'FAKE' 'REAL' ... 'REAL' 'FAKE' 'REAL']


In [31]:
type(ans)

numpy.ndarray

In [43]:
new_inp = '''Sri Lanka will not allow other countries to achieve their geopolitical needs by introducing “separatism under the guise of power devolution” in the island nation, President Gotabaya Rajapaksa has said.

“The government does not wish to be associated with the power struggles in the Indian Ocean region by the global giants,” Mr. Rajapaksa said, adding that the sovereignty of Sri Lanka would not be betrayed, a front-page report in the state-run Daily News said on Monday.'''

In [35]:
pac.predict(np.array([[new_inp]]))

  X = check_array(X, accept_sparse='csr')


ValueError: X has 1 features per sample; expecting 61651

In [34]:
tfidf_train

<5068x61651 sparse matrix of type '<class 'numpy.float64'>'
	with 1337098 stored elements in Compressed Sparse Row format>

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
DT = DecisionTreeClassifier()
DT.fit(tfidf_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [40]:
y_pred=DT.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 80.9%


In [49]:
DT.predict(tfidf_inp_ser)

array(['REAL'], dtype=object)