# Importing the Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

What is a PassiveAggressiveClassifier?

Passive Aggressive algorithms are online learning algorithms. Such an algorithm remains passive when an example is classified correctly, and turns aggressive in the event of a misclassification, updating and adjusting its weights. Unlike most other algorithms, it does not converge. Its purpose is to make updates that correct the loss while causing very little change in the norm of the weight vector.

# Data Collection and Preprocessing

In [2]:
# Load the news dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="news.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Number of rows and columns in the loaded dataset, as a (rows, columns) tuple
df.shape

(6335, 4)

# Label Encoding

In [4]:
# Encode the target in place: FAKE -> 0, REAL -> 1.
# The column keeps its object dtype after these assignments.
for raw_label, code in (('FAKE', 0), ('REAL', 1)):
    df.loc[df['label'] == raw_label, 'label'] = code

In [5]:
# Model input: the article body text.
x = df.loc[:, 'text']
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [6]:
# Model target: the encoded label column (0 = fake, 1 = real).
y = df.loc[:, 'label']
y

0       0
1       0
2       1
3       0
4       1
       ..
6330    1
6331    0
6332    0
6333    1
6334    1
Name: label, Length: 6335, dtype: object

# Splitting the data into training and testing sets

In [7]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=2)
x_train, x_test, y_train, y_test = split_parts

In [8]:
# Sanity check: sizes of the full, training and testing text series.
print(*(part.shape for part in (x, x_train, x_test)))

(6335,) (5068,) (1267,)


In [9]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
feature_extraction = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [10]:
# The label series carry object dtype; cast both splits to integers before training.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [11]:
# Show the raw training texts; the index holds the original row labels from df.
print(x_train)

20      With little fanfare this fall, the New York de...
961     Donald Trump’s supporters deserve to have thei...
1715    Jay Parini , a poet and novelist, teaches at M...
2371    Former Rhode Island Gov. Lincoln Chafee ended ...
5159    BROOKLYN, Iowa - Ted Cruz has made a name for ...
                              ...                        
1099    (CNN) Hillary Clinton declared victory early T...
2514    posted by Eddie Astronomers have recorded myst...
3606    Hillary Clinton and Donald Trump look to rebou...
5704    Principal Krystal Hardy has dedicated herself ...
2575    \nA new email released as part of the Wikileak...
Name: text, Length: 5068, dtype: object


In [12]:
# Show the TF-IDF training matrix. In the recorded output each line is a
# (row, column) index pair followed by the weight stored at that position.
print(x_train_features)

  (0, 20235)	0.0331607458824967
  (0, 57605)	0.05161603744955192
  (0, 52708)	0.04663879708434075
  (0, 56393)	0.030134560037667565
  (0, 21722)	0.03145165749593157
  (0, 4624)	0.053429993709189286
  (0, 16125)	0.07300658945665849
  (0, 44657)	0.03093677761514883
  (0, 49795)	0.041920370249217756
  (0, 25032)	0.02740882795867995
  (0, 38978)	0.04071521350043149
  (0, 7184)	0.05082119972739067
  (0, 17087)	0.03908895325296196
  (0, 29603)	0.04876299809816476
  (0, 9989)	0.060056520479806516
  (0, 42603)	0.06903399409609802
  (0, 28699)	0.06752368343492604
  (0, 30015)	0.03311203762612271
  (0, 49018)	0.039282971982935455
  (0, 29120)	0.027178284952699566
  (0, 26497)	0.036328155257765095
  (0, 29050)	0.04353381476471955
  (0, 40002)	0.06309530237397165
  (0, 31796)	0.028097299237288407
  (0, 46351)	0.055868584018087954
  :	:
  (5067, 46178)	0.03722939703381832
  (5067, 27025)	0.06383301938620951
  (5067, 34161)	0.042084777283773085
  (5067, 53097)	0.05173922424876295
  (5067, 47805)	0.1

# Model Training

In [13]:
# Seed the classifier: it shuffles the training data between epochs by default,
# so without random_state the fitted weights and the accuracy scores differ from run to run.
# The seed matches the one used for the train/test split.
classifier=PassiveAggressiveClassifier(random_state=2)

In [14]:
# Train the classifier on the TF-IDF training features and their labels.
classifier.fit(x_train_features,y_train)

PassiveAggressiveClassifier()

# Model Evaluation

In [15]:
# Accuracy on the data the model was fitted on. This measures fit, not
# generalisation; compare it with the held-out score.
training_data_prediction = classifier.predict(x_train_features)
training_accuracy_score = accuracy_score(
    y_true=y_train,
    y_pred=training_data_prediction,
)
print("Accuracy score of training data:", training_accuracy_score)

Accuracy score of training data: 1.0


In [16]:
# Accuracy on the held-out test split, i.e. articles not used for fitting.
testing_data_prediction = classifier.predict(x_test_features)
testing_accuracy_score = accuracy_score(
    y_true=y_test,
    y_pred=testing_data_prediction,
)
print("Accuracy score of testing data:", testing_accuracy_score)

Accuracy score of testing data: 0.9400157853196527


# Building a Predictive System

In [17]:
# Classify a single article end to end.
# Use the complete text of the first article rather than its truncated on-screen
# preview ("...Fello..."), so the vectorizer receives a full document like the
# ones it was fitted on.
# NOTE: this row may belong to the training split, so treat this as a usage demo,
# not as an evaluation of the model.
input_text=[df['text'].iloc[0]]
#convert text to feature vectors with the already-fitted vectorizer
input_data_features=feature_extraction.transform(input_text)
#making prediction
prediction=classifier.predict(input_data_features)
print(prediction)

# Labels were encoded as 0 = fake, 1 = real.
if prediction[0]==1:
    print("The news is real(i.e.,not fake)")
else:
    print("The news is fake")

[0]
The news is fake
