In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('fake_or_real_news.csv')

In [3]:
df.shape

(6335, 4)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
labels.value_counts(1)*100

REAL    50.055249
FAKE    49.944751
Name: label, dtype: float64

In [7]:
import re
# from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
corpus=[]
for i in range(0,len(df)):
    message = re.sub('[^a-zA-Z]',' ',df['text'][i])
    message = message.lower()
    message = message.split()
    message = [lemmatizer.lemmatize(word) for word in message ]
    message = ' '.join(message)
    corpus.append(message)

In [8]:
X_train,X_test,y_train,y_test = train_test_split(corpus,labels,test_size=0.2,random_state = 24)

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',max_df=0.7)
#fit and transform
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [10]:
pac =  PassiveAggressiveClassifier()
pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier()

In [11]:
y_pred = pac.predict(tfidf_test)
y_pred

array(['FAKE', 'REAL', 'REAL', ..., 'FAKE', 'REAL', 'FAKE'], dtype='<U4')

In [12]:
score= accuracy_score(y_test,y_pred)
score

0.9265982636148382

In [13]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[616,  50],
       [ 43, 558]], dtype=int64)

In [14]:
calculation = pd.DataFrame(np.c_[y_test,y_pred], columns = ["Original label","predict label"])
calculation.head(11)

Unnamed: 0,Original label,predict label
0,FAKE,FAKE
1,REAL,REAL
2,REAL,REAL
3,FAKE,FAKE
4,FAKE,FAKE
5,REAL,REAL
6,REAL,REAL
7,REAL,REAL
8,REAL,REAL
9,FAKE,FAKE


In [15]:
a = df.loc[2]['text']
a

'U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.\n\nKerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.\n\nThe visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.\n\nThe French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, 

In [16]:
import re
strr = input("Enter a Message: ")
print("-------------------------------")
examples = strr

#preprocess
a = re.sub('[^a-zA-Z]',' ',examples)
a = a.lower()
a = a.split()
a = [lemmatizer.lemmatize(word) for word in a ]
a = ' '.join(a)  
print(a)
print("--------------------------------")
#apply
example_counts = tfidf_vectorizer.transform([a])
prediction =pac.predict(example_counts)
prediction[0]

# if prediction[0]==0:
#     print("This is Negative Review")
# elif prediction[0]==1:
#     print("This is Positive Review")

Enter a Message: The weather is so hot
-------------------------------
the weather is so hot
--------------------------------


'REAL'

In [17]:
import re
strr = input("Enter a Message: ")
print("-------------------------------")
examples = strr

#preprocess
a = re.sub('[^a-zA-Z]',' ',examples)
a = a.lower()
a = a.split()
a = [lemmatizer.lemmatize(word) for word in a ]
a = ' '.join(a)  
print(a)
print("--------------------------------")
#apply
example_counts = tfidf_vectorizer.transform([a])
prediction =pac.predict(example_counts)
prediction[0]

# if prediction[0]==0:
#     print("This is Negative Review")
# elif prediction[0]==1:
#     print("This is Positive Review")

Enter a Message: I love my home so much
-------------------------------
i love my home so much
--------------------------------


'FAKE'