In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
dataframe = pd.read_csv('realfake.csv')

In [3]:
dataframe.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
dataframe.tail()

Unnamed: 0.1,Unnamed: 0,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [5]:
dataframe.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [6]:
def replaceword(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [7]:
dataframe["text"] = dataframe["text"].apply(replaceword)

In [8]:
dataframe.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,daniel greenfield a shillman journalism fello...,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,u s secretary of state john f kerry said mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november the le...,FAKE
4,875,The Battle of New York: Why This Primary Matters,it s primary day in new york and front runners...,REAL


In [9]:
x = dataframe['text']
y = dataframe['label']

In [10]:
x

0       daniel greenfield  a shillman journalism fello...
1       google pinterest digg linkedin reddit stumbleu...
2       u s  secretary of state john f  kerry said mon...
3         kaydee king   kaydeeking  november    the le...
4       it s primary day in new york and front runners...
                              ...                        
6330    the state department told the republican natio...
6331    the  p  in pbs should stand for  plutocratic  ...
6332     anti trump protesters are tools of the oligar...
6333    addis ababa  ethiopia  president obama convene...
6334    jeb bush is suddenly attacking trump  here s w...
Name: text, Length: 6335, dtype: object

In [11]:
y

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
y_train

2402    REAL
1922    REAL
3475    FAKE
6197    REAL
4748    FAKE
        ... 
4931    REAL
3264    REAL
1653    FAKE
2607    FAKE
2732    REAL
Name: label, Length: 5068, dtype: object

In [13]:
y_train

2402    REAL
1922    REAL
3475    FAKE
6197    REAL
4748    FAKE
        ... 
4931    REAL
3264    REAL
1653    FAKE
2607    FAKE
2732    REAL
Name: label, Length: 5068, dtype: object

In [14]:
tfvect = TfidfVectorizer(stop_words='english',max_df=0.7)
tfid_x_train = tfvect.fit_transform(x_train)
tfid_x_test = tfvect.transform(x_test)

LOGISTIC REGRESSION

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
LR = LogisticRegression()
LR.fit(tfid_x_train,y_train)

In [17]:
LR_pred = LR.predict(tfid_x_test)
score = accuracy_score(y_test,LR_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 91.48%


In [18]:
LR_cf = confusion_matrix(y_test,LR_pred, labels=['FAKE','REAL'])
print(LR_cf)

[[571  44]
 [ 64 588]]


DECISION TREE CLASSIFIER

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
DT = DecisionTreeClassifier()
DT.fit(tfid_x_train,y_train)

In [21]:
DT_pred = DT.predict(tfid_x_test)
score = accuracy_score(y_test,DT_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 81.14%


In [22]:
DT_cf = confusion_matrix(y_test,DT_pred, labels=['FAKE','REAL'])
print(DT_cf)

[[495 120]
 [119 533]]


RANDOM FOREST CLASSIFIER

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
RFC = RandomForestClassifier()
RFC.fit(tfid_x_train,y_train)

In [25]:
RFC_pred = RFC.predict(tfid_x_test)
score = accuracy_score(y_test,RFC_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 89.58%


In [26]:
RFC_cf = confusion_matrix(y_test,RFC_pred, labels=['FAKE','REAL'])
print(RFC_cf)

[[543  72]
 [ 60 592]]


PASSIVE AGRESSIVE CLASSIFIER

In [27]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [28]:
PAC = PassiveAggressiveClassifier()
PAC.fit(tfid_x_train,y_train)

In [29]:
PAC_pred = PAC.predict(tfid_x_test)
score = accuracy_score(y_test,PAC_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.82%


In [30]:
PAC_cf = confusion_matrix(y_test,PAC_pred, labels=['FAKE','REAL'])
print(PAC_cf)

[[566  49]
 [ 42 610]]


K-NEAREST CLASSIFIER

In [31]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
KNN = KNeighborsClassifier()
KNN.fit(tfid_x_train,y_train)

In [33]:
KNN_pred = KNN.predict(tfid_x_test)
score = accuracy_score(y_test,KNN_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 56.27%


In [34]:
KNN_cf = confusion_matrix(y_test,KNN_pred, labels=['FAKE','REAL'])
print(KNN_cf)

[[615   0]
 [554  98]]


In [35]:
def fake_news_det(news):
    input_data = [news]
    vectorized_input_data = tfvect.transform(input_data)
    prediction = LR.predict(vectorized_input_data)
    print(prediction)

In [36]:
fake_news_det('U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sundayâ€™s unity march against terrorism.')

['REAL']


In [37]:
fake_news_det("""Go to Article 
President Barack Obama has been campaigning hard for the woman who is supposedly going to extend his legacy four more years. The only problem with stumping for Hillary Clinton, however, is sheâ€™s not exactly a candidate easy to get too enthused about.  """)

['FAKE']


# FILE DUMP

LOGISTIC REGRESSION

In [38]:
import pickle

In [39]:
pickle.dump(LR,open('LRmodel.pkl', 'wb'))

In [40]:
loaded_model = pickle.load(open('LRmodel.pkl', 'rb'))

In [41]:
def fake_news_det1(news):
    input_data = [news]
    vectorized_input_data = tfvect.transform(input_data)
    prediction = loaded_model.predict(vectorized_input_data)
    print(prediction)

In [42]:
fake_news_det1('U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sundayâ€™s unity march against terrorism.')

['REAL']


In [43]:
fake_news_det1("""Go to Article 
President Barack Obama has been campaigning hard for the woman who is supposedly going to extend his legacy four more years. The only problem with stumping for Hillary Clinton, however, is sheâ€™s not exactly a candidate easy to get too enthused about.  """)

['FAKE']
