In [1]:
!pip install numpy pandas scikit-learn



In [2]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

In [3]:
# Load the news dataset from the CSV file in the working directory.
df = pd.read_csv('news.csv')

# A notebook cell only renders its *last* expression, so a bare `df.shape`
# here would be evaluated and silently discarded; print it explicitly.
print(df.shape)

# Preview the first ten rows.
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\r\nI’m not an immigrant, but my grandparent...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [4]:
# Count the missing values in every column of the loaded frame.
print(df.isna().sum())

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64


In [5]:
# Pull out the `label` column and preview its first five entries.
labels = df['label']
labels.head(5)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
# Pull out the `text` column (article bodies) and preview its first five entries.
text = df['text']
text.head(5)

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [7]:
# Pull out the `title` column and preview its first five entries.
title = df['title']
title.head(5)

0                         You Can Smell Hillary’s Fear
1    Watch The Exact Moment Paul Ryan Committed Pol...
2          Kerry to go to Paris in gesture of sympathy
3    Bernie supporters on Twitter erupt in anger ag...
4     The Battle of New York: Why This Primary Matters
Name: title, dtype: object

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. Adjust test_size to change the train/test proportions.
x_train, x_test, y_train, y_test = train_test_split(
    df.text,
    df.label,
    test_size=0.2,
    random_state=5,
)

In [9]:
# Display the training texts produced by the split.
x_train

5667    Russian state media reported that many of the ...
5357    Republican presidential hopeful Donald Trump c...
4634    Email \r\nThe various communities in the state...
1292    Most are familiar with the “Jaywalking” quizze...
5058    Snowden’s former employer hires ex-FBI directo...
                              ...                        
3046    By Shane Trejo This brave new world we are liv...
1725    Migrant Crisis Disclaimer \r\nWe here at the D...
4079    Europe or Russia: Could Moldova's presidential...
2254    With House Republicans pushing for a governmen...
2915    Ask him, and he'll tell you himself. "I'm very...
Name: text, Length: 5068, dtype: object

In [10]:
# Display the held-out test texts produced by the split.
x_test

1227      Bill Clinton is a sex-addicted ‘monster’ who...
5803    Putin Takes On The NWO, October 2016 # thinkbi...
4976    Washington (CNN) Bernie Sanders' campaign on F...
1112    Banana Republic Election in the United States?...
6083    Email \r\n\r\nDemocratic Vice President Joe Bi...
                              ...                        
4502      Recipient Email => \r\nThere’s playing with ...
5363    Donald Trump, trailing narrowly in presidentia...
5660    Interviews The FBI probe was inevitable “becau...
4955    But he rejects the idea that Clinton's campaig...
1931    Wikileaks Just Released Her Full Isis Donor Li...
Name: text, Length: 1267, dtype: object

In [11]:
# Display the labels that correspond to the training texts.
y_train

5667    REAL
5357    REAL
4634    FAKE
1292    REAL
5058    FAKE
        ... 
3046    FAKE
1725    FAKE
4079    FAKE
2254    REAL
2915    REAL
Name: label, Length: 5068, dtype: object

In [12]:
# Display the labels that correspond to the held-out test texts.
y_test

1227    FAKE
5803    FAKE
4976    REAL
1112    FAKE
6083    FAKE
        ... 
4502    FAKE
5363    REAL
5660    FAKE
4955    REAL
1931    FAKE
Name: label, Length: 1267, dtype: object

In [13]:
# TF-IDF featuriser: English stop words are dropped, and so is any term that
# occurs in more than 70% of the documents (max_df=0.7).
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training texts only ...
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# ... then encode the test texts with that already-fitted vocabulary.
tfidf_test = tfidf_vectorizer.transform(x_test)

In [14]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features.
# random_state is pinned because the estimator shuffles the training data
# between epochs by default; without a seed the fitted weights (and therefore
# the accuracy and confusion matrix) change from run to run, even though the
# train/test split itself is seeded.
model = PassiveAggressiveClassifier(max_iter=50, random_state=5)
model.fit(tfidf_train, y_train)

# Evaluate on the held-out test set and report accuracy as a percentage.
y_pred = model.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.16%


In [15]:
# Rows are the true classes and columns the predicted classes, both ordered
# FAKE then REAL.
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['FAKE', 'REAL'])

array([[599,  38],
       [ 36, 594]], dtype=int64)

In [26]:
# Smoke-test the trained pipeline on a single hand-written input string.
# NOTE(review): the vectorizer and model were fitted on df['text'] (full
# article bodies), so a two-word input is far from the training data — treat
# the predicted label as a wiring check, not as evidence of model quality.
input_data = ["Bush died"]
# Encode with the already-fitted vectorizer (transform only, no refitting).
vectorized_input_data = tfidf_vectorizer.transform(input_data)
prediction = model.predict(vectorized_input_data)
print(prediction)

['REAL']


In [30]:
# Persist both fitted objects: inference needs the TF-IDF vectorizer (to encode
# new text with the same vocabulary) as well as the classifier itself.
# NOTE: pickle files can execute arbitrary code when loaded — only unpickle
# these artifacts from a location you trust.
for artifact, filename in (
    (model, 'model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)