In [1]:
#Prerequisite: the NLTK resources 'averaged_perceptron_tagger', 'wordnet' and 'stopwords'
#must already be downloaded; the preprocessing code further down relies on them.
import pandas as pd
import numpy as np
import sklearn as skl
#Load the raw news dataset into `data`.
#NOTE(review): absolute, machine-specific path - this cell only runs where that exact file exists.
data = pd.read_csv('G:/Users/Krunal/Desktop/news.csv')

In [2]:
#Preview the first five rows of the original (raw) data
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
#Count the missing values in every column
data.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [4]:
#The null check found no missing values.
#Number of rows per label
data['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [5]:
#Categories are balanced.
#Defining preprocessing function
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is POS-tagged on its own (as a one-element token list) and the
    first letter of the resulting tag is mapped to the constant that
    WordNetLemmatizer.lemmatize() accepts as its ``pos`` argument.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV;
    wordnet.NOUN is returned for any tag whose first letter is not J/N/V/R.
    """
    #Imported here because the notebook never imports the top-level nltk package
    #(only `from nltk.corpus import wordnet`), so the bare name `nltk` would be undefined.
    import nltk

    #pos_tag returns [(word, tag)]; keep only the first letter of the tag
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocessing(col):
    """Clean a pandas Series of raw text so it can be vectorized.

    Steps, applied in this order to every entry:
      1. lower-case the text;
      2. lemmatize each whitespace-separated token with WordNet, using the
         part of speech returned by get_wordnet_pos;
      3. drop English stopwords;
      4. drop tokens that consist only of digits;
      5. delete every character that is neither a word character nor whitespace;
      6. drop tokens of length 1.
    Re-joining the tokens on single spaces also collapses repeated whitespace.

    Parameters
    ----------
    col : pandas.Series
        Series of str values. str.lower is applied to each entry, so a
        non-str entry (e.g. NaN) raises TypeError.

    Returns
    -------
    pandas.Series
        The cleaned text, one entry per input entry.
    """
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    #Lower case
    lower = col.apply(str.lower)

    #Lemmatizing
    lemmatizer = WordNetLemmatizer()
    #Tagging + lemmatizing depends only on the token itself, so each distinct
    #token is processed once and then served from this cache.
    lemma_cache = {}

    def lemmatize_word(word):
        lemma = lemma_cache.get(word)
        if lemma is None:
            lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
            lemma_cache[word] = lemma
        return lemma

    lemmatized = lower.apply(
        lambda text: ' '.join(lemmatize_word(word) for word in str(text).split())
    )

    #removing stopwords and extra spaces
    #NOTE(review): this runs before punctuation is stripped, so a stopword with
    #punctuation attached (e.g. "the,") is not matched here.
    stop_words = set(stopwords.words('english'))
    rem_stopwords = lemmatized.apply(
        lambda text: " ".join(word for word in text.split() if word not in stop_words)
    )

    #removing numbers
    rem_num = rem_stopwords.apply(
        lambda text: " ".join(word for word in text.split() if not word.isdigit())
    )

    #removing punctuations
    #regex=True is passed explicitly: without it, pandas versions whose default is
    #regex=False treat the pattern as a literal string and remove nothing.
    rem_punc = rem_num.str.replace(r'[^\w\s]', '', regex=True)

    #removing words of length 1
    rem_one = rem_punc.apply(
        lambda text: ' '.join([word for word in text.split() if len(word) > 1])
    )

    return rem_one

In [6]:
#Processing data for vectorization
#news_processed.csv was produced from the raw data with the commented-out commands below
#(kept as a record of how it was built); this cell only loads that saved result.
#data = pd.concat([pd.get_dummies(data.label).drop('FAKE',axis=1),data.drop(['label','Unnamed: 0'],axis=1)],axis=1)
#data['xdata'] = data.title + " " + data.text
#data = data.drop(['title', 'text'], axis = 1)
#data['xdata'] = preprocessing(data['xdata'])
#NOTE: this rebinds `data`, replacing the raw frame loaded at the top of the notebook.
#NOTE(review): absolute, machine-specific path - this cell only runs where that exact file exists.
data = pd.read_csv('G:/Users/Krunal/Desktop/news_processed.csv')

In [12]:
#Preview the first five rows of the processed data
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,REAL,xdata
0,0,0,smell hillarys fear daniel greenfield shillman...
1,1,0,watch exact moment paul ryan commit political ...
2,2,1,kerry go paris gesture sympathy us secretary s...
3,3,0,bernie supporter twitter erupt anger dnc we tr...
4,4,1,battle new york primary matter primary day new...


In [7]:
#Set up the (not yet fitted) TF-IDF vectorizer and split x/y into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#word 1- to 3-grams, ignoring terms that appear in fewer than 5 documents
tf = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
xset, yset = data['xdata'], data['REAL']
#33% of the rows are held out for testing; the seed fixes the shuffle
xtrain, xtest, ytrain, ytest = train_test_split(
    xset,
    yset,
    random_state=232,
    test_size=0.33,
)

In [8]:
#vectorizing text
#The vectorizer is fitted on the training texts only; the test texts are then
#transformed with that same fitted vectorizer.
#NOTE(review): .astype(str) turns any non-string entry into text, so a missing value
#(NaN) would become the literal token 'nan' - confirm whether xdata contains any.
xtraintf = tf.fit_transform(xtrain.values.astype(str))
xtesttf = tf.transform(xtest.values.astype(str))

In [9]:
#Create the classifier with a fixed seed, then fit it on the training TF-IDF features
from sklearn.linear_model import PassiveAggressiveClassifier

pac = PassiveAggressiveClassifier(random_state=232)
pac = pac.fit(xtraintf, ytrain)

In [10]:
#Predict labels for the test features and print the percentage that match ytest
ypred = pac.predict(xtesttf)
print(str(100 * np.mean(ytest == ypred)) + "% accuracy")

95.3610712577714% accuracy


In [11]:
#Confusion matrix of the test-set predictions (rows: true label, columns: predicted label)
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=ytest, y_pred=ypred)

array([[1000,   41],
       [  56,  994]], dtype=int64)