In [1]:
import numpy as np 
import pandas as pd 
import gensim
import nltk as nl
from sklearn.feature_extraction import _stop_words
import os
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split

In [2]:
# Work from the directory that holds the CSV files read below. The location
# can be overridden with the HACKATHON_DATA_DIR environment variable so the
# notebook is not tied to one machine's drive layout; when the variable is
# unset the original hard-coded path is used.
os.chdir(os.environ.get('HACKATHON_DATA_DIR', 'D:/Hackathon'))

# Loading Dataset

In [3]:
def _read_text_column(csv_path, column):
    """Load a single column from a semicolon-delimited CSV and drop missing rows."""
    frame = pd.read_csv(csv_path, usecols=[column], sep=';')
    return frame.dropna()


# One frame per source file; each keeps only its text column.
Headlines = _read_text_column('buzzfeed-v02.csv', 'Text')
Headlines1 = _read_text_column('fake_or_real_news.csv', 'title')
Headlines2 = _read_text_column('UnreliableNewsData/full.csv', 'Text')

# Preview the first rows of the first frame.
Headlines.head()

Unnamed: 0,Text
0,"A little less than a decade ago, hockey fans w..."
1,The writers of the HBO series The Sopranos too...
2,Despite claims from the TV news outlet to offe...
3,After receiving 'subpar' service and experienc...
4,After watching his beloved Seattle Mariners pr...


In [4]:
# Remove rows whose text repeats an earlier row within the same source.
Headlines = Headlines.drop_duplicates(subset=['Text'])
Headlines1 = Headlines1.drop_duplicates(subset=['title'])
Headlines2 = Headlines2.drop_duplicates(subset=['Text'])

In [5]:
# Give the text column the same name in all three frames so they can be
# concatenated into one dataset later on.
Headlines = Headlines.rename({'Text': 'headline_text'}, axis='columns')
Headlines1 = Headlines1.rename({'title': 'headline_text'}, axis='columns')
Headlines2 = Headlines2.rename({'Text': 'headline_text'}, axis='columns')

# Labelling

In [6]:
# Create the label column for each dataset (0 = real, 1 = fake):
#   - the buzzfeed headlines dataset is used as real headlines;
#   - the fake-or-real-news dataset and the unreliable-news dataset are used
#     as fake headlines.
Headlines = Headlines.assign(fake=0)
Headlines1 = Headlines1.assign(fake=1)
Headlines2 = Headlines2.assign(fake=1)

In [7]:
# Stack the three labelled sources into one frame.
# Only the text column is coerced to str: casting the whole frame with
# `.astype(str)` also turned the 0/1 labels into the strings '0'/'1', which
# numeric consumers (e.g. a Keras model trained with binary cross-entropy)
# cannot use as targets.
# ignore_index=True gives every row a unique index label instead of repeating
# the index values each source frame carried on its own.
data1 = pd.concat(
    [Headlines, Headlines1, Headlines2],
    ignore_index=True,
).astype({'headline_text': str})

# Working copy; data1 itself is left holding the unprocessed text.
data = data1.copy()
print('Training dataset contains: {} Real headlines and {} Fake headlines.'.format(len(Headlines),len(Headlines1)+len(Headlines2)))

Training dataset contains: 38720 Real headlines and 44977 Fake headlines.


# Data Processing

In [8]:
# English stop-word collections shipped with scikit-learn, gensim and NLTK.
sklearn_stopwords = _stop_words.ENGLISH_STOP_WORDS
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
nltk_stopwords = nl.corpus.stopwords.words('english')

# Merge all three into one set of distinct stop words.
combined_stopwords = sklearn_stopwords | frozenset(nltk_stopwords) | frozenset(gensim_stopwords)

In [9]:
# Report how many entries each stop-word collection holds.
stopword_reports = (
    ('NLTK has {} stop words', nltk_stopwords),
    ('Gensim has {} stop words', gensim_stopwords),
    ('Sklearn has {} stop words', sklearn_stopwords),
    ('Combined stopwords list has {} stop words', combined_stopwords),
)
for report_template, stopword_collection in stopword_reports:
    print(report_template.format(len(stopword_collection)))

NLTK has 179 stop words
Gensim has 337 stop words
Sklearn has 318 stop words
Combined stopwords list has 390 stop words


In [10]:
# Single PorterStemmer instance (nltk.stem), reused by the preprocessing cell
# below to stem every headline token.
porter_stemmer = PorterStemmer() 

In [11]:
def _normalize_headline(text):
    """Return ``text`` lower-cased, filtered and stemmed as one space-joined string.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. keep only purely alphabetic tokens (``str.isalpha``);
      3. drop stop words *before* stemming, because the stop-word lists hold
         unstemmed words and a stemmed form (e.g. "thi" for "this") no longer
         matches them;
      4. stem the remaining tokens with the shared Porter stemmer;
      5. drop any stem that is itself a stop word, which keeps removing
         everything a stop-word filter applied only after stemming removed.
    """
    alphabetic = (tok for tok in text.lower().split() if tok.isalpha())
    stems = (
        porter_stemmer.stem(tok)
        for tok in alphabetic
        if tok not in combined_stopwords
    )
    return ' '.join(stem for stem in stems if stem not in combined_stopwords)


# Always start from the unprocessed text kept in data1 (data was created as a
# copy of it), so re-running this cell does not process already-stemmed text.
data['headline_text'] = data1['headline_text'].apply(_normalize_headline)

# Splitting Dataset

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
headline_column = data['headline_text']
label_column = data['fake']
x_train, x_test, y_train, y_test = train_test_split(
    headline_column,
    label_column,
    test_size=0.2,
    random_state=7,
)

# Construct models with TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [14]:
# Learn the TF-IDF vocabulary (capped at 300 features, tokenised with NLTK's
# word_tokenize) from the training split only, then apply that same fitted
# vectorizer to the test split.
tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, max_features=300)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new method when it exists and
# fall back to the old one on older installations; either way the result is a
# plain list of feature names.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()

In [None]:
# Baseline classifiers trained on the TF-IDF features.
# The tree-based estimators are randomised, so they are seeded (with the same
# value used for the train/test split) to make their scores reproducible.
dt = DecisionTreeClassifier(random_state=7)
rf = RandomForestClassifier(random_state=7)
svc = SVC(kernel='linear')
knn = KNeighborsClassifier()
nb = MultinomialNB()

# Fit every model on the same training matrix and labels.
dt.fit(tfidf_train, y_train)
rf.fit(tfidf_train, y_train)
svc.fit(tfidf_train, y_train)
knn.fit(tfidf_train, y_train)
nb.fit(tfidf_train, y_train)

In [None]:
# Print each fitted classifier's accuracy on the held-out test split, as a
# percentage rounded to two decimals.
accuracy_reports = (
    ("Testing Acc. of Decision Tree: {} %", dt),
    ("Testing Acc. of Random Forest: {} %", rf),
    ("Testing Acc. of SVC: {} %", svc),
    ("Testing Acc. of K-NN: {} %", knn),
    ("Testing Acc. of Naive Bayesian: {} %", nb),
)
for accuracy_template, fitted_model in accuracy_reports:
    accuracy_pct = round(fitted_model.score(tfidf_test, y_test) * 100, 2)
    print(accuracy_template.format(accuracy_pct))

In [None]:
def _as_dense_array(matrix):
    """Return ``matrix`` as a plain ndarray, converting from sparse only if needed.

    ``toarray()`` is used instead of ``todense()`` because the latter returns
    an ``np.matrix``. Inputs that are already dense pass through
    ``np.asarray``, so this cell can be re-run without failing on a second
    conversion.
    """
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


# Keras is given dense feature arrays.
tfidf_train = _as_dense_array(tfidf_train)
tfidf_test = _as_dense_array(tfidf_test)

# The sigmoid / binary cross-entropy setup needs numeric 0/1 targets; cast
# explicitly so labels stored as the strings '0'/'1' are accepted as well.
y_train_numeric = np.asarray(y_train, dtype='float32')
y_test_numeric = np.asarray(y_test, dtype='float32')

# Two hidden layers of 64 ReLU units with light dropout, and a single sigmoid
# output unit for the binary fake/real decision.
neural_network = Sequential()
neural_network.add(Dense(64, input_dim=tfidf_train.shape[1], activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(64, activation='relu'))
neural_network.add(Dropout(0.1))
neural_network.add(Dense(1, activation='sigmoid'))
neural_network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = neural_network.fit(tfidf_train, y_train_numeric, epochs=50, batch_size=512, verbose=0)

# evaluate() returns the loss followed by the compiled metrics; only the
# accuracy is reported.
_, test_acc = neural_network.evaluate(tfidf_test, y_test_numeric, verbose=0)
print ("Testing Acc. of DNN: {} %".format(round(test_acc * 100, 2)))