# Fake news detection

In [1]:
import re
import glob
import numpy as np
import os
import json
import argparse
import time
import codecs
import string
import codecs
import random
import scipy.sparse as sp

from random import randrange
from scipy.sparse import csr_matrix, csc_matrix, hstack, coo_matrix
from gensim.matutils import Scipy2Corpus, corpus2csc
from gensim.models.logentropy_model import LogEntropyModel
from collections import defaultdict, Counter
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords as sw
from string import punctuation

## Feature extraction functions

In [2]:
# Extracts word n-grams; with n=1 this is equivalent to a bag of words.
def wordNgrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    Parameters
    ----------
    text : str
        Input text; it is tokenised on whitespace with ``str.split()``.
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        One string per n-gram, in order of appearance. The list is empty when
        ``n`` is not positive or when the text has fewer than ``n`` tokens.
    """
    # A non-positive size has no meaningful n-grams; without this guard the
    # slicing below would yield empty or truncated strings.
    if n <= 0:
        return []
    # `word not in string.punctuation` is a substring test: a token is dropped
    # when it appears as a contiguous run inside string.punctuation (this
    # covers every single punctuation character).
    tokens = [word for word in text.split() if word not in string.punctuation]
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [3]:
# Sample Urdu sentence used by the demo cells; the last expression displays its word bigrams.
text='پاکستان کے وزیراعظم عمران خان سعودی عرب کے دارالحکومت ریاض میں ملک میں سرمایہ کاری کے حوالے سے سالانہ کانفرنس میں شرکت کر رہے ہیں حکومت پاکستان کا کہنا ہے کہ سعودی عرب نے پاکستان کو معاشی بحران سے نمٹنے میں مدد کے لیے ایک سال کے لیے تین ارب ڈالر دینے پر اتفاق کیا ہے دفترِ خارجہ کی جانب سے منگل کی شب جاری ہونے والے اعلامیے میں بتایا گیا ہے کہ یہ فیصلہ'
wordNgrams(text,2)

['پاکستان کے',
 'کے وزیراعظم',
 'وزیراعظم عمران',
 'عمران خان',
 'خان سعودی',
 'سعودی عرب',
 'عرب کے',
 'کے دارالحکومت',
 'دارالحکومت ریاض',
 'ریاض میں',
 'میں ملک',
 'ملک میں',
 'میں سرمایہ',
 'سرمایہ کاری',
 'کاری کے',
 'کے حوالے',
 'حوالے سے',
 'سے سالانہ',
 'سالانہ کانفرنس',
 'کانفرنس میں',
 'میں شرکت',
 'شرکت کر',
 'کر رہے',
 'رہے ہیں',
 'ہیں حکومت',
 'حکومت پاکستان',
 'پاکستان کا',
 'کا کہنا',
 'کہنا ہے',
 'ہے کہ',
 'کہ سعودی',
 'سعودی عرب',
 'عرب نے',
 'نے پاکستان',
 'پاکستان کو',
 'کو معاشی',
 'معاشی بحران',
 'بحران سے',
 'سے نمٹنے',
 'نمٹنے میں',
 'میں مدد',
 'مدد کے',
 'کے لیے',
 'لیے ایک',
 'ایک سال',
 'سال کے',
 'کے لیے',
 'لیے تین',
 'تین ارب',
 'ارب ڈالر',
 'ڈالر دینے',
 'دینے پر',
 'پر اتفاق',
 'اتفاق کیا',
 'کیا ہے',
 'ہے دفترِ',
 'دفترِ خارجہ',
 'خارجہ کی',
 'کی جانب',
 'جانب سے',
 'سے منگل',
 'منگل کی',
 'کی شب',
 'شب جاری',
 'جاری ہونے',
 'ہونے والے',
 'والے اعلامیے',
 'اعلامیے میں',
 'میں بتایا',
 'بتایا گیا',
 'گیا ہے',
 'ہے کہ',
 'کہ یہ',
 'یہ فیصلہ']

In [4]:
# Extracts character n-grams
def charNgrams(text, n):
    """Return the character n-grams of ``text``, each suffixed with '_cng'.

    Parameters
    ----------
    text : str
        Input text; every character (including spaces) takes part in the
        n-grams.
    n : int
        Number of consecutive characters per n-gram.

    Returns
    -------
    list of str
        One ``<chars>_cng`` string per window, in order of appearance. The
        list is empty when ``n`` is not positive or the text is shorter than
        ``n`` characters.
    """
    # A non-positive size would otherwise produce bare '_cng' or truncated
    # slices instead of real n-grams.
    if n <= 0:
        return []
    # The '_cng' suffix marks the feature type so it cannot be confused with
    # features produced by the other extractors.
    return [text[i:i + n] + '_cng' for i in range(len(text) - n + 1)]

In [5]:
# Demo: display the character unigrams of the sample sentence `text`.
charNgrams(text, 1)

['پ_cng',
 'ا_cng',
 'ک_cng',
 'س_cng',
 'ت_cng',
 'ا_cng',
 'ن_cng',
 ' _cng',
 'ک_cng',
 'ے_cng',
 ' _cng',
 'و_cng',
 'ز_cng',
 'ی_cng',
 'ر_cng',
 'ا_cng',
 'ع_cng',
 'ظ_cng',
 'م_cng',
 ' _cng',
 'ع_cng',
 'م_cng',
 'ر_cng',
 'ا_cng',
 'ن_cng',
 ' _cng',
 'خ_cng',
 'ا_cng',
 'ن_cng',
 ' _cng',
 'س_cng',
 'ع_cng',
 'و_cng',
 'د_cng',
 'ی_cng',
 ' _cng',
 'ع_cng',
 'ر_cng',
 'ب_cng',
 ' _cng',
 'ک_cng',
 'ے_cng',
 ' _cng',
 'د_cng',
 'ا_cng',
 'ر_cng',
 'ا_cng',
 'ل_cng',
 'ح_cng',
 'ک_cng',
 'و_cng',
 'م_cng',
 'ت_cng',
 ' _cng',
 'ر_cng',
 'ی_cng',
 'ا_cng',
 'ض_cng',
 ' _cng',
 'م_cng',
 'ی_cng',
 'ں_cng',
 ' _cng',
 'م_cng',
 'ل_cng',
 'ک_cng',
 ' _cng',
 'م_cng',
 'ی_cng',
 'ں_cng',
 ' _cng',
 'س_cng',
 'ر_cng',
 'م_cng',
 'ا_cng',
 'ی_cng',
 'ہ_cng',
 ' _cng',
 'ک_cng',
 'ا_cng',
 'ر_cng',
 'ی_cng',
 ' _cng',
 'ک_cng',
 'ے_cng',
 ' _cng',
 'ح_cng',
 'و_cng',
 'ا_cng',
 'ل_cng',
 'ے_cng',
 ' _cng',
 'س_cng',
 'ے_cng',
 ' _cng',
 'س_cng',
 'ا_cng',
 'ل_cng',
 'ا_cng',
 'ن_cng',


In [10]:
def load_diccionario(ruta, encoding="utf-8"):
    """Load a term dictionary (one term per line) into a set.

    Parameters
    ----------
    ruta : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    set of str
        The lower-cased terms with trailing whitespace removed. Blank lines
        are skipped so the set never contains the empty string.

    Raises
    ------
    SystemExit
        With exit code 1 when the file cannot be opened or read; the I/O
        error is printed first.
    """
    terms = set()
    try:
        # The context manager guarantees the handle is closed, also on errors.
        with open(ruta, "r", encoding=encoding) as handle:
            for linea in handle:
                linea = linea.rstrip()
                if linea:
                    terms.add(linea.lower())
    except IOError as e:
        print ("Error: "+ruta+" I/O error({0}): {1}".format(e.errno, e.strerror))
        # Raised explicitly rather than via the interactive `exit` helper,
        # whose behaviour depends on the interpreter/front-end in use.
        raise SystemExit(1)
    return terms

In [13]:
#Extracts function words n-grams with a pre-loaded dictionary
def funcNgrams(text, n, stop_words=None, stopwords_path='stop_words.txt'):
    """Return n-grams built from the function words (stop words) found in ``text``.

    Parameters
    ----------
    text : str
        Input text.
    n : int
        Number of consecutive function words per n-gram.
    stop_words : iterable of str, optional
        Pre-loaded function-word dictionary. When omitted, the dictionary is
        read from ``stopwords_path`` with ``load_diccionario`` on every call;
        pass it explicitly to avoid re-reading the file for each text.
    stopwords_path : str, optional
        File used when ``stop_words`` is not given.

    Returns
    -------
    list of str
        The function words found in the text, grouped into windows of ``n``
        and joined with '_', each suffixed with '_fwn'. Empty when ``n`` is
        not positive, when the dictionary has no usable entry, or when fewer
        than ``n`` function words occur.
    """
    if n <= 0:
        return []
    if stop_words is None:
        stop_words = load_diccionario(stopwords_path)

    # Empty entries would make the alternation match the empty string at every
    # word boundary. Sorting longest-first makes the result independent of set
    # iteration order when one entry is a prefix of another.
    keys = sorted((key for key in stop_words if key), key=lambda k: (-len(k), k))
    if not keys:
        return []
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in keys) + r')\b')

    # Normalise whitespace and apostrophes, then strip ASCII punctuation.
    text = re.sub(r"(\n+|\r+|(\r\n)+)", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"’", "'", text)
    # re.escape keeps characters such as '\\', ']' and '^' literal inside the
    # character class, so every punctuation character is really removed.
    text = re.sub(r"[" + re.escape(punctuation) + r"]+", "", text)

    terms = pattern.findall(text)
    return ['_'.join(terms[i:i + n]) + "_fwn" for i in range(len(terms) - n + 1)]

In [14]:
# Demo: number of function-word bigrams found in the sample sentence `text`.
len(funcNgrams(text, 2))

35

In [15]:
def extract_features(text,cn,wn,fn):
    """Collect every requested n-gram feature of one text in a single list.

    The text is lower-cased first. ``wn``, ``cn`` and ``fn`` are iterables of
    n-gram sizes for word, character and function-word n-grams respectively;
    a size of 0 is skipped. Features are returned in that order: word
    n-grams, then character n-grams, then function-word n-grams.
    """
    lowered = text.lower()
    collected = []
    collected += [gram for size in wn if size != 0
                  for gram in wordNgrams(lowered, size)]
    collected += [gram for size in cn if size != 0
                  for gram in charNgrams(lowered, size)]
    collected += [gram for size in fn if size != 0
                  for gram in funcNgrams(lowered, size)]
    return collected

In [18]:
# Extracts the features of every text in 'texts'; the features of each text are joined into one string with the separator '&%$'
def process_texts(texts,cn,wn,fn):
    """Extract features for a collection of texts.

    Returns a pair ``(joined, counts)``: ``joined`` holds one string per text
    with its features joined by '&%$', and ``counts`` is a Counter of feature
    occurrences accumulated over all texts.
    """
    joined_per_text = []
    feature_counts = Counter()
    for document in texts:
        document_features = extract_features(document, cn, wn, fn)
        feature_counts.update(document_features)
        joined_per_text.append('&%$'.join(document_features))
    return joined_per_text, feature_counts

## Reading and preparing the corpus

In [20]:
def preprocessText(text):
    """Replace every run of consecutive digits in ``text`` with a single '0'.

    Returns the normalised text; the input string is not modified.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"\d+", "0", text)

In [21]:
#utility function for reading files
def read_txt_files(files, encoding='utf-8'):
    """Read and pre-process a list of text files.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to read.
    encoding : str, optional
        Text encoding used to decode the files. Defaults to UTF-8 so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    (list of str, list of str)
        The pre-processed content of each file (see ``preprocessText``) and,
        in the same order, a topic tag made of the ASCII letters of the file
        name up to its first '.' (e.g. 'bus' for 'bus12.txt').
    """
    text=[]
    topic=[]
    for file_path in files:
        print('news',file_path)
        with open(file_path, 'r', encoding=encoding) as infile:
            text.append(preprocessText(infile.read()))
        # Take the tag from the file name itself instead of a fixed position
        # in the '/'-separated path, so any directory depth or separator works.
        stem = os.path.basename(file_path).split('.')[0]
        topic.append(''.join(re.findall('[A-Za-z]', stem)))
    return text, topic

In [22]:
# Directories holding the real and fake news used for training
train_path_real='Corpus/Train/Real/'
train_path_fake='Corpus/Train/Fake/'

# Read every .txt file of each directory in sorted path order
real_news, real_news_topics = read_txt_files(sorted(glob.glob(train_path_real+'*.txt')))
fake_news, fake_news_topics = read_txt_files(sorted(glob.glob(train_path_fake+'*.txt')))

# Concatenate real and fake news into single arrays for training.
# Labels: 1 for real news, 0 for fake news, in the same order as the texts.
train_texts = np.concatenate((real_news, fake_news))
train_labels = np.concatenate((np.ones(len(real_news)), np.zeros(len(fake_news))))
train_topics = np.concatenate((real_news_topics, fake_news_topics))

news Corpus/Train/Real/bus1.txt
news Corpus/Train/Real/bus10.txt
news Corpus/Train/Real/bus11.txt
news Corpus/Train/Real/bus12.txt
news Corpus/Train/Real/bus13.txt
news Corpus/Train/Real/bus14.txt
news Corpus/Train/Real/bus15.txt
news Corpus/Train/Real/bus16.txt
news Corpus/Train/Real/bus17.txt
news Corpus/Train/Real/bus18.txt
news Corpus/Train/Real/bus19.txt
news Corpus/Train/Real/bus2.txt
news Corpus/Train/Real/bus20.txt
news Corpus/Train/Real/bus21.txt
news Corpus/Train/Real/bus22.txt
news Corpus/Train/Real/bus23.txt
news Corpus/Train/Real/bus24.txt
news Corpus/Train/Real/bus25.txt
news Corpus/Train/Real/bus26.txt
news Corpus/Train/Real/bus27.txt
news Corpus/Train/Real/bus28.txt
news Corpus/Train/Real/bus29.txt
news Corpus/Train/Real/bus3.txt
news Corpus/Train/Real/bus30.txt
news Corpus/Train/Real/bus31.txt
news Corpus/Train/Real/bus32.txt
news Corpus/Train/Real/bus33.txt
news Corpus/Train/Real/bus34.txt
news Corpus/Train/Real/bus35.txt
news Corpus/Train/Real/bus36.txt
news Corpus/T

news Corpus/Train/Fake/sp62.txt
news Corpus/Train/Fake/sp63.txt
news Corpus/Train/Fake/sp65.txt
news Corpus/Train/Fake/sp68.txt
news Corpus/Train/Fake/sp8.txt
news Corpus/Train/Fake/sp9.txt
news Corpus/Train/Fake/tch1.txt
news Corpus/Train/Fake/tch10.txt
news Corpus/Train/Fake/tch11.txt
news Corpus/Train/Fake/tch12.txt
news Corpus/Train/Fake/tch13.txt
news Corpus/Train/Fake/tch14.txt
news Corpus/Train/Fake/tch15.txt
news Corpus/Train/Fake/tch16.txt
news Corpus/Train/Fake/tch17.txt
news Corpus/Train/Fake/tch18.txt
news Corpus/Train/Fake/tch19.txt
news Corpus/Train/Fake/tch2.txt
news Corpus/Train/Fake/tch20.txt
news Corpus/Train/Fake/tch21.txt
news Corpus/Train/Fake/tch22.txt
news Corpus/Train/Fake/tch23.txt
news Corpus/Train/Fake/tch24.txt
news Corpus/Train/Fake/tch25.txt
news Corpus/Train/Fake/tch26.txt
news Corpus/Train/Fake/tch27.txt
news Corpus/Train/Fake/tch28.txt
news Corpus/Train/Fake/tch29.txt
news Corpus/Train/Fake/tch3.txt
news Corpus/Train/Fake/tch30.txt
news Corpus/Train/Fak

In [23]:
# Number of real and fake documents currently held in real_news / fake_news
# (at this point these are the training lists).
print ('Train:')
print ('\t Real:',len(real_news))
print ('\t Fake:',len(fake_news))

Train:
	 Real: 350
	 Fake: 288


In [24]:
# Directories holding the real and fake news used for testing
test_path_real='Corpus/Test/Real/'
test_path_fake='Corpus/Test/Fake/'

# NOTE: real_news / fake_news (and their topic lists) are reassigned here, so
# from this cell on they refer to the test documents, not the training ones.
real_news, real_news_topics = read_txt_files(sorted(glob.glob(test_path_real+'*.txt')))
fake_news, fake_news_topics = read_txt_files(sorted(glob.glob(test_path_fake+'*.txt')))

# Concatenate real and fake news into single arrays for testing.
# Labels: 1 for real news, 0 for fake news, in the same order as the texts.
test_texts = np.concatenate((real_news, fake_news))
test_labels = np.concatenate((np.ones(len(real_news)), np.zeros(len(fake_news))))
test_topics = np.concatenate((real_news_topics, fake_news_topics))

news Corpus/Test/Real/bus100.txt
news Corpus/Test/Real/bus71.txt
news Corpus/Test/Real/bus72.txt
news Corpus/Test/Real/bus73.txt
news Corpus/Test/Real/bus74.txt
news Corpus/Test/Real/bus75.txt
news Corpus/Test/Real/bus76.txt
news Corpus/Test/Real/bus77.txt
news Corpus/Test/Real/bus78.txt
news Corpus/Test/Real/bus79.txt
news Corpus/Test/Real/bus80.txt
news Corpus/Test/Real/bus81.txt
news Corpus/Test/Real/bus82.txt
news Corpus/Test/Real/bus83.txt
news Corpus/Test/Real/bus84.txt
news Corpus/Test/Real/bus85.txt
news Corpus/Test/Real/bus86.txt
news Corpus/Test/Real/bus87.txt
news Corpus/Test/Real/bus88.txt
news Corpus/Test/Real/bus89.txt
news Corpus/Test/Real/bus90.txt
news Corpus/Test/Real/bus91.txt
news Corpus/Test/Real/bus92.txt
news Corpus/Test/Real/bus93.txt
news Corpus/Test/Real/bus94.txt
news Corpus/Test/Real/bus95.txt
news Corpus/Test/Real/bus96.txt
news Corpus/Test/Real/bus97.txt
news Corpus/Test/Real/bus98.txt
news Corpus/Test/Real/bus99.txt
news Corpus/Test/Real/hlth100.txt
news 

In [25]:
# Number of real and fake documents currently held in real_news / fake_news
# (at this point these are the test lists).
print ('Test:')
print ('\t Real:',len(real_news))
print ('\t Fake:',len(fake_news))

Test:
	 Real: 150
	 Fake: 112


## Parametrization and feature extraction

In [81]:
# Parameters: n-gram sizes passed to process_texts / extract_features for each
# feature type. A value of 0 is skipped by extract_features, i.e. it disables
# that feature type.
cnvalues=[4] # character n-gram sizes
wnvalues=[0] # word n-gram sizes
fnvalues=[0] # function-word n-gram sizes

In [82]:
# Train feature extraction
print('Extracting features')
# train_features: one '&%$'-joined feature string per training text;
# dicOfFeatures: Counter of all features seen in the training texts.
train_features, dicOfFeatures = process_texts(train_texts,cnvalues,wnvalues,fnvalues)

# The tokenizer splits each joined string back into its features on the same
# '&%$' separator; min_df=2 and lowercase=False can be tuned here.
vectorizer = CountVectorizer(lowercase=False, min_df=2, tokenizer=lambda x: x.split('&%$')) #--> we can change this
train_data = vectorizer.fit_transform(train_features)
train_data = train_data.astype(float)
print('\t', 'labels', len(train_labels))
print('\t', 'texts', len(train_texts))
# This is the number of distinct features in dicOfFeatures, i.e. counted
# before the vectorizer applies min_df; compare with the train shape below.
print('\t', 'vocabulary size',len(dicOfFeatures))
print ('\t','Train shape:',train_data.shape)
print('\t', 'class dictribution',Counter(train_labels))

Extracting features
	 labels 638
	 texts 638
	 vocabulary size 55261
	 Train shape: (638, 36272)
	 class dictribution Counter({1.0: 350, 0.0: 288})


In [83]:
# Test feature extraction
print('Extracting Test features')
test_features,dicOfFeaturesTest = process_texts(test_texts,cnvalues,wnvalues,fnvalues)
# transform (not fit_transform): the test texts are mapped onto the vocabulary
# already fitted on the training features.
test_data = vectorizer.transform(test_features)
test_data = test_data.astype(float)

print('\t', 'texts', len(test_texts))
# Number of distinct features found in the test texts themselves
print('\t', 'vocabulary size',len(dicOfFeaturesTest))

print ('\t','Test shape:',test_data.shape)

Extracting Test features
	 texts 262
	 vocabulary size 41118
	 Test shape: (262, 36272)


## Frequency threshold

In [84]:
# Frequency threshold: keep only the feature columns whose total count over
# the training set is at least N.
# NOTE: this cell reads `train_data` and then overwrites it with the reduced
# matrix, so it is not idempotent. Re-run the train feature-extraction cell
# before re-running this one, otherwise `indices_zero` no longer matches the
# columns of `test_data`.
N=5
X=train_data
values=np.array(X.sum(axis=0)).ravel()  # total count of each feature column
thresholdMask=(values >= N)*1
# Column indices that fall below the threshold; reused for the test matrix.
indices_zero = list(np.nonzero(thresholdMask == 0)[0])
all_cols = np.arange(X.shape[1])
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
cols_to_keep = np.where(np.logical_not(np.isin(all_cols, indices_zero)))[0]
train_data = X[:, cols_to_keep]
#####
scaled_train_data=train_data
print('Train shape:',scaled_train_data.shape)

Train shape: (638, 20681)


In [85]:
# Frequency threshold for the test matrix: N is not redefined here; the
# columns dropped are exactly those in `indices_zero`, computed on the
# training data, so train and test keep the same feature columns.
Z=test_data
all_cols = np.arange(Z.shape[1])
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
cols_to_keep = np.where(np.logical_not(np.isin(all_cols, indices_zero)))[0]
test_data = Z[:, cols_to_keep]
scaled_test_data=test_data
print('Test shape:',scaled_test_data.shape)

Test shape: (262, 20681)


## Weighting schemes

In [None]:
# Feature weighting: re-weights the thresholded count matrices according to
# `feature_weight`. Any value other than the ones listed below leaves the raw
# counts untouched (reported as "tf").
#print ('only frequency:',test_data)
feature_weight='logent' # possible values: binary, logent, tfidf, norm, relat
print ('Train:',scaled_train_data.shape)
print ('Test:',scaled_test_data.shape)

if feature_weight == 'binary':
    # A separate Binarizer is fitted for train and for test.
    scaled_train_data = preprocessing.Binarizer().fit_transform(scaled_train_data)
    scaled_test_data = preprocessing.Binarizer().fit_transform(scaled_test_data)
    print ("feature_weight = binary")
    
elif feature_weight == 'logent':
    # The log-entropy model is built from the training corpus only and then
    # applied to both the training and the test corpus.
    Xc = Scipy2Corpus(scaled_train_data)
    log_ent = LogEntropyModel(Xc)
    X = log_ent[Xc]
    X = corpus2csc(X)
    # Transposed back after corpus2csc — presumably because it returns a
    # terms x documents matrix; confirm against the gensim documentation.
    scaled_train_data = sp.csc_matrix.transpose(X)
    
    Xtest = Scipy2Corpus(scaled_test_data)
    X = log_ent[Xtest]
    # The number of terms is passed explicitly so the test matrix has the same
    # number of feature columns as the training matrix.
    X = corpus2csc(X, scaled_train_data.shape[1])
    scaled_test_data = sp.csc_matrix.transpose(X)
    print ("feature_weight = logent")
    
elif feature_weight == 'tfidf':
    # Fitted on train, applied unchanged to test.
    transformer = TfidfTransformer()
    scaled_train_data = transformer.fit_transform(scaled_train_data)
    scaled_test_data = transformer.transform(scaled_test_data)
    print ("feature_weight = tfidf")
    
elif feature_weight=='norm':
    #scaled_train_data = preprocessing.normalize(scaled_train_data, norm='l2')
    # Scaling data: the scaler is fitted on train and applied unchanged to test.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    scaled_train_data = max_abs_scaler.fit_transform(scaled_train_data)
    scaled_test_data = max_abs_scaler.transform(scaled_test_data)
    print ("feature_weight = norm")
    
elif feature_weight=='relat':
    # Relative frequencies: every row is divided by its own row sum.
    # nan_to_num replaces NaN values — presumably those produced by rows
    # whose sum is zero.
    s = scaled_train_data.sum(axis = 1)
    scaled_train_data = coo_matrix(np.nan_to_num(scaled_train_data/s))

    s = scaled_test_data.sum(axis = 1)
    scaled_test_data = coo_matrix(np.nan_to_num(scaled_test_data/s))
    print ("feature_weight = relat")
    
else:
    # Unknown scheme: the matrices are left as they are.
    print ("feature_weight = tf")
    
print ('Train:',scaled_train_data.shape)
print ('Test:',scaled_test_data.shape)

Train: (638, 20681)
Test: (262, 20681)


## Classification Process - Training

In [78]:
# Utility scorer for cross-validation.
# These module-level lists accumulate the true and predicted labels of every
# call to the scorer, so a single report can be printed afterwards. They are
# only emptied when this cell is executed again.
originalclass=[]
predictedclass=[]
def classification_report_with_f1_score(y_true, y_pred):
    """Record the labels of one evaluation and return its F1 score.

    Appends ``y_true`` to ``originalclass`` and ``y_pred`` to
    ``predictedclass`` (module-level side effect), then returns
    ``f1_score(y_true, y_pred)``.
    """
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return f1_score(y_true, y_pred) # F1 score of this call (not accuracy)

In [79]:
print('Training Classifier')

# Applying classification algorithms
clf=LinearSVC(C=0.01,class_weight='balanced', random_state=85)
clfSVC=SVC(C=0.01, kernel='linear',class_weight='balanced')
clfMnb=MultinomialNB()
clfBnb=BernoulliNB()
clfLG=LogisticRegression(solver='lbfgs', tol=0.001, C=0.01,class_weight='balanced')

# This fitted LinearSVC is the model used later for the test predictions.
clf.fit(scaled_train_data, train_labels)

# Empty the scorer's accumulators first: they are module-level lists, so
# without this a re-run of the cell would append to the previous run and the
# report below would be computed on duplicated predictions.
originalclass.clear()
predictedclass.clear()
nested_score = cross_val_score(clf, X=scaled_train_data, y=train_labels, cv=10, scoring=make_scorer(classification_report_with_f1_score))
#cvScoreLinearSVC=cross_val_score(clf, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
# Report over the labels collected by the scorer across the 10 folds.
print(classification_report(originalclass, predictedclass))
print('10-Fold Cross-validation Linear SVC',nested_score.mean())

cvScoreLG=cross_val_score(clfLG, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Logistic Regression',cvScoreLG)

cvScoreMnb=cross_val_score(clfMnb, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)

cvScoreBnb=cross_val_score(clfBnb, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Bernoulli Naive Bayes',cvScoreBnb)

Training Classifier
             precision    recall  f1-score   support

        0.0       0.58      0.63      0.60       288
        1.0       0.67      0.62      0.64       350

avg / total       0.63      0.62      0.62       638

10-Fold Cross-validation Linear SVC 0.6057708773676397
10-Fold Cross-validation Logistic Regression 0.5898404026020847
10-Fold Cross-validation Multinomial Naive Bayes 0.7357299770385957
10-Fold Cross-validation Bernoulli Naive Bayes 0.5615320494748738


## Classification Process - Testing

In [80]:
# Evaluate the LinearSVC (`clf`) fitted in the training cell on the held-out
# test matrix; the other classifiers are only cross-validated, not tested here.
predictions=clf.predict(scaled_test_data)
print(classification_report(test_labels, predictions))
print('Accuracy',accuracy_score(test_labels, predictions))
print('F1-score',f1_score(test_labels, predictions))

             precision    recall  f1-score   support

        0.0       0.76      0.71      0.74       112
        1.0       0.80      0.83      0.81       150

avg / total       0.78      0.78      0.78       262

Accuracy 0.7824427480916031
F1-score 0.8143322475570033
