# Fake news detection

In [73]:
import re
import glob
import numpy as np
import os
import json
import argparse
import time
import codecs
import string
import codecs
import random
import scipy.sparse as sp

from random import randrange
from scipy.sparse import csr_matrix, csc_matrix, hstack, coo_matrix
#from gensim.matutils import Scipy2Corpus, corpus2csc
#from gensim.models.logentropy_model import LogEntropyModel
from collections import defaultdict, Counter
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords as sw
from string import punctuation

## Feature extraction functions

In [45]:
#Extracts word-ngrams, when n=1 is equal to bag of words
def wordNgrams(text, n):
    ngrams = []
    text = [word for word in text.split() if word not in string.punctuation]
    ngrams = [' '.join(text[i:i+n])+'' for i in range(len(text)-n+1)]
    return ngrams

In [46]:
#Extracts character n-grams
def charNgrams(text, n):
    ngrams = []
    ngrams = [text[i:i+n]+'_cng' for i in range(len(text)-n+1)]
    return ngrams

In [47]:
#Extracts function words n-grams with a pre-loaded dictionary
def funcNgrams(text, n):
    stop_words = load_diccionario('Urdu-stop-words')
    patt=r'\b(' + ('|'.join(re.escape(key) for key in stop_words)).lstrip('|') + r')\b'
    pattern = re.compile(patt)
    text = re.sub(r"(\n+|\r+|(\r\n)+)", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"’", "'", text)
    text = re.sub(r"[" + punctuation + "]*", "", text)
    terms = pattern.findall(text)
    n_grams=[('_'.join(terms[i:i+n])) + "_fwn" for i in range(len(terms)-n+1)]

    return n_grams

In [48]:
def load_diccionario(ruta):
    terms = set()#Dictionary of slangs
    try:
        tmp = open(ruta, "r")     
        while True :
            linea = tmp.readline()                                                                                   
            #linea = to_unicode(linea) 
            if (not linea) or (linea == ""):                                                                               
                break;                                                                                                      
            linea = linea.rstrip()
            terms.add(linea.lower())
        return (terms)
    except IOError as e:
        print ("Error: "+ruta+" I/O error({0}): {1}".format(e.errno, e.strerror))
        exit(1)

In [78]:
def extract_features(text,cn,wn,fn):
    text = text.lower()
    #text=clean_text(text)
    features = []
    for n in wn:
        if n != 0:
            features.extend(wordNgrams(text,n))
    for n in cn:
        if n != 0:
            features.extend(charNgrams(text,n))
    for n in fn:
            if n != 0:
                features.extend(funcNgrams(text,n))
    return features

In [76]:
# Extracts all features in a set of 'texts' and return as a string separated with the simbol '&%$'
def process_texts(texts,cn,wn,fn):
    occurrences=defaultdict(int)
    featuresList=[]
    featuresDict=Counter()
    for (text) in texts:
        features=extract_features(text,cn,wn,fn)
        featuresDict.update(features)
        featuresList.append('&%$'.join(features))
    return featuresList, featuresDict

## Reading and preparing the corpus

In [40]:
#utility function for reading files
def read_txt_files(files):
    text=[]
    topic=[]
    for i,file_path in enumerate(files):
        with open(file_path,'r') as infile:
            text.append(infile.read())
            file_topic=''.join(re.findall('[A-Za-z]',file_path.split('/')[4].split('.')[0]))
            topic.append(file_topic)
    return text, topic

In [60]:
#reading the path of real and fake news for training
train_path_real='../corpus/train/real/'
train_path_fake='../corpus/train/fake/'

real_news, real_news_topics = read_txt_files(glob.glob(train_path_real+'*.txt'))
fake_news, fake_news_topics = read_txt_files(glob.glob(train_path_fake+'*.txt'))

#contatenating real and fake news in one variable for training
train_texts = np.concatenate((real_news, fake_news))
train_labels = np.concatenate((np.ones(len(real_news)), np.zeros(len(fake_news))))
train_topics = np.concatenate((real_news_topics, fake_news_topics))

In [61]:
print ('Train:')
print ('\t Real:',len(real_news))
print ('\t Fake:',len(fake_news))

Train:
	 Real: 493
	 Fake: 2


In [62]:
#reading the path of real and fake news for testing
test_path_real='../corpus/test/real'
test_path_fake='../corpus/test/fake'

real_news, real_news_topics = read_txt_files(glob.glob(test_path_real+'*.txt'))
fake_news, fake_news_topics = read_txt_files(glob.glob(test_path_fake+'*.txt'))

#contatenating real and fake news in one variable for testing
test_texts = np.concatenate((real_news, fake_news))
test_labels = np.concatenate((np.ones(len(real_news)), np.zeros(len(fake_news))))
test_topics = np.concatenate((real_news_topics, fake_news_topics))

In [63]:
print ('Test:')
print ('\t Real:',len(real_news))
print ('\t Fake:',len(fake_news))

Test:
	 Real: 0
	 Fake: 0


## Parametrization and feature extraction

In [79]:
# Parameters
cnvalues=[3] #character n-grams
wnvalues=[2,3] # word n-grams
fnvalues=[0] # function words n-grams

print('Extracting features')
train_features, dicOfFeatures = process_texts(train_texts,cnvalues,wnvalues,fnvalues)

vectorizer = CountVectorizer(lowercase=False, min_df=2, tokenizer=lambda x: x.split('&%$')) #--> we can change this
train_data = vectorizer.fit_transform(train_features)
train_data = train_data.astype(float)
print('\t', 'labels', len(train_labels))
print('\t', 'tweets', len(train_texts))
print('\t', 'vocabulary size',len(dicOfFeatures))
print('\t', 'class dictribution',Counter(train_labels))

Extracting features
	 labels 495
	 tweets 495
	 vocabulary size 304569
	 class dictribution Counter({1.0: 493, 0.0: 2})


## Frequency threshold

In [81]:
N=5
X=train_data
values=np.array(X.sum(axis=0)).ravel()
thresholdMask=(values >= N)*1
indices_zero = list(np.nonzero(thresholdMask == 0)[0])
all_cols = np.arange(X.shape[1])
cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, indices_zero)))[0]
train_data = X[:, cols_to_keep]
#####

print(train_data.shape)
    
scaled_train_data=train_data
    

(495, 17333)


## Weighting schemes

In [82]:
feature_weight='tf' # possible values: binary, logent, tfidf, norm, relat

if feature_weight == 'binary':
    scaled_train_data = preprocessing.Binarizer().fit_transform(scaled_train_data)

elif feature_weight == 'logent':
    Xc = Scipy2Corpus(scaled_train_data)
    log_ent = LogEntropyModel(Xc)
    X = log_ent[Xc]
    X = corpus2csc(X)
    scaled_train_data = sp.csc_matrix.transpose(X)

elif feature_weight == 'tfidf':
    transformer = TfidfTransformer()
    scaled_train_data = transformer.fit_transform(scaled_train_data)

elif feature_weight=='norm':
    #scaled_train_data = preprocessing.normalize(scaled_train_data, norm='l2')
    #Scaling data
    max_abs_scaler = preprocessing.MaxAbsScaler()
    scaled_train_data = max_abs_scaler.fit_transform(train_data)
elif feature_weight=='relat':
    s = scaled_train_data.sum(axis = 1)
    scaled_train_data = coo_matrix(np.nan_to_num(scaled_train_data/s))

else:
    print ("feature_weight = tf")

feature_weight = tf


In [84]:
#Utility function
originalclass=[]
predictedclass=[]
def classification_report_with_f1_score(y_true, y_pred):
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return f1_score(y_true, y_pred) # return accuracy score

## Classification Process - Training

In [None]:
print('Training Classifier')
    
# Applying classification algorithms
clf=LinearSVC(C=0.01,class_weight='balanced')
clfSVC=SVC(C=0.01, kernel='linear',class_weight='balanced')
clfMnb=MultinomialNB()
clfBnb=BernoulliNB()
clfLG=LogisticRegression(solver='lbfgs', tol=0.001, C=0.01,class_weight='balanced')

nested_score = cross_val_score(clf, X=scaled_train_data, y=train_labels, cv=10, scoring=make_scorer(classification_report_with_f1_score))
#cvScoreLinearSVC=cross_val_score(clf, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print(classification_report(originalclass, predictedclass))
print('10-Fold Cross-validation Linear SVC',nested_score.mean())

cvScoreLG=cross_val_score(clfLG, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Logistic Regression',cvScoreLG)

cvScoreMnb=cross_val_score(clfMnb, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)

cvScoreBnb=cross_val_score(clfBnb, scaled_train_data, train_labels, cv=10, scoring='f1').mean()
print('10-Fold Cross-validation Bernoulli Naive Bayes',cvScoreBnb)

Training Classifier




             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00         2
        1.0       1.00      1.00      1.00       493

avg / total       0.99      0.99      0.99       495

10-Fold Cross-validation Linear SVC 0.9959579463100949


