In [8]:
import math
import os
import pandas as pd
import numpy as np
import scipy.sparse
import scipy.io
import nltk.data
import nltk.tokenize
import nltk.stem
from nltk.corpus import stopwords
from collections import Counter
import numpy as np

def extract_words(text, stemmer = None, remove_stopwords = False):
    """
    Strategy used:
    1. Tokenize
    2. Stemming
    3. Stop word removal
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    if stemmer is None:
        words = [token.lower() for token in tokens]
    else:
        words = [stemmer.stem(word.lower()) for word in tokens]
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]

    return words

def build_vocabulary(documents):
    """
    Creating a list of the total unique worlds found in the corpus.
    """
    vocabulary = set()
    for doc in documents:
        vocabulary.update([word for word in doc])
    vocabulary = list(vocabulary)
    return vocabulary

def get_idf_matrix(vocabulary, documents):
    """
        The inverse document frequency acts as a weight for 
        processing the TF matrix. 
        
        IDF(w) = log((1 + n_d)/ (1 + df(d, t))) + 1
        
    """
    n_of_doc = len(documents)
    frequency_dic = {} # a dictionary to keep track of df(d, t)
    
    for word in vocabulary:
        for d in documents:
            if word in documents:
                if word in frequency_dic:
                    frequency_dic[word] += 1
                else:
                    frequency_dic[word] = 1
                    
    idf = []
    
    for word in vocabulary:
        if word in frequency_dic: 
            freq = frequency_dic[word]
        else:
            freq = 0
            
        idf.append(math.log((float)(1 + n_of_doc) / ((float)(1+ freq)) + 1, 2))
        
    return scipy.sparse.diags(np.squeeze(np.asarray(idf)))

def get_tf_vectors(vocabulary, documents):    
    
    # starting pointer for row
    row_ptr = 0
    # data[n] is stored in the matrix[row[n]col[n]]
    row = []
    col = []
    data = []

    for d in documents:   
        col_index = 0
        
        for word in vocabulary:    
            if word in d:
                term_freq = d.count(word)
                row.append(row_ptr)
                col.append(col_index)
                data.append(d.count(word))
            col_index += 1
            
        row_ptr += 1

    return scipy.sparse.csr_matrix((data, (row, col)), shape=(row_ptr, len(vocabulary)))    

def get_log_tf_vectors(vocabulary, documents):    
    
    # starting pointer for row
    row_ptr = 0
    # data[n] is stored in the matrix[row[n]col[n]]
    row = []
    col = []
    data = []

    for d in documents:   
        col_index = 0
        
        for word in vocabulary:    
            if word in d:
                term_freq = d.count(word)
                row.append(row_ptr)
                col.append(col_index)
                data.append( 1 + math.log(d.count(word), 2))
            col_index += 1
            
        row_ptr += 1

    return scipy.sparse.csr_matrix((data, (row, col)), shape=(row_ptr, len(vocabulary)))    

"""
    l2_norm implementation taken from
    London Machine Learning Study Group: http://www.meetup.com
    /London-Machine-Learning-Study-Group/members/
"""
def l2_normalized_matrix(matrix):
    """
    Normalises a sparse matrix by scaling its rows individually to L2 unit norm

    The new row values are computed as
    
        ||x|| = sqrt(sum(x^2))
        
    For efficiency, the resulting new matrix is formed by computing
    
    normalized_matrix = 
        transpose(transpose transpose(matrix) * l2_norm)
        
    where matrix is the original sparse matrix and l2_norm is diagonal 
    matrix of the reciprocals of sqrt(sum(x^2))
    
    Parameters
    ----------
    matrix     : a sparse matrix to be normalized
    
    Returns
    -------
    An L2 normalised sparse matrix based on the input matrix
    
    """     
    # Compute the L2 norms
    l2_norm = np.sqrt(matrix.power(2).sum(axis=1))
    
    # Get the reciprocals
    with np.errstate(divide="ignore", invalid="ignore"):
        l2_norm = np.reciprocal(l2_norm)
        # Treat infinity and NaN as 0
        l2_norm[~np.isfinite(l2_norm)] = 0  # -inf inf NaN   
    
    # Form a diagonal matrix of the reciprocals
    l2_norm = scipy.sparse.diags(np.squeeze(np.asarray(l2_norm)))           
        
    # Compute the normalised matrix
    normalized_matrix = (matrix.T * l2_norm).T
    
    return normalized_matrix
       
def mtx_save(file_name, matrix):
    scipy.io.mmwrite(file_name, matrix)

def encode_labels(labelsDF):
    labelsDF = pd.Categorical(labelsDF)
    catLabelsDF = labelsDF.codes

    return catLabelsDF

def labels_save(file_name, labels):
    labels.tofile(file_name, sep='\n')

# Read a data set
dataDF = pd.read_csv("../data/fake_or_real_news.csv", 
                     sep=',',
                     nrows=100,
                     lineterminator='\n', names = ["title", "text", "label"])

print("Init & apply stemmer")
snowball = nltk.stem.snowball.EnglishStemmer()
dataDF["Words"] = dataDF.apply(lambda row: extract_words(row['text'], snowball), axis=1)

print("building vocabulary")
vocabulary = build_vocabulary(dataDF["Words"])

tf_matrix = get_tf_vectors(vocabulary, dataDF["Words"])
idf_matrix = get_idf_matrix(vocabulary, dataDF["Words"])
print("building tf-idf matrix")

tf_idf_matrix = tf_matrix * idf_matrix
tf_idf_matrix = l2_normalized_matrix(tf_idf_matrix)

labels = encode_labels(dataDF["label"])

print("Saving the TFxIDF matrix and the corresponding values")
mtx_save("training.mtx", tf_idf_matrix)
labels_save("labels.csv", labels)
print("finished!")

Init & apply stemmer
building vocabulary
building tf-idf matrix
Saving the TFxIDF matrix and the corresponding values
finished!


In [9]:
"""
    The seperation of feature extraction file from 
    modelling file is for better modularity of the 
    project. So that we could plug in different 
    model for our training data.
    
"""

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

import numpy as np
import scipy
import timeit
from scipy import io

np.random.seed(1234)

labels = np.fromfile("./labels.csv", sep='\n')
tf_idf_matrix = io.mmread("./training.mtx")
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, 
                                                    labels, 
                                                    test_size=0.15, 
                                                    random_state=1234)
start_time = timeit.default_timer()

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

"""
    This file fits the TF-IDF matrix data that we
    processsed to the default Multinomial class
    
    Test results:
    
    Env: tf_idf_matrix trained using the first 1000 row
         test_size: 0.10
         random_state: 1234
    with L2 normalization:
        Using naive term frequency count: 0.69 accuracy
        Using log term frequency count: 0.46 accuracy ... hmm.
        
    without L2:
        Using naive term: 0.80
        Using log term: 0.45
"""

print("Elapsed time: %f sec" % (timeit.default_timer() - start_time))
print(accuracy_score(y_test, y_pred))

Elapsed time: 0.001578 sec
0.6
