In [11]:
# standard library: pickle for caching intermediate results
import pickle

# difflib for measuring similarity of text
from difflib import SequenceMatcher as textSimilarity

# numpy and pandas for data handling
import numpy as np
import pandas as pd

# BeautifulSoup for parsing XML
from bs4 import BeautifulSoup

# nltk for various NLP tasks (Natural Language Toolkit)
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# sklearn objects for feature extraction, classification and cross validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB as bnb

In [12]:
def nykredit_xls_to_dataframe():
    """
    Returns selected columns of Nykredit_feed.xls as a Pandas dataframe.

    The result is cached in the pickle file "NK_data" in the working directory.
    If that file can be loaded it is returned as is; otherwise the spreadsheet
    is parsed, the article text is extracted from the 'Raw Xml' column into a
    new 'Text' column, the column 'Kilde' is renamed to 'Source', and the
    resulting dataframe is written to "NK_data" before being returned.

    Returned columns: 'ArticleKey', 'ArticleDate', 'QualitativeScore',
    'Headline', 'Source', 'Raw Xml', 'Text'.

    NOTE: pickle.load executes arbitrary code from the file it reads; only use
    an "NK_data" file that this notebook created itself.
    """
    try:
        with open("NK_data", "rb") as cache_file:
            NK_data = pickle.load(cache_file)
        print ("NK_data loaded")
    except Exception:
        # Any failure to read the cache (missing or unreadable file) is treated
        # as a cache miss. Unlike a bare `except:`, this lets KeyboardInterrupt
        # and SystemExit propagate.

        # loads the spreadsheet as a pandas.DataFrame
        xls_file = pd.ExcelFile("Nykredit_feed.xls")
        nykredit_data = xls_file.parse('tmp62.tmp')

        # choose which columns to use from the spreadsheet:
        selected_parameters = ['ArticleKey', 'ArticleDate', 'QualitativeScore',
                               'Headline', 'Kilde', 'Raw Xml']
        NK_data = nykredit_data.reindex(columns=selected_parameters)

        # add a column containing the texts extracted from the xml:
        NK_data['Text'] = NK_data.apply(extract_text_from_xml, axis=1)

        # change column name 'Kilde'=>'Source'
        # (rename keeps a flat column index; assigning a nested list to
        # .columns can produce a MultiIndex instead)
        NK_data = NK_data.rename(columns={'Kilde': 'Source'})

        with open("NK_data", "wb") as cache_file:
            pickle.dump(NK_data, cache_file)
        print ('NK_data constructed and saved to file')

    return NK_data

def extract_text_from_xml(row):
    """
    Returns the paragraph texts contained in row['Raw Xml'] as one string.

    The raw XML is parsed with BeautifulSoup using the 'lxml' parser. Every
    <p> element whose `.string` is truthy contributes that string; <p>
    elements whose `.string` is None or empty are skipped. The collected
    strings are joined with single spaces.
    """
    paragraphs = BeautifulSoup(row['Raw Xml'], 'lxml').find_all('p')
    return " ".join(p.string for p in paragraphs if p.string)

# Load the article dataframe from the "NK_data" pickle cache, or build it from the spreadsheet.
NK_data = nykredit_xls_to_dataframe()

NK_data constructed and saved to file


In [13]:
def duplicate_indexes(dataFrame):
    """
    Returns the index labels of duplicate and semi duplicate texts in dataFrame.

    Every text in dataFrame.Text is compared with the texts that follow it. A
    later text whose similarity to the earlier one exceeds 0.95 is reported as
    a duplicate of it. The first occurrence is kept, so the returned labels can
    be passed straight to dataFrame.drop().

    difflib.SequenceMatcher (textSimilarity) is a rather calculation heavy
    function. Hence a text that has already been deemed a duplicate is never
    compared to another text again, and one matcher is reused per outer text so
    the information cached for its second sequence is computed only once.

    The similarity measure is SequenceMatcher.quick_ratio(), which is an upper
    bound on ratio() and only looks at how many characters the texts share.

    The result is cached in the pickle file "duplicates" in the working
    directory. NOTE: the cache is not tied to the dataframe passed in; delete
    the file when the input data changes. pickle.load executes arbitrary code,
    so only use a cache file this notebook created itself.
    """
    try:
        with open("duplicates", "rb") as cache_file:
            duplicates_list = pickle.load(cache_file)
        print ("duplicates loaded")
    except Exception:
        # Cache miss (missing or unreadable file): compute from scratch.
        texts = dataFrame.Text.tolist()
        labels = dataFrame.index.tolist()
        n_texts = len(texts)

        duplicates_list = []          # labels, in order of discovery
        duplicate_positions = set()   # positions already flagged, O(1) lookup
        matcher = textSimilarity(None)

        for i in range(n_texts - 1):
            if i in duplicate_positions:  # text[i] is a duplicate => no need to check
                continue
            # quick_ratio() is symmetric in its two sequences, so the fixed
            # text goes in the cached second slot.
            matcher.set_seq2(texts[i])
            for j in range(i + 1, n_texts):
                if j in duplicate_positions:  # text[j] is a duplicate => no need to check
                    continue
                matcher.set_seq1(texts[j])
                if matcher.quick_ratio() > 0.95:
                    duplicate_positions.add(j)
                    duplicates_list.append(labels[j])
                    print((labels[i], labels[j]))

        with open("duplicates", "wb") as cache_file:
            pickle.dump(duplicates_list, cache_file)
    return duplicates_list

# Find the (semi) duplicate articles and keep an independent copy of the rest.
duplicates = duplicate_indexes(NK_data)
NK_unique_data = NK_data.drop(duplicates).copy()

duplicates loaded


In [15]:
# Creating feature vectors using sklearn

def texts_to_feature_vectors(dataframe):
    """
    Based on the dataframe given in input, texts_to_feature_vectors builds and returns
    an array of feature vectors representing the texts in the dataframe and an array of
    the appertaining labels.

    Rows whose QualitativeScore equals -1 are left out. The remaining texts are
    turned into token counts by a CountVectorizer limited to 5000 features,
    with the Danish nltk stopword list removed.

    Returns a tuple (feature_vectors, target): feature_vectors is the dense
    count matrix with one row per kept text, target the array of the
    corresponding QualitativeScore values. Both shapes are printed.
    """

    NK_positive_negative = dataframe[dataframe.QualitativeScore != -1].copy()
    target = np.array(NK_positive_negative.QualitativeScore) # list of corresponding labels
    print("target length:", target.shape)

    texts = np.array(NK_positive_negative.Text)

    # Initializing the CountVectorizer object
    # vectorizer counts the number of times the tokens appear in the document
    # The stopword file id is the lowercase "danish"; "Danish" only resolves on
    # case-insensitive filesystems.
    vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=stopwords.words("danish"), max_features=5000)

    # Creating the feature vectors
    feature_vectors = vectorizer.fit_transform(texts).toarray()
    print ("fv length:", feature_vectors.shape, "\n")

    return feature_vectors, target

# Vectorize the de-duplicated articles: X holds the feature vectors, y the QualitativeScore labels.
X, y = texts_to_feature_vectors(NK_unique_data)

def crossvalidate_algorithms(algos, f_vectors, target):
    """
    Scores each classifier class in algos on (f_vectors, target) with a
    stratified, shuffled cross validation and returns the per-split scores.

    algos is an iterable of classifier classes; each is instantiated with no
    arguments. One StratifiedShuffleSplit (5 iterations, 10% test size,
    random_state=0) is built from target and shared by all classifiers.

    For every classifier the score array and its mean (as a percentage rounded
    to two decimals) are printed. The return value is a dict mapping the class
    name to the array returned by cross_val_score.

    NOTE: this uses the legacy sklearn.cross_validation API imported by this
    notebook (labels passed to the splitter, `n_iter` keyword).
    """

    # one splitter for all classifiers
    splitter = StratifiedShuffleSplit(target, n_iter=5, test_size=0.1, random_state=0)

    results = {}
    for classifier_cls in algos:
        fold_scores = cross_validation.cross_val_score(
            classifier_cls(), f_vectors, target, cv=splitter)
        name = classifier_cls.__name__
        results[name] = fold_scores
        print (name, ":", fold_scores, "\nMean accuracy:", round(100*fold_scores.mean(), 2), "\n")
    return results

# Cross-validate BernoulliNB and LinearSVC on the feature vectors; fold scores are kept per classifier name.
algorithm_scores = crossvalidate_algorithms([bnb, svm.LinearSVC], X, y)

target length: (1356,)
fv length: (1356, 5000) 

BernoulliNB : [ 0.80147059  0.84558824  0.83088235  0.79411765  0.83823529] 
Mean accuracy: 82.21 

LinearSVC : [ 0.78676471  0.83823529  0.84558824  0.75        0.82352941] 
Mean accuracy: 80.88 



In [84]:
def createFreqDist(dataframe):
    """
    Creates an nltk frequency distribution for the input dataframe.

    All values of dataframe.Text are concatenated (each followed by a single
    space), the result is tokenized with nltk's word_tokenize, and the tokens
    are counted in an nltk.FreqDist, which is returned.
    """

    # Build the text in one pass; repeated `+=` on a growing string is quadratic.
    allTexts = "".join(text + " " for text in dataframe.Text)
    return nltk.FreqDist(word_tokenize(allTexts))
# Frequency distribution over the tokens of all de-duplicated article texts.
frequencyDistribution = createFreqDist(NK_unique_data)

def trimFreqDist(freqDist, loCut, hiCut):
    """
    trimFreqDist removes the least and most common tokens of the frequency distribution
    according to the values defined by loCut and hiCut.

    The work is done on a copy; freqDist itself is not modified.

    Steps:
      1. tokens with a frequency of exactly 1 are removed;
      2. the remaining tokens are ordered from most to least common and the
         cut positions int(B*loCut) and int(B*hiCut) are computed, where B is
         the number of remaining distinct tokens;
      3. the tokens before the loCut position (the most common ones) and the
         tokens from the hiCut position onwards (the least common ones) are
         removed.

    loCut and hiCut are fractions of the ordered token list, e.g. 0.1 and 0.9.
    The size of the input and of the trimmed distribution are printed, and the
    trimmed copy is returned.
    """

    # Use the distribution passed in, not the notebook-level global.
    print("frequencyDistribution.B() =", freqDist.B(), ", loCut =", loCut, ", hiCut =", hiCut)
    fd = freqDist.copy()

    # most_common() returns a list, so deleting from fd while looping is safe
    for (token, freq) in fd.most_common(): # Remove tokens that only appear once
        if freq == 1:
            del fd[token]

    FreqDistList = fd.most_common() # Ordered list of (token, freq) after removing tokens of frequency 1
    loCutIndex = int(fd.B()*loCut) # The B() method returns the number of distinct tokens in the freqDist
    hiCutIndex = int(fd.B()*hiCut)

    # `del` on a Counter-based distribution ignores missing keys, so
    # overlapping cut ranges do not raise.
    for (token, freq) in FreqDistList[:loCutIndex] + FreqDistList[hiCutIndex:]:
        del fd[token]
    print(fd.B())
    return fd

# Drop tokens seen once, then cut the frequency-ordered token list at its 10% and 90% positions.
trimmedFreqDist = trimFreqDist(frequencyDistribution, 0.1, 0.9)

frequencyDistribution.B() = 39920 , loCut = 0.1 , hiCut = 0.9
15618
