In [24]:
import numpy as np
import pandas as pd
import string, xlrd, nltk, unicodedata, pickle, os, re, time
from nltk import word_tokenize
from collections import defaultdict, Counter
from difflib import SequenceMatcher as textSimilarity # for measuring similarity of text
import pickle

from bs4 import BeautifulSoup # for parsing XML

# Wall-clock timestamp (seconds since the epoch) captured when this cell runs.
start = time.time()

def nykredit_xls_to_dataframe():
    """
    Return the Nykredit article table as a pandas DataFrame, using a pickle cache.

    The function first tries to unpickle a previously built frame from the file
    "NK_data" in the working directory.  If that fails for any reason (file
    missing, unreadable, incompatible pickle, ...) the sheet 'tmp62.tmp' of
    "Nykredit_feed.xls" is parsed, reduced to the selected columns, extended
    with a 'Text' column produced by extract_text_from_xml, and the result is
    written back to "NK_data" for the next run.

    Returns:
        pandas.DataFrame. When rebuilt from the spreadsheet its columns are
        'ArticleKey', 'ArticleDate', 'QualitativeScore', 'Headline', 'Source',
        'Raw Xml' and 'Text'.

    Notes:
        * The cache is never invalidated here: delete "NK_data" to force a
          rebuild after the spreadsheet or the extraction code changes.
        * pickle.load can execute arbitrary code; only use a cache file that
          this notebook wrote itself.
    """
    try:
        # `with` guarantees the handle is closed even if unpickling fails.
        with open("NK_data", "rb") as cache_file:
            NK_data = pickle.load(cache_file)
    except Exception:
        # Any load failure falls back to a rebuild (best effort), but unlike a
        # bare `except:` this does not swallow KeyboardInterrupt/SystemExit.
        with pd.ExcelFile("Nykredit_feed.xls") as xls_file:
            nykredit_data = xls_file.parse('tmp62.tmp')

        # choose which columns to use from the spreadsheet:
        selected_parameters = ['ArticleKey', 'ArticleDate', 'QualitativeScore', 'Headline', 'Kilde', 'Raw Xml']
        NK_data = nykredit_data.reindex(columns=selected_parameters)

        # add a column containing the texts extracted from the xml:
        NK_data['Text'] = NK_data.apply(extract_text_from_xml, axis=1)

        # change column name 'Kilde'=>'Source'; rename keeps a flat column
        # index, whereas assigning a nested list can create a MultiIndex.
        NK_data = NK_data.rename(columns={'Kilde': 'Source'})

        with open("NK_data", "wb") as cache_file:
            pickle.dump(NK_data, cache_file)
    else:
        print ("NK_data loaded")

    return NK_data

def extract_text_from_xml(row):
    """
    Pull the article text out of the XML stored in row['Raw Xml'].

    The XML is parsed with BeautifulSoup using the 'lxml' parser and the
    .string of every 'p' element is collected.  Elements whose .string is
    None or empty are skipped.  The collected pieces are joined with single
    spaces and returned as one string.
    """
    paragraphs = BeautifulSoup(row['Raw Xml'], 'lxml').findAll('p')
    # Filter on truthiness so that None / empty strings never reach join().
    return " ".join(paragraph.string for paragraph in paragraphs if paragraph.string)

def tokenize_text(row):
    """
    Tokenize the article text of a single row.

    row.Text is passed to nltk's word_tokenize and the resulting tokens are
    returned unchanged.
    """
    article_text = row.Text
    tokens = word_tokenize(article_text)
    return tokens

# Load the article table: taken from the "NK_data" pickle cache when it can be
# read, otherwise rebuilt from the spreadsheet.
NK_data = nykredit_xls_to_dataframe()

NK_data loaded


In [25]:
def duplicate_indexes(dataFrame):
    """
    Return the row labels of duplicate and near-duplicate texts in dataFrame.

    A previously computed result is loaded from the pickle file "duplicates"
    in the working directory when possible.  Otherwise every row i is compared
    with every later row j; row j is reported as a duplicate when
    SequenceMatcher.quick_ratio() of the two 'Text' values exceeds 0.95.  Rows
    already reported are neither used as reference nor compared again.  The
    result is then written to "duplicates".

    Args:
        dataFrame: pandas.DataFrame with a 'Text' column.  Rows are addressed
            by the labels 0 .. len(dataFrame)-1, i.e. a default integer index
            is expected.

    Returns:
        list of int: labels of the rows considered duplicates of an earlier
        row, in the order they were found.

    Notes:
        * The cache is keyed on nothing: it is returned even if dataFrame
          differs from the frame it was computed for.  Delete "duplicates" to
          force a recomputation.
        * pickle.load can execute arbitrary code; only use a cache file that
          this notebook wrote itself.
    """
    try:
        with open("duplicates", "rb") as cache_file:
            duplicates_list = pickle.load(cache_file)
    except Exception:
        # Any load failure triggers a recomputation (best effort), without
        # swallowing KeyboardInterrupt/SystemExit as a bare `except:` would.
        row_count = dataFrame.shape[0]
        # Fetch every text once by label instead of building a row object on
        # each of the O(n^2) comparisons.
        texts = [dataFrame.loc[label, 'Text'] for label in range(row_count)]

        duplicates_list = []
        flagged = set()  # same content as duplicates_list, for O(1) lookups
        for i in range(row_count - 1):
            if i in flagged:  # text[i] is a duplicate => no need to check
                continue
            for j in range(i + 1, row_count):
                if j in flagged:  # text[j] is a duplicate => no need to check
                    continue
                text_similarity = textSimilarity(None, texts[i], texts[j]).quick_ratio()
                if text_similarity > 0.95:
                    duplicates_list.append(j)
                    flagged.add(j)
                    print((i, j))
                # j advances exactly once per iteration, so the row following
                # a detected duplicate is still examined.

        with open("duplicates", "wb") as cache_file:
            pickle.dump(duplicates_list, cache_file)
    else:
        print ("duplicates loaded")
    return duplicates_list

# Labels of rows whose text (nearly) repeats an earlier row, then an
# independent copy of the table without those rows.
duplicates = duplicate_indexes(NK_data)
NK_unique_data = NK_data.drop(duplicates).copy()

duplicates loaded


In [26]:
# Number of de-duplicated articles per QualitativeScore value.
NK_unique_data.QualitativeScore.value_counts()

 1    714
 0    642
-1     37
Name: QualitativeScore, dtype: int64

In [27]:
# Select the articles whose QualitativeScore is -1 and build a 0..n-1
# position array of the same length.  `.loc` with a boolean mask replaces the
# `.ix` indexer, which no longer exists in current pandas.
# NOTE(review): the names say "neutral" but the filter selects score -1 —
# confirm the score encoding.
NK_neutral = NK_unique_data.loc[NK_unique_data.QualitativeScore == -1]
neutral_index = np.arange(NK_neutral.shape[0])
#NK_neutral

In [29]:
def createFreqDist(data):
    """
    Build a token frequency distribution over all texts in data.

    Args:
        data: table with a 'Text' column of strings (e.g. a pandas DataFrame).

    Returns:
        nltk.FreqDist counting the tokens that word_tokenize produces for the
        concatenation of all texts, each text followed by a single space.
    """
    # Read from the `data` argument rather than a notebook-level global, and
    # join once instead of growing a string with `+=` inside a loop.
    allTexts = "".join(text + " " for text in data['Text'])
    return nltk.FreqDist(word_tokenize(allTexts))
# Token frequency distribution for the de-duplicated articles; the cell then
# displays the number of entries in it.
frequencyDistribution = createFreqDist(NK_unique_data)
len(frequencyDistribution)

39920

In [50]:
def trimFreqDist(freqDist, loCut, hiCut):
    """
    Return a trimmed copy of freqDist; freqDist itself is left untouched.

    Trimming happens in two steps:
      1. every token with frequency 1 is removed;
      2. the remaining tokens are ranked from most to least frequent and only
         the ranks in [int(B*loCut), int(B*hiCut)) are kept, where B is the
         number of tokens left after step 1.  loCut therefore trims the most
         frequent end of the ranking and hiCut the least frequent end.

    Args:
        freqDist: frequency distribution providing B(), copy(), most_common()
            and pop() (e.g. nltk.FreqDist).
        loCut: fraction of the ranking to drop from the most frequent end.
        hiCut: fraction of the ranking at which the least frequent tail starts.

    Returns:
        A new distribution of the same type containing only the kept tokens.

    Side effects:
        Prints the size of the input and the cut values, then the size of the
        result.
    """
    print("frequencyDistribution.B() =", freqDist.B(), ", loCut =", loCut, ", hiCut =", hiCut)
    # Work on a copy of the *argument* (not of a notebook-level global).
    fd = freqDist.copy()

    # Step 1: drop tokens that occur only once.
    for (token, freq) in fd.most_common(fd.B()):
        if freq == 1:
            fd.pop(token)

    # Step 2: rank what is left (most common first) and cut both ends.
    ranked = fd.most_common(fd.B())
    loCutIndex = int(fd.B() * loCut)  # B() is the number of tokens in fd
    hiCutIndex = int(fd.B() * hiCut)
    for (token, freq) in ranked[:loCutIndex] + ranked[hiCutIndex:]:
        # The default keeps overlapping cuts (loCut > hiCut) from raising
        # KeyError when a token falls into both slices.
        fd.pop(token, None)

    print(fd.B())
    return fd
# Trimmed distribution built with cut fractions loCut=0.1 and hiCut=0.9.
trimmedFreqDist = trimFreqDist(frequencyDistribution, 0.1, 0.9)

frequencyDistribution.B() = 39920 , loCut = 0.1 , hiCut = 0.9
15618
