In [1]:
import nltk
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "./inputdata/"

## Preprocess Data

#### File information

In [3]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['100554newsML.txt', '100593newsML.txt', '100618newsML.txt', '130040newsML.txt', '137871newsML.txt'] 
 ['./inputdata/100554newsML.txt', './inputdata/100593newsML.txt', './inputdata/100618newsML.txt', './inputdata/130040newsML.txt', './inputdata/137871newsML.txt']


In [4]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'./inputdata/100554newsML.txt': 'Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.\nThe long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt and unpaid interest throws the company a lifeline which could secure what is still likely to be a difficult future.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of bankruptcy but leaves current shareholders, who have already seen their investment dwindle, owning only 54.5 percent of the company.\n"We have fixed and capped the interest payments and arranged only to pay what is available in cash," Eurotunnel co-chairman Alastair Morton told reporters at a news conference. "Avoiding having to do this again is the name of the game."\nMorton said the plan provides the Anglo-French company with the medium term fin

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [5]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [6]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [7]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [8]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [9]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [10]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.
The long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt a


In [11]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['Channel', 'tunnel', 'operator', 'Eurotunnel', 'on', 'Monday', 'announced', 'details', 'of', 'a', 'deal', 'giving', 'bank', 'creditors', '45.5', 'percent', 'of', 'the', 'company', 'in', 'return', 'for', 'wiping', 'out', '1.0', 'billion', 'pounds', '(', '$', '1.6']


In [12]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['Channel', 'tunnel', 'operator', 'Eurotunnel', 'Monday', 'announced', 'details', 'deal', 'giving', 'bank', 'creditors', '45.5', 'percent', 'company', 'return', 'wiping', '1.0', 'billion', 'pounds', '(', '$', '1.6', 'billion', ')', 'massive', 'debts', '.', 'The', 'long-awaited', 'highly']


In [13]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

['channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'detail', 'deal', 'give', 'bank', 'creditor', '45.5', 'percent', 'compani', 'return', 'wipe', '1.0', 'billion', 'pound', '(', '$', '1.6', 'billion', ')', 'massiv', 'debt', '.', 'the', 'long-await', 'highli']


In [14]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

['channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'detail', 'deal', 'give', 'bank', 'creditor', '45.5', 'percent', 'compani', 'return', 'wipe', '1.0', 'billion', 'pound', '1.6', 'billion', 'massiv', 'debt', 'the', 'long-await', 'highli', 'complex', 'restructur', 'nearli', 'nearli']


In [15]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['channel', 'tunnel', 'oper', 'eurotunnel', 'monday', 'announc', 'detail', 'deal', 'give', 'bank', 'creditor', '45.5', 'percent', 'compani', 'return', 'wipe', '1.0', 'billion', 'pound', '1.6', 'billion', 'massiv', 'debt', 'the', 'long-await', 'highli', 'complex', 'restructur', 'nearli', 'nearli']


### Wrap into a function to be used by NLTK

In [16]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [17]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    print('                ', end="")   #bank space for formatting output
    for n in range(len(fileNames)):
        print('{0:18}'.format(fileNames[n]), end="")    #file names
    print()
    for i in range(len(term)):
        print('{0:8}'.format(term[i]), end='\t|  ')     #the term
        for j in range(numValues):
            print('{0:.12f}'.format(values[i][j]), end='   ') #the value, corresponding to the file name, for the term
        print()

In [18]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = "../results/tfid.txt"
    outFile = open(filePath, 'a')
    title = "TFIDF\n"
    outFile.write(title)
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    outFile.write('               \t')   #bank space for formatting output
    for n in range(len(fileNames)):
        outFile.write('{0:18}'.format(fileNames[n]))    #file names
    outFile.write("\n")
    for i in range(len(term)):
        outFile.write('{0:15}'.format(term[i]))     #the term
        outFile.write('\t|  ')
        for j in range(numValues):
            outFile.write('{0:.12f}'.format(values[i][j])) #the value, corresponding to the file name, for the term
            outFile.write('   ')
        outFile.write("\n")

    outFile.close()

In [19]:
# TODO: modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    #print(cosine_similarity(tfs[0], tfs[1]))
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    numFiles = len(fileNames)
    names = []
    print('                   ', end="")    #formatting
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                print(fileNames[k], end='   ')
            print()

        print(fileNames[i], end='   ')
        for n in range(numFiles):
            #print(fileNames[n], end='\t')
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            #print(numValue, end='\t')
            names.append(fileNames[n])
            print(" {0:.8f}".format(numValue), end='         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        print()
    print("\n\n=============================================================================================\n")

In [20]:
def calc_and_write_CosineSimilarity_for_all(tfs, fileNames):
    filePath = "../results/cosine_similarity.txt"
    outFile = open(filePath, 'a')
    title = "COSINE SIMILARITY\n"
    outFile.write(title)
    numFiles = len(fileNames)
    names = []
    outFile.write('                   ')
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                outFile.write(fileNames[k])
                outFile.write('   ')
            outFile.write("\n")
        outFile.write(fileNames[i])
        outFile.write('   ')

        for n in range(numFiles):
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            names.append(fileNames[n])
            outFile.write('{0:.8f}'.format(numValue))
            outFile.write('         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        outFile.write("\n")

    outFile.close()

## Wrap Everything into `Main()`

In [21]:
def main(printResults=True):
    baseFolderPath = "./inputdata/"

    fileNames, filePathList = returnListOfFilePaths(baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # calculate tfidf
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names()
    
    if printResults:
        # print results
        print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        # write results to file
        write_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)   
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames)

In [22]:
main()

                100554newsML.txt  100593newsML.txt  100618newsML.txt  130040newsML.txt  137871newsML.txt  
'm      	|  0.095165597630   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
's      	|  0.090693834344   0.139427353384   0.145592889457   0.089203368600   0.093595320091   
1.0     	|  0.031866754923   0.039192056059   0.040925145224   0.000000000000   0.000000000000   
1.56    	|  0.000000000000   0.094428412110   0.049302038059   0.000000000000   0.000000000000   
1.6     	|  0.047582798815   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
1.85    	|  0.047582798815   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
10      	|  0.031866754923   0.039192056059   0.040925145224   0.000000000000   0.000000000000   
10.40   	|  0.031866754923   0.039192056059   0.040925145224   0.000000000000   0.000000000000   
100     	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
113.5   	| 

david   	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.039284057781   
deal    	|  0.187651231972   0.098908779308   0.103282567033   0.026366766409   0.000000000000   
debt    	|  0.191200529536   0.156768224238   0.163700580897   0.000000000000   0.000000000000   
debt-for-equ	|  0.031866754923   0.039192056059   0.040925145224   0.000000000000   0.000000000000   
decemb  	|  0.031866754923   0.039192056059   0.040925145224   0.000000000000   0.000000000000   
decid   	|  0.000000000000   0.000000000000   0.000000000000   0.037758608010   0.031694130465   
decis   	|  0.000000000000   0.000000000000   0.000000000000   0.037758608010   0.031694130465   
depreci 	|  0.047582798815   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
describ 	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.039284057781   
despit  	|  0.047582798815   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
develop 	|  0.03

run     	|  0.038389502478   0.000000000000   0.049302038059   0.000000000000   0.000000000000   
safekeep	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
said    	|  0.204061127273   0.223083765415   0.232948623132   0.156105895050   0.168471576164   
sale    	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
salomon 	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
say     	|  0.000000000000   0.000000000000   0.000000000000   0.093601642767   0.000000000000   
scale   	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
scenario	|  0.047582798815   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
scotland	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
scratch 	|  0.000000000000   0.000000000000   0.000000000000   0.046800821384   0.000000000000   
search  	|  0.000000