In [2]:
import nltk
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "./inputdata/"

## Preprocess Data

#### File information

In [31]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [ fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName)) ]
    listOfFilePaths = [ join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName)) ]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['100554newsML.txt', '100593newsML.txt', '100618newsML.txt', '130040newsML.txt', '137871newsML.txt'] 
 ['./inputdata/100554newsML.txt', './inputdata/100593newsML.txt', './inputdata/100618newsML.txt', './inputdata/130040newsML.txt', './inputdata/137871newsML.txt']


In [35]:
# Get document contents
def create_docContentDict( filePaths ):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict( filePaths )
print(rawContentDict)

{'./inputdata/100554newsML.txt': 'Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.\nThe long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt and unpaid interest throws the company a lifeline which could secure what is still likely to be a difficult future.\nThe deal, announced simultaneously in Paris and London, brings the company back from the brink of bankruptcy but leaves current shareholders, who have already seen their investment dwindle, owning only 54.5 percent of the company.\n"We have fixed and capped the interest payments and arranged only to pay what is available in cash," Eurotunnel co-chairman Alastair Morton told reporters at a news conference. "Avoiding having to do this again is the name of the game."\nMorton said the plan provides the Anglo-French company with the medium term fin

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [49]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [50]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [ word for word in contentsTokenized if word not in stop_word_set ]
    return filteredContents

#### Stemming

In [51]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [ porterStemmer.stem(word) for word in contentsTokenized ]
    return filteredContents

#### Remove Punctuation

In [52]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [ word for word in contentsTokenized if word not in excludePuncuation ]
    return filteredContents

#### Convert terms to lowercase

In [53]:
def convertItemsToLower(contentsRaw):
    filteredContents = [ term.lower() for term in contentsRaw ]
    return filteredContents

### Test that functions are working as expected

In [47]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

Channel tunnel operator Eurotunnel on Monday announced details of a deal giving bank creditors 45.5 percent of the company in return for wiping out 1.0 billion pounds ($1.6 billion) of its massive debts.
The long-awaited but highly complex restructuring of nearly nearly nine billion pounds of debt a


In [55]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

NameError: name 'content_tokenized' is not defined

In [None]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

In [None]:
# Test stemming
content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
print(content_test_stemmed[:30])

In [None]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_stemmed)

# visually inspect
print(content_test_cleaned[:30])

In [None]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

### Wrap into a function to be used by NLTK

In [None]:
# process data without writing inspection file information to file
def processData_inspect(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Calcuate TFIDF

In [None]:
tfidf = TfidfVectorizer(tokenizer=processData_inspect, stop_words='english')
tfs = tfidf.fit_transform( rawContentDict.values() )

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [12]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all( term, values, fileNames ):
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    print('                ', end="")   #bank space for formatting output
    for n in range(len(fileNames)):
        print('{0:18}'.format(fileNames[n]), end="")    #file names
    print()
    for i in range(len(term)):
        print('{0:8}'.format(term[i]), end='\t|  ')     #the term
        for j in range(numValues):
            print( '{0:.12f}'.format(values[i][j]), end='   ' ) #the value, corresponding to the file name, for the term
        print()

In [13]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all( term, values, fileNames ):
    filePath = "../results/tfid.txt"
    outFile = open(filePath, 'a')
    title = "TFIDF\n"
    outFile.write(title)
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    outFile.write('               \t')   #bank space for formatting output
    for n in range(len(fileNames)):
        outFile.write('{0:18}'.format(fileNames[n]))    #file names
    outFile.write("\n")
    for i in range(len(term)):
        outFile.write('{0:15}'.format(term[i]))     #the term
        outFile.write('\t|  ')
        for j in range(numValues):
            outFile.write( '{0:.12f}'.format(values[i][j]) ) #the value, corresponding to the file name, for the term
            outFile.write('   ')
        outFile.write("\n")

    outFile.close()

In [14]:
# TODO: should modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all( tfs, fileNames ):
    #print(cosine_similarity(tfs[0], tfs[1]))
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    numFiles = len(fileNames)
    names = []
    print('                   ', end="")    #formatting
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                print(fileNames[k], end='   ')
            print()

        print(fileNames[i], end='   ')
        for n in range(numFiles):
            #print(fileNames[n], end='\t')
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            #print(numValue, end='\t')
            names.append(fileNames[n])
            print(" {0:.8f}".format(numValue), end='         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        print()
    print("\n\n=============================================================================================\n")

In [15]:
def calc_and_write_CosineSimilarity_for_all( tfs, fileNames ):
    filePath = "../results/cosine_similarity.txt"
    outFile = open(filePath, 'a')
    title = "COSINE SIMILARITY\n"
    outFile.write(title)
    numFiles = len(fileNames)
    names = []
    outFile.write('                   ')
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                outFile.write(fileNames[k])
                outFile.write('   ')
            outFile.write("\n")
        outFile.write(fileNames[i])
        outFile.write('   ')

        for n in range(numFiles):
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            names.append(fileNames[n])
            outFile.write('{0:.8f}'.format(numValue))
            outFile.write('         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        outFile.write("\n")

    outFile.close()

## Wrap Everything into `Main()`

In [19]:
def main(printResults=True):
    baseFolderPath = "./inputdata/"

    fileNames, filePathList = returnListOfFilePaths( baseFolderPath )

    rawContentDict = create_docContentDict( filePathList )

    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform( rawContentDict.values() )

    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names()
    
    if printResults:
        # print results
        print_TFIDF_for_all( tfs_Term, tfs_Values, fileNames )
        calc_and_print_CosineSimilarity_for_all( tfs, fileNames )
    else:
        # write results to file
        write_TFIDF_for_all( tfs_Term, tfs_Values, fileNames )   
        calc_and_write_CosineSimilarity_for_all( tfs, fileNames )

In [21]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
main()