In [1]:
# Python 3.4.3
import re
from collections import Counter

from numpy import zeros, dot, savetxt
from numpy.linalg import norm
 
def cosine_distance(u, v):
    """Return the cosine distance between the vectors ``u`` and ``v``.

    The value is ``1 - (u . v) / (||u|| * ||v||)``, i.e. the formula given in
    the description of ``scipy.spatial.distance.cosine``.
    """
    norm_product = norm(u) * norm(v)
    cosine_similarity = dot(u, v) / norm_product
    return 1.0 - cosine_similarity
 
# Any run of characters other than the lowercase letters a-z separates words.
# Compiled once here rather than once per line of input.
WORD_SEPARATOR = re.compile(r"[^a-z]+")


def tokenize(line):
    """Split a line of text into lowercase words made of the letters a-z.

    ``re.split`` yields an empty string whenever the line starts or ends with
    a separator (for example the trailing newline). All empty strings are
    dropped, so the result does not depend on whether the line ends with a
    newline, punctuation or a letter.
    """
    return [token for token in WORD_SEPARATOR.split(line.lower()) if token]


def build_count_matrix(sentences):
    """Count word occurrences per sentence.

    Parameters:
        sentences: an iterable of strings, one sentence per item.

    Returns:
        A ``(matrix, vocabulary)`` pair. ``vocabulary`` maps every distinct
        word to its column index, numbered in order of first appearance.
        ``matrix[i, j]`` is how many times the word with column index ``j``
        occurs in sentence ``i``.
    """
    vocabulary = {}
    counts_per_sentence = []
    for sentence in sentences:
        tokens = tokenize(sentence)
        for token in tokens:
            # Only the first sighting of a word assigns it a column.
            vocabulary.setdefault(token, len(vocabulary))
        counts_per_sentence.append(Counter(tokens))

    matrix = zeros((len(counts_per_sentence), len(vocabulary)))
    for row, counts in enumerate(counts_per_sentence):
        for word, count in counts.items():
            matrix[row, vocabulary[word]] = count
    return matrix, vocabulary


if __name__ == "__main__":
    with open("sentences.txt") as f:
        sentences = f.readlines()
    lines = len(sentences)

    # 'arr' has one row per line of the file and one column per unique word;
    # 'words' maps each unique word to its column index.
    arr, words = build_count_matrix(sentences)

    #savetxt("foo.csv", arr, delimiter=",")

    if lines == 0 or not arr[0].any():
        raise SystemExit("The first sentence contains no words, "
                         "so there is nothing to compare against.")

    # Finally we calculate a cosine distance between the first sentence (line)
    # and the other sentences in the text file.
    dist = []
    u = arr[0]
    for i in range(1, lines):
        v = arr[i]
        if not v.any():
            # A line without words is an all-zero vector; its cosine distance
            # would be 0/0, so it cannot be ranked.
            continue
        dist.append({"index": i, "distance": cosine_distance(u, v)})

    if len(dist) < 2:
        raise SystemExit("At least two sentences besides the first one "
                         "are needed to report the two closest.")

    dist.sort(key=lambda x: x["distance"])
    print("The 1st closest sentence is a sentence #%d with a cosine distance of %.2f.\n"
          "The 2nd closest sentence is a sentence #%d with a cosine distance of %.2f." % (
              dist[0]["index"],
              dist[0]["distance"],
              dist[1]["index"],
              dist[1]["distance"]
          ))

The 1st closest sentence is a sentence #6 with a cosine distance of 0.73.
The 2nd closest sentence is a sentence #4 with a cosine distance of 0.78.
