In [1]:
import math
    # math.acos(x) is the arccosine of x.
    # math.sqrt(x) is the square root of x.

import string
    # string.join(words,sep) takes a given list of words,
    #    and returns a single string resulting from concatenating them
    #    together, separated by the string sep .
    # string.lower(word) converts word to lower-case

##################################
# Operation 1: read a text file ##
##################################
def read_file(filename):
    """ 
    Read the text file with the given filename;
    return a list of the lines of text in the file.
    """
    try:
        fp = open(filename)
        L = fp.readlines()
    except IOError:
        print ("Error opening or reading input file: ",filename)
    return L

#################################################
# Operation 2: split the text lines into words ##
#################################################
def get_words_from_line_list(L):
    """
    Parse the given list L of text lines into words.
    Return list of all words found.
    """

    word_list = []
    for line in L:
        words_in_line = get_words_from_string(line)
        word_list = word_list + words_in_line
    return word_list

def get_words_from_string(line):
    """
    Return a list of the words in the given input string,
    converting each word to lower-case.

    Input:  line (a string)
    Output: a list of strings 
              (each string is a sequence of alphanumeric characters)
    """
    word_list = []          # accumulates words in line
    character_list = []     # accumulates characters in word
    emptysymb = ""
    for c in line:
        if c.isalnum():
            character_list.append(c)
        elif len(character_list)>0:
            word = emptysymb.join(character_list)
            word = word.lower()
            word_list.append(word)
            character_list = []
    if len(character_list)>0:
        word = emptysymb.join(character_list)
        word = word.lower()
        word_list.append(word)
    return word_list

##############################################
# Operation 3: count frequency of each word ##
##############################################
def count_frequency(word_list):
    """
    Return a list giving pairs of form: (word,frequency)
    """
    L = []
    for new_word in word_list:
        for entry in L:
            if new_word == entry[0]:
                entry[1] = entry[1] + 1
                break
        else:
            L.append([new_word,1])
    return L

###############################################################
# Operation 4: sort words into alphabetic order             ###
###############################################################
def insertion_sort(A):
    """
    Sort list A into order, in place.

    From Cormen/Leiserson/Rivest/Stein,
    Introduction to Algorithms (second edition), page 17,
    modified to adjust for fact that Python arrays use 
    0-indexing.
    """
    for j in range(len(A)):
        key = A[j]
        # insert A[j] into sorted sequence A[0..j-1]
        i = j-1
        while i>-1 and A[i]>key:
            A[i+1] = A[i]
            i = i-1
        A[i+1] = key
    return A
    
#############################################
## compute word frequencies for input file ##
#############################################
def word_frequencies_for_file(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs 
    for the given file.
    """

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
    insertion_sort(freq_mapping)

    print ("File",filename,":",)
    print (len(line_list),"lines,",)
    print (len(word_list),"words,",)
    print (len(freq_mapping),"distinct words")

    return freq_mapping

def inner_product(L1,L2):
    """
    Inner product between two vectors, where vectors
    are represented as alphabetically sorted (word,freq) pairs.

    Example: inner_product([["and",3],["of",2],["the",5]],
                           [["and",4],["in",1],["of",1],["this",2]]) = 14.0 
    """
    sum = 0.0
    i = 0
    j = 0
    while i<len(L1) and j<len(L2):
        # L1[i:] and L2[j:] yet to be processed
        if L1[i][0] == L2[j][0]:
            # both vectors have this word
            sum += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:
            # word L1[i][0] is in L1 but not L2
            i += 1
        else:
            # word L2[j][0] is in L2 but not L1
            j += 1
    return sum

def vector_angle(L1,L2):
    """
    The input is a list of (word,freq) pairs, sorted alphabetically.

    Return the angle between these two vectors.
    """
    numerator = inner_product(L1,L2)
    denominator = math.sqrt(inner_product(L1,L1)*inner_product(L2,L2))
    return math.acos(numerator/denominator)

def main1(filename_1,filename_2):
    sorted_word_list_1 = word_frequencies_for_file(filename_1)
    sorted_word_list_2 = word_frequencies_for_file(filename_2)
    distance = vector_angle(sorted_word_list_1,sorted_word_list_2)
    print ("The distance between the documents is: %0.6f (radians)"%distance)

In [2]:
file1 = "data/t1.verne.txt" ## taille : 51 ko
file2 = "data/t2.bobsey.txt" ## taille : 256 ko
file3 = "data/t3.lewis.txt" ## taille : 1 Mo
file4 = "data/t4.arabian.txt" ## taille : 3 Mo
file5 = "data/t5.churchill.txt" ## taille : 9 Mo
file6 = "data/t8.shakespeare.txt" ## taille : 5 Mo

In [3]:
main1(file3,file2)

File data/t3.lewis.txt :
15996 lines,
182355 words,
8530 distinct words
File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
The distance between the documents is: 0.574160 (radians)


In [4]:
def main2(filename_1,filename_2):
    import profile
    profile.run("main1('"+filename_1+"','"+filename_2+"')")

In [5]:
main2(file2,file3)

File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
         3308925 function calls in 7.922 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    7.922    3.961 3000793489.py:109(word_frequencies_for_file)
        2    0.000    0.000    0.000    0.000 3000793489.py:14(read_file)
        1    0.000    0.000    7.922    7.922 3000793489.py:163(main1)
        2    4.438    2.219    5.672    2.836 3000793489.py:29(get_words_from_line_list)
    22663    0.672    0.000    1.234    0.000 3000793489.py:41(get_words_from_string)
        2    2.125    1.062    2.125    1.062 3000793489.py:70(count_frequency)
        1    0.125    0.125    0.125    0.125 3000793489.py:87(insertion_sort)
       18    0.000    0.000    0.000    0.000 :0(__exit__)
        1    0.000    0.000    0.000    0.000 :0(acquire)
  1241422    0.234    0.000    0.234    0.000 :0(append)
      158    0.000    0.000    0.

KeyboardInterrupt: 

In [6]:
def get_words_from_line_list(L):
    """
    Parse the given list L of text lines into words.
    Return list of all words found.
    """

    word_list = []
    for line in L:
        words_in_line = get_words_from_string(line)
        word_list.extend(words_in_line)
    return word_list

In [7]:
main2(file2,file3)

File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
File data/t3.lewis.txt :
15996 lines,
182355 words,
8530 distinct words
The distance between the documents is: 0.574160 (radians)
         3375479 function calls in 8.656 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.016    0.008    1.750    0.875 2783323097.py:1(get_words_from_line_list)
        2    0.000    0.000    8.625    4.312 3000793489.py:109(word_frequencies_for_file)
        3    0.016    0.005    0.016    0.005 3000793489.py:127(inner_product)
        2    0.000    0.000    0.000    0.000 3000793489.py:14(read_file)
        1    0.000    0.000    0.016    0.016 3000793489.py:153(vector_angle)
        1    0.016    0.016    8.656    8.656 3000793489.py:163(main1)
    22663    0.875    0.000    1.734    0.000 3000793489.py:41(get_words_from_string)
        2    4.922    2.461    4.922    2.461 3000793489.py:70(count_frequency)


In [9]:
def count_frequency(word_list):
    """
    Return a list giving pairs of form: (word,frequency)
    """
    
    dico = {}
    for new_word in word_list :
        if new_word not in dico.keys() :
            dico[new_word] = 1
        else :
            dico[new_word] += 1
    return list(dico.items())

In [10]:
main2(file2,file3)

File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
File data/t3.lewis.txt :
15996 lines,
182355 words,
8530 distinct words
The distance between the documents is: 0.574160 (radians)
         3595773 function calls in 3.672 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.031    0.016    2.016    1.008 2783323097.py:1(get_words_from_line_list)
        2    0.000    0.000    3.656    1.828 3000793489.py:109(word_frequencies_for_file)
        3    0.000    0.000    0.016    0.005 3000793489.py:127(inner_product)
        2    0.000    0.000    0.000    0.000 3000793489.py:14(read_file)
        1    0.000    0.000    0.016    0.016 3000793489.py:153(vector_angle)
        1    0.000    0.000    3.672    3.672 3000793489.py:163(main1)
    22663    1.031    0.000    1.969    0.000 3000793489.py:41(get_words_from_string)
        2    1.500    0.750    1.500    0.750 3000793489.py:87(insertion_sort)
 

In [11]:
intab = string.punctuation+string.ascii_uppercase
outtab = " "*len(string.punctuation)+string.ascii_lowercase

tab = str.maketrans(intab,outtab)

def get_words_from_string(line):
    """
    Return a list of the words in the given input string,
    converting each word to lower-case.

    Input:  line (a string)
    Output: a list of strings 
              (each string is a sequence of alphanumeric characters)
    """
    line = line.translate(tab)
    word_list = line.split()
    return word_list

In [12]:
main2(file2,file3)

File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
File data/t3.lewis.txt :
15996 lines,
182355 words,
8530 distinct words
The distance between the documents is: 0.574160 (radians)
         366846 function calls in 1.844 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.016    0.008    0.016    0.008 2783323097.py:1(get_words_from_line_list)
        2    0.000    0.000    1.828    0.914 3000793489.py:109(word_frequencies_for_file)
        3    0.000    0.000    0.000    0.000 3000793489.py:127(inner_product)
        2    0.000    0.000    0.000    0.000 3000793489.py:14(read_file)
        1    0.000    0.000    0.000    0.000 3000793489.py:153(vector_angle)
        1    0.016    0.016    1.844    1.844 3000793489.py:163(main1)
        2    1.672    0.836    1.672    0.836 3000793489.py:87(insertion_sort)
    22663    0.000    0.000    0.000    0.000 3794080073.py:6(get_words_from_string)
   

In [19]:
def merge_sort(A):
  
    if len(A) > 1 :
        m = int(len(A)/2)
        L = merge_sort(A[:m])
        R = merge_sort(A[m:])
        return merge(L, R)
    return A


def merge(L,R):

    A = []
    i = 0
    j = 0

    while i < len(L) and j < len(R) :
        if L[i] <= R[j] :
            A.append(L[i])
            i += 1
        else :
            A.append(R[j])
            j += 1
    while i < len(L) :
        A.append(L[i])
        i += 1
    while j < len(R) :
        A.append(R[j])
        j += 1

    return A

In [20]:
#############################################
## compute word frequencies for input file ##
#############################################
def word_frequencies_for_file(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs 
    for the given file.
    """

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)
    freq_mapping = merge_sort(freq_mapping)

    print ("File",filename,":",)
    print (len(line_list),"lines,",)
    print (len(word_list),"words,",)
    print (len(freq_mapping),"distinct words")

    return freq_mapping


In [21]:
main2(file2,file3)

File data/t2.bobsey.txt :
6667 lines,
49785 words,
3354 distinct words
File data/t3.lewis.txt :
15996 lines,
182355 words,
8530 distinct words
The distance between the documents is: 0.574160 (radians)
         917375 function calls (893611 primitive calls) in 0.562 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.547    0.273 2246852195.py:4(word_frequencies_for_file)
  23766/2    0.031    0.000    0.328    0.164 2779681331.py:1(merge_sort)
    11882    0.141    0.000    0.297    0.000 2779681331.py:11(merge)
        2    0.016    0.008    0.031    0.016 2783323097.py:1(get_words_from_line_list)
        3    0.016    0.005    0.016    0.005 3000793489.py:127(inner_product)
        2    0.000    0.000    0.000    0.000 3000793489.py:14(read_file)
        1    0.000    0.000    0.016    0.016 3000793489.py:153(vector_angle)
        1    0.000    0.000    0.562    0.562 3000793489.py:163(main1)


In [None]:
def word_frequencies_for_file(filename):
    """
    Return alphabetically sorted list of (word,frequency) pairs 
    for the given file.
    """

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list)
    freq_mapping = count_frequency(word_list)

    print ("File",filename,":",)
    print (len(line_list),"lines,",)
    print (len(word_list),"words,",)
    print (len(freq_mapping),"distinct words")

    return freq_mapping

def inner_product(D1,D2):
    """
    Inner product between two vectors, where vectors
    are represented as dictionaries of (word,freq) pairs.

    Example: inner_product({"and":3,"of":2,"the":5},
                           {"and":4,"in":1,"of":1,"this":2}) = 14.0
    """
   
    # VOTRE CODE ICI.

In [None]:
def inner_product(L1,L2):
    """
    Inner product between two vectors, where vectors
    are represented as alphabetically sorted (word,freq) pairs.

    Example: inner_product([["and",3],["of",2],["the",5]],
                           [["and",4],["in",1],["of",1],["this",2]]) = 14.0 
    """
    sum = 0.0
    i = 0
    j = 0
    while i<len(L1) and j<len(L2):
        # L1[i:] and L2[j:] yet to be processed
        if L1[i][0] == L2[j][0]:
            # both vectors have this word
            sum += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:
            # word L1[i][0] is in L1 but not L2
            i += 1
        else:
            # word L2[j][0] is in L2 but not L1
            j += 1
    return sum