In this exercise, we want to find the URLs that match a given set of regular expressions.

D = the content, containing many URLs (possibly big data)
RE = the given set of regular expressions

In [1]:
# import libraries
import re
from difflib import SequenceMatcher
from datetime import datetime
import multiprocessing
from multiprocessing import Process, Queue

Given the time available, I developed a greedy function for measuring the similarity between two strings; a better option is `SequenceMatcher` from `difflib`.

In [2]:
def MyGreedySimilarityStrings(X, Y):
    """Return a greedy count of the items X and Y have in common, in order.

    For every start offset in X, the items of X from that offset onwards are
    walked left to right. Each item is compared against Y from the current
    resume position onwards and paired with the first equal item found; an
    item with no equal partner is skipped. The largest number of pairs found
    over all start offsets is returned.

    This is a greedy heuristic, not an exact longest-common-subsequence
    length: for example ("azbc", "abcz") yields 2 although "abc" is common
    to both.

    Args:
        X: First sequence (must support len() and integer indexing).
        Y: Second sequence (same requirements).

    Returns:
        int: The best greedy pair count; 0 when X or Y is empty.
    """
    len_x, len_y = len(X), len(Y)
    best = 0

    for start_on in range(len_x):
        matched = 0
        i = start_on  # position in X currently being paired
        j = 0         # position in Y currently being compared
        # Scanning of Y resumes at last_j + 1. Because this starts at 0,
        # Y[0] is only ever compared with X[start_on].
        last_j = 0

        while i < len_x:
            if j < len_y and X[i] == Y[j]:
                # Pair found: move on to the next item of X.
                i += 1
                matched += 1
                last_j = j

            j += 1

            if j >= len_y:
                # Y is exhausted for the current item of X: skip that item
                # and resume just after the last paired position of Y.
                j = last_j + 1
                i += 1

        best = max(best, matched)

    return best

In [3]:
# Smoke test for MyGreedySimilarityStrings on a pair that differs by one
# inserted character ("x"). The printed label says "LCS", but the value is the
# function's greedy match count.
X, Y = "abxcd", "abcd"
print("Length of LCS is ", MyGreedySimilarityStrings(X, Y))

Length of LCS is  4


In [4]:
def RetError(str1, str2, useSequenceMatcher=True):
    """Return a dissimilarity score for two strings (0 for identical inputs).

    Args:
        str1: First string.
        str2: Second string.
        useSequenceMatcher: When True (the default), the score is based on
            difflib.SequenceMatcher; when False, it is based on
            MyGreedySimilarityStrings.

    Returns:
        ``1 - SequenceMatcher(None, str1, str2).ratio()`` by default,
        otherwise ``1 - greedy_matches / max(len(str1), len(str2))``.
        Two empty strings score 0.0 with either method.
    """
    if useSequenceMatcher:
        return 1 - SequenceMatcher(None, str1, str2).ratio()

    longest = max(len(str1), len(str2))
    if longest == 0:
        # Both inputs are empty: avoid dividing by zero and report "no error",
        # which is also what the SequenceMatcher branch returns for this case.
        return 0.0

    return 1 - MyGreedySimilarityStrings(str1, str2) / longest

In [5]:
def IsMatch(worker_id, output_queue, Part_D, RE, startIdx, endIdx):
    """Score the entries Part_D[startIdx:endIdx] against one regular expression.

    Each entry is searched with the compiled pattern. A hit is recorded as an
    error of 0; a miss is recorded as RetError(entry, RE), i.e. the
    dissimilarity between the entry and the pattern text itself.

    Args:
        worker_id: Identifier copied into the result under 'worker_id'.
        output_queue: Object with a put() method; receives the result dict.
        Part_D: Indexable collection of strings to be processed.
        RE: The regular expression (pattern string) at hand.
        startIdx: Index of the first entry of Part_D to process (inclusive).
        endIdx: Index at which processing stops (exclusive).

    Returns:
        None. One dict is put on output_queue with the keys 'worker_id',
        'time' (start timestamp), 'error<i>' for every processed index i,
        and 'deltatime' (elapsed time as a timedelta).
    """
    started_at = datetime.now()
    pattern = re.compile(RE)

    out = {'worker_id': worker_id, 'time': started_at}
    for idx in range(startIdx, endIdx):
        candidate = Part_D[idx]
        if pattern.search(candidate):
            score = 0
        else:
            score = RetError(candidate, RE)
        out['error' + str(idx)] = score

    out['deltatime'] = datetime.now() - started_at
    output_queue.put(out)

In [6]:
# Test for IsMatch: run it synchronously in this process, once per pattern,
# and print the result dict(s) it puts on the queue.
RE = ['https://MyWeb.com/(.*)']

D = ['https://MyWeb.com/1',
     'https://MyWeb.com/2',
     'https://MyWeb.com/3',
     'https://YourWeb.com/1',
     'https://YourWeb.com/2',
     'https://YourWeb.com/3',
    ]

output = Queue()

for i in range(len(RE)):
    IsMatch(i, output, D, RE[i], 0, len(D))

# IsMatch queues one dict per call, so read back one result per pattern
# (a single get() would leave later results unread if RE grows).
for i in range(len(RE)):
    print(output.get())

{'worker_id': 0, 'time': datetime.datetime(2019, 11, 17, 13, 35, 21, 117891), 'error0': 0, 'error1': 0, 'error2': 0, 'error3': 0.2558139534883721, 'error4': 0.2558139534883721, 'error5': 0.2558139534883721, 'deltatime': datetime.timedelta(0)}


In [7]:
# Multiprocessing smoke test: run utils.test in a child process and read its
# reply from a queue.

from multiprocessing import Process, Queue

# Having the function definition here results in
# AttributeError: Can't get attribute 'f' on <module '__main__' (built-in)>

# The solution seems to be importing the function from a separate file.
# utils contains the above functions
import utils

if __name__ == '__main__':
    # Define an output queue
    output = Queue()

    # Set up the process that we want to run
    # (utils.test is expected to put its reply on the queue - defined in utils)
    p = Process(target=utils.test, args=('somebody', output))

    # Run process
    p.start()

    # Get the process result from the output queue *before* joining: a child
    # that has put data on a queue may not exit until that data is consumed,
    # so join-then-get can deadlock. Queue.get() takes (block, timeout), not
    # a process, so it is called without arguments.
    result = output.get()

    # Wait for the completed process to exit
    p.join()

    print(result)

hello somebody


In [8]:
import utils

if __name__ == '__main__':
    RE = ['https://MyWeb.com/(.*)',
          'https://YourWeb.com/(.*)',]

    D = ['https://MyWeb.com/1',
         'https://MyWeb.com/2',
         'https://MyWeb.com/3',
         'https://YourWeb.com/1',
         'https://YourWeb.com/2',
         'https://YourWeb.com/3',
        ]

    jobs = []

    output = Queue()

    # One worker process per regular expression; each one scans all of D.
    for i in range(len(RE)):
        p = Process(target=utils.IsMatch, args=(i, output, D, RE[i], 0, len(D)))
        jobs.append(p)
        p.start()

    # Collect one result per worker *before* joining: a child that has put
    # data on a queue may not exit until that data is consumed, so
    # join-then-get can deadlock on large results. Queue.get() takes
    # (block, timeout), not a process, so it is called without arguments.
    # Results arrive in completion order (each worker is expected to put
    # exactly one dict; its 'worker_id' identifies the pattern).
    results = [output.get() for _ in jobs]

    for p in jobs:
        p.join()

    print(results)

[{'worker_id': 0, 'time': datetime.datetime(2019, 11, 17, 13, 35, 21, 352264), 'error0': 0, 'error1': 0, 'error2': 0, 'error3': 0.2558139534883721, 'error4': 0.2558139534883721, 'error5': 0.2558139534883721, 'deltatime': datetime.timedelta(0, 0, 998)}, {'worker_id': 1, 'time': datetime.datetime(2019, 11, 17, 13, 35, 21, 353262), 'error0': 0.2558139534883721, 'error1': 0.2558139534883721, 'error2': 0.2558139534883721, 'error3': 0, 'error4': 0, 'error5': 0, 'deltatime': datetime.timedelta(0)}]
