In [13]:
import os
import sys
import cProfile
import operator
import itertools

#### Basic Usage of Generator

Generators are special functions that return a lazy iterator: values are produced one at a time on demand instead of being stored in memory all at once.
Generators can be created either through generator functions or through generator expressions (also called generator comprehensions, as opposed to list comprehensions).

Generator functions look just like regular functions, but they use the `yield` statement instead of `return`. Once the Python `yield` statement is hit, the program suspends function execution and returns the yielded value to the caller. (In contrast, `return` stops function execution completely.) When a function is suspended, the state of that function is saved. This includes any variable bindings local to the generator, the instruction pointer, the internal stack, and any exception handling. This allows you to resume function execution whenever you call one of the generator’s methods. In this way, all function evaluation picks back up right after `yield`.
Once all values have been yielded, iteration over the generator ends; if you call `next()` again, a `StopIteration` exception is raised.

In [6]:
def csv_reader(file_name):
    """Lazily yield the lines of a text file, one line at a time.

    Args:
        file_name: path of the file to read (opened in text mode).

    Yields:
        str: each line of the file, including its trailing newline.

    Raises:
        OSError: if the file cannot be opened (raised on the first
            ``next()`` call, because a generator body does not start
            running until it is first advanced).
    """
    # A file object is a lazy iterator over its lines, so only the current
    # line is held in memory. By comparison, `file.read().split("\n")`
    # loads the whole file into memory at once.
    # The `with` block closes the file once the last line has been yielded,
    # or when the generator itself is closed early.
    with open(file_name, "r") as csv_file:
        yield from csv_file
        
#csv_gen = (row for row in open(file_name))

Once we have a generator object (created by calling the generator function), we can iterate over it by repeatedly calling the `next()` function. With `infinite_sequence`, the first `next()` call yields the initial `num`; on each subsequent call the generator resumes right after the `yield`, increments `num`, and yields it again.

In [7]:
def infinite_sequence():
    """Yield the non-negative integers 0, 1, 2, ... without ever stopping.

    Yields:
        int: the next integer in the sequence, starting from 0.
    """
    current = 0
    while True:
        # Execution pauses here and resumes on the next `next()` call.
        yield current
        current = current + 1

In [9]:
# Calling the generator function only builds the generator object;
# none of its body runs until the first next() call.
num_gen = infinite_sequence()
print("type of num_gen", type(num_gen))
print("Using next to iterate the item in generator")
# Each next() resumes the generator until it reaches its next `yield`.
for call_number in range(4):
    print(next(num_gen))

type of num_gen <class 'generator'>
Using next to iterate the item in generator
0
1
2
3


Similar to a list comprehension, a generator expression (also called a generator comprehension) lets the programmer create a generator object in a few lines of code, without holding the entire sequence in memory before iteration.
Instead of the square brackets used for a list, simply use parentheses to turn it into a generator.

However, it can be faster to evaluate a short list comprehension than the equivalent generator expression.

In [12]:
# Build the same 500 squares twice: eagerly as a list, lazily as a generator.
nums_squared_lc = [n ** 2 for n in range(500)]
nums_squared_gc = (n ** 2 for n in range(500))

# getsizeof reports the container object itself: the list holds a slot for
# every element, while the generator only holds its paused execution state.
print("inspect the size of list comprehension:", sys.getsizeof(nums_squared_lc))
print("inspect the size of generator comprehension:", sys.getsizeof(nums_squared_gc))

inspect the size of list comprehension: 4272
inspect the size of generator comprehension: 88


In [17]:
# Profile the same sum computed two ways: over a fully built list and over
# a generator expression that feeds sum() one value at a time.
list_comprehension_stmt = 'sum([i * 2 for i in range(10000)])'
generator_expression_stmt = 'sum((i * 2 for i in range(10000)))'

cProfile.run(list_comprehension_stmt)
print("-------------------")
cProfile.run(generator_expression_stmt)

         5 function calls in 0.001 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001    0.001    0.001 <string>:1(<listcomp>)
        1    0.000    0.000    0.001    0.001 <string>:1(<module>)
        1    0.000    0.000    0.001    0.001 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {built-in method builtins.sum}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}


-------------------
         10005 function calls in 0.003 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    10001    0.001    0.000    0.001    0.000 <string>:1(<genexpr>)
        1    0.000    0.000    0.002    0.002 <string>:1(<module>)
        1    0.000    0.000    0.003    0.003 {built-in method builtins.exec}
        1    0.001    0.001    0.002    0.002 {built-in method builtins.sum}
      

#### Advanced Generator Methods 
`.send()` sends a (modified) value back into the generator; it becomes the result of the `yield` expression at which the generator is paused.

`.throw()` raises an exception inside the generator at the paused `yield`; it is useful wherever you need an exception to control where iteration stops.

`.close()` also allows you to stop a generator: it raises `GeneratorExit` inside the generator, and any later `next()` call raises `StopIteration`, the exception normally used to signal the end of a finite iterator.

In [34]:
# palindrome detector case 

def is_palindrome(num):
    """Return True if the integer ``num`` reads the same in both directions.

    Single-digit numbers (0, positive and negative alike) are by convention
    not counted as palindromes. The sign is ignored when comparing digits,
    so -121 is a palindrome just like 121.

    Args:
        num: the integer to test.

    Returns:
        bool: True when the decimal digits of ``num`` form a palindrome of
        at least two digits, False otherwise.
    """
    abs_num = abs(num)
    # Test the absolute value: `num // 10 == 0` is floor division, so it is
    # never true for negative single digits (-5 // 10 == -1), which would
    # then be reported as palindromes.
    if abs_num < 10:
        return False

    # Peel digits off the right-hand side and append them to the reversal.
    remaining = abs_num
    reversed_num = 0
    while remaining:
        remaining, last_digit = divmod(remaining, 10)
        reversed_num = reversed_num * 10 + last_digit

    return reversed_num == abs_num
    
    
def infinite_palindrome():
    """Yield palindromic integers in increasing order, starting the search at 0.

    The caller can redirect the search with ``.send(value)``: the sent value
    becomes the result of the paused ``yield`` expression and replaces the
    current candidate, so the search continues from ``value + 1``.

    Yields:
        int: the next number for which ``is_palindrome`` returns True.
    """
    candidate = 0
    while True:
        if is_palindrome(candidate):
            # A plain next() resumes this expression with None; .send(x)
            # resumes it with x.
            sent_value = yield candidate
            if sent_value is not None:
                candidate = sent_value
        candidate += 1
        

def execute_palinfrome_print():
    """Print palindromes, jumping ahead by digit count, until one has 5+ digits.

    Every value the ``for`` loop receives is printed, then ``10 ** digits``
    is sent into the generator so the search resumes just past the next
    power of ten. When a printed value has five or more digits, a
    ``ValueError`` is thrown into the generator; ``infinite_palindrome``
    does not catch it, so it propagates out of this function.

    NOTE(review): ``send()`` itself returns the next palindrome the
    generator yields (e.g. 101 after sending 100) and that return value is
    discarded here, so the loop prints the palindrome after it instead.

    Raises:
        ValueError: once a palindrome with at least five digits is printed.
    """
    palindromes = infinite_palindrome()

    for found in palindromes:
        print(found)
        n_digits = len(str(found))
        if n_digits >= 5:
            stop_signal = ValueError("No need to look for larger palindrome!")
            palindromes.throw(stop_signal)
            # Alternative that ends the generator without an error:
            # palindromes.close()
        # Jump the search to the first number with one more digit.
        palindromes.send(10 ** n_digits)
        

In [26]:
# (number, expected is_palindrome result) pairs used as a quick sanity check.
palindrome_test_case = [
    (12321, True),
    (1001, True),
    (1010, False),
    (1234, False),
]
for test_num, expected in palindrome_test_case:
    actual = is_palindrome(test_num)
    assert actual == expected

In [35]:
# Prints palindromes until one has five digits, then ends with the ValueError
# thrown into the generator -- the exception is the point of this demo.
execute_palinfrome_print()

11
111
1111
10101


ValueError: No need to look for larger palindrome!

#### Creating a Data Pipeline with Generators

In [43]:
# The data directory sits two levels above the notebook's working directory.
# os.path.join uses the platform's path separator instead of a hard-coded
# "/"; the trailing "" keeps the trailing separator on `data_folder`, so any
# code that appends a file name to it by concatenation keeps working.
data_folder = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "data", "")
fname_techcrunch = os.path.join(data_folder, "techcrunch.csv")
print("fname_techcrunch:", fname_techcrunch)

fname_techcrunch: /Users/shisi/ipython_projects/data/techcrunch.csv


In [89]:
# Stage 1: read the file lazily, one line at a time, through the
# `csv_reader` generator function defined earlier in this notebook.
lines = csv_reader(fname_techcrunch)
# Stage 2: strip trailing whitespace from each line and split it on commas.
line_values = (raw_line.rstrip().split(",") for raw_line in lines)
# Stage 3: advance the pipeline once to pull off the header row.
col_names = next(line_values)
# Stage 4: pair the header with every remaining row to build one dict per row.
# Nothing past the header is read until `company_dicts` is iterated, and it
# can only be iterated once.
company_dicts = (dict(zip(col_names, row_values)) for row_values in line_values)

In [85]:
# Keep only the series A funding rounds ...
series_a_dicts = (record for record in company_dicts if record["round"] == "a")
# ... and convert each raised amount to an integer.
funding = (int(record["raisedAmt"]) for record in series_a_dicts)
# Summing drives the whole pipeline and consumes `company_dicts`.
total_series_a = sum(funding)
print(f"Total series A fundraising: ${total_series_a}")

Total series A fundraising: $4376015000


In [90]:
# Bonus point: average amount raised per series A round.
# Pair every series A amount with a 1, so that summing each column of the
# pairs gives (number of rounds, total amount).
funding_arounds = (
    (1, int(company_dict["raisedAmt"]))
    for company_dict in company_dicts
    if company_dict["round"] == "a"
)

# zip(*pairs) transposes the pairs into a column of ones and a column of
# amounts; note that the * unpacking reads every pair into memory first.
series_a_totals = list(map(sum, zip(*funding_arounds)))
if not series_a_totals:
    # Without this guard an empty pipeline fails on the unpacking below with
    # an opaque "not enough values to unpack" error.
    raise ValueError(
        "No series A rounds found. `company_dicts` is a one-shot generator: "
        "if another cell has already iterated over it, re-run the cell that "
        "builds it before running this one."
    )
cnt_rounds, total_series_a = series_a_totals

avg_series_a = total_series_a / cnt_rounds
print(f"Average series A fundingraised: ${avg_series_a}")

Average series A fundingraised: $7531867.469879518
