# Iterables and generator functions

## Iterables

Iterables: objects that can be iterated over (for example a list, dict, set, or tuple). Under the hood, iterables have an `__iter__` method.

In [None]:
# example (thanks chatGPT)
class MyIterable:
    """Countdown iterable: yields start, start - 1, ..., 1.

    The object acts as its own iterator: ``__iter__`` rewinds the position
    and returns ``self``, so every new ``for`` loop restarts the countdown.
    Because the position is stored on the instance, two loops running over
    the same object at the same time share (and disturb) each other's state.
    """

    def __init__(self, start):
        self.start = start
        # set the position here as well, so next(obj) works even when
        # iter(obj) has not been called first
        self.index = 0

    # called by iter() and at the start of a for loop:
    # rewinds the countdown and returns the iterator (which is 'self')
    def __iter__(self):
        self.index = 0
        return self

    # called repeatedly to fetch the next value;
    # raising StopIteration signals that the countdown is finished
    def __next__(self):
        if self.index >= self.start:
            raise StopIteration
        # counting the numbers down
        value = self.start - self.index
        self.index += 1
        return value

    # support for len(); a negative start yields no values, so report 0
    # (len() itself raises ValueError when __len__ returns a negative number)
    def __len__(self):
        return max(self.start, 0)

In [None]:
# build a countdown that starts at 5
c = MyIterable(5)

# len() delegates to MyIterable.__len__
n_items = len(c)
print('length', n_items)

# the for loop calls __iter__ once, then __next__ until StopIteration is raised
for num in c:
    print('Number is', num)

## Generator functions

Generator functions are functions that 'yield' one result at a time (instead of returning everything in one go).
We use these functions when processing many (large) files. Instead of reading all files into memory and then processing each file one by one, a generator function reads one file at a time.



In [None]:
def create_generator():
    """Yield the numbers 1, 5, 11 and 22 one at a time.

    The numbers are a stand-in for a real workload, e.g. 100,000 files of
    20MB each, where every 'yield' would hand back a single file instead of
    loading all of them at once.
    """
    values = (1, 5, 11, 22)
    for value in values:
        # printed right before the value is handed to the caller
        print('about to yield', value)
        yield value

In [None]:
# new generator: calling the generator function does not run its body yet,
# it only returns a generator object
my_gen = create_generator()
# last expression of the cell, so the notebook displays the generator object
my_gen

In [None]:
# a generator doesn't know its length: it has no __len__,
# so this call raises a TypeError (shown on purpose)
len(my_gen)

In [None]:
# each pass through the loop resumes create_generator until its next 'yield',
# so the 'about to yield' and 'just asked' messages alternate
for x in my_gen:
    print('just asked for a number ', x)

### Combined: Generator function as iterable

In [None]:
# example

import gensim
import os, string, re, glob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# build the stop-word set once; a set keeps the per-token membership test in MyDocuments.__iter__ cheap
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm
stopWords = set(stopwords.words('english') )

class MyDocuments():
    """Iterable over the tokenized ``.txt`` files of one folder.

    Only the list of file paths is kept in memory. Each file is read and
    tokenized when the loop asks for it, so the object can be iterated
    more than once and also supports ``len()``.
    """

    def __init__(self, folder):
        """Collect the paths of all ``*.txt`` files directly inside *folder*."""
        self.folder = folder
        # glob returns files in arbitrary (OS-dependent) order;
        # sorting makes the iteration order reproducible
        self.files = sorted(glob.glob(os.path.join(folder, '*.txt')))

    def __len__(self):
        """Return the number of files that were found."""
        return len(self.files)

    def __iter__(self):
        """Yield one list of tokens per file, reading one file at a time.

        A token is kept when it is purely alphabetic and its lowercase form
        is not in the module-level ``stopWords`` set.
        """
        for path in self.files:
            with open(path, encoding='utf-8') as f:
                content = f.read()
            # tokenize; isalpha() already rejects punctuation tokens,
            # so no separate punctuation check is needed
            yield [
                token for token in word_tokenize(content)
                if token.isalpha() and token.lower() not in stopWords
            ]
            

In [None]:
# folder that holds the comment letters (.txt files)
docs_folder = r'C:\Users\joost\Documents\teaching\acg7849-python\comment_letters'

# iterable that hands back one tokenized document at a time
fileList = MyDocuments(docs_folder)

In [None]:
# number of files found in the folder (len() calls MyDocuments.__len__)
len(fileList)

In [None]:
# iterate through the tokenized files (one at a time) and preview the first five
# fileList is not subscriptable though (fileList[0] fails), so we have to loop
from itertools import islice

# islice stops after 5 documents without reading a sixth file;
# enumerate supplies the running counter
for counter, f in enumerate(islice(fileList, 5)):
    # f is a tokenized document, show first 10 words
    print(counter, f[0:10])