# Data Preprocessing
The goal of this notebook is to preprocess all code files in the **openpyxl** librairy. We will create a generator that go through all files and exhaust the first lines of each module containing import and module description.

In [2]:
import os
import shutil
import re

In [3]:
base_dir = "openpyxl"

### 1. Create iterator pipeline for data processing

In [5]:
def gen_filepath(base_dir):
    """ Find all filenames in a directory that match a pattern """
    for root, dirs, files in os.walk(base_dir, topdown=True):
        for file in files:
            if '.pyc' not in file and '__' not in file and '.pyd' not in file:
                fpath = os.path.join(root, file)
                yield(fpath)

In [6]:
def gen_opener(filenames):
    """
    Open a sequence of filenames one at a time producing a file object. 
    File is immediately closed prior proceeding to the next iteration.
    """
    for filename in filenames:
        f = open(filename, 'r')
        yield f
        f.close()

In [7]:
def gen_reader(file):
    """ 
    Read a file object.
    Exhaust unnecessary lines when reading a file object 
    """
    # exhaust unecessary lines at the top of the files
    
    # class or def followed by anay characcters followed by semicolon
    pattern = r'(class|def)(.*):' 
    for line in file:
        if re.search(pattern, line):
            break
            
    # yield only if last line is a class or function definition       
    if re.search(pattern, line):
        yield line
        
        for line in file:
            yield line

        yield '\n\n'

In [8]:
def concat_gen(iterators):
    """ 
    Concatenate all iterators 
    The main goal is to yield from file at opening prior clossing them
    """
    for it in iterators:
        yield from it # yield file from gen_opener which will be close after being exhausted

In [9]:
# data pipeline
filenames = gen_filepath(base_dir)
files = gen_opener(filenames)
inside_files = concat_gen(files) # yield from files then close them
lines = gen_reader(inside_files)

In [10]:
counter = 0
for line in lines:
    counter+=1
print(f"There ar {counter} lines total.")

There ar 28806 lines total.


### 2. Concatenate all lines in a single file

Here we generate a single file containing all the lines of code. We won't really use the generator pipeline as we are going to learn from random chunk of text. We need to process all the text at once. 

In [12]:
filenames = gen_filepath(base_dir)
files = gen_opener(filenames)
inside_files = concat_gen(files) # yield from files then close them
lines = gen_reader(inside_files)

# concatenate all in one text file
with open('openpyxl.txt', 'w') as f:
    for line in lines:
        f.write(line)