# Extract documentation for each module and subsequently each of its function

# Import

In [1]:
import ast
import numpy as np
from bs4 import BeautifulSoup as bs

# Load dataset

In [None]:
# codeLines is a dataframe that is extracted out of a JSON file

# Run 

In [5]:
# zip up source_dir located in GitHub remote_url's remote_branch and add it to Spark's source context
remote_url = "https://github.com/tiffanyj41/hermes.git"
remote_branch = "py2vec-docstring"
source_dir = "src"
debug = True

# helper functions
import os
import functools

def _list_all_in_dir(dir_path):
    for path, subdirs, files in os.walk(dir_path):
        for filename in files:
            print os.path.join(path, filename)
            
def _zip_dir(srcdir_path, zipfile_handler):
    try:
        zipfile_handler.writepy(srcdir_path)
    finally:
        zipfile_handler.close()
            
def trackcalls(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        wrapper.has_been_called = True
        return func(*args, **kwargs)
    wrapper.has_been_called = False
    return wrapper

@trackcalls
def _add_zipfile_to_sc(zipfile_path):
    sc.addPyFile(zipfile_path) 
    
import git
import os
import tempfile
import shutil
import zipfile    

# create a temporary directory
tmpdir_path = tempfile.mkdtemp()
if debug: print "temporary directory: %s\n" % tmpdir_path

# ensure file is read/write by creator only
saved_umask = os.umask(0077)

# create a zipfile handler to zip the necessary files
ziptmpdir_path = tempfile.mkdtemp()
if debug: print "temporary directory for zip file: %s\n" % ziptmpdir_path
zipfile_path = ziptmpdir_path + "/hermes_src_2.zip"
if debug: print "zip file's path: %s\n" % zipfile_path
zipfile_handler = zipfile.PyZipFile(zipfile_path, "w")

# make zipfile handler verbose for debugging
zipfile_handler.debug = 3

try:
    # clone "framework" branch from GitHub into temporary directory
    local_branch = git.Repo.clone_from(remote_url, tmpdir_path, branch=remote_branch)
    if debug: print "current branch: %s\n" % local_branch.head.ref
    if debug: print "list all in %s:" % tmpdir_path; _list_all_in_dir(tmpdir_path); print "\n"
        
    # zip "hermes" directory
    if debug: print "zipping: %s\n" % os.path.join(tmpdir_path, source_dir)
    _zip_dir(os.path.join(tmpdir_path, source_dir), zipfile_handler)
    
    # check zip file
    if debug: print "Is zip file %s valid? %s\n" % (zipfile_path, zipfile.is_zipfile(zipfile_path))
    
    # add zip to SparkContext 
    # note: you can only add zip to SparkContext one time
    if not _add_zipfile_to_sc.has_been_called:
        if debug: print "add zip file %s into spark context\n" % zipfile_path
        _add_zipfile_to_sc(zipfile_path)
    else:
        if debug: print "zip file %s is already added into spark context; will not re-add\n" % zipfile_path
    
except IOError as e:
    raise e
else:
    os.remove(zipfile_path)
finally:
    os.umask(saved_umask)
    shutil.rmtree(tmpdir_path)
    shutil.rmtree(ziptmpdir_path)

temporary directory: /tmp/tmpsH6z9Q

temporary directory for zip file: /tmp/tmpQ3j2WI

zip file's path: /tmp/tmpQ3j2WI/hermes_src_2.zip

current branch: py2vec-docstring

list all in /tmp/tmpsH6z9Q:
/tmp/tmpsH6z9Q/.gitignore
/tmp/tmpsH6z9Q/README.md
/tmp/tmpsH6z9Q/LICENSE
/tmp/tmpsH6z9Q/.git/config
/tmp/tmpsH6z9Q/.git/packed-refs
/tmp/tmpsH6z9Q/.git/index
/tmp/tmpsH6z9Q/.git/description
/tmp/tmpsH6z9Q/.git/HEAD
/tmp/tmpsH6z9Q/.git/hooks/applypatch-msg.sample
/tmp/tmpsH6z9Q/.git/hooks/pre-rebase.sample
/tmp/tmpsH6z9Q/.git/hooks/update.sample
/tmp/tmpsH6z9Q/.git/hooks/post-commit.sample
/tmp/tmpsH6z9Q/.git/hooks/commit-msg.sample
/tmp/tmpsH6z9Q/.git/hooks/prepare-commit-msg.sample
/tmp/tmpsH6z9Q/.git/hooks/post-update.sample
/tmp/tmpsH6z9Q/.git/hooks/pre-applypatch.sample
/tmp/tmpsH6z9Q/.git/hooks/post-receive.sample
/tmp/tmpsH6z9Q/.git/hooks/pre-commit.sample
/tmp/tmpsH6z9Q/.git/refs/remotes/origin/HEAD
/tmp/tmpsH6z9Q/.git/refs/heads/py2vec-docstring
/tmp/tmpsH6z9Q/.git/logs/HEAD
/tmp/t

In [6]:
# import the required modules from Hermes
from src.data_prep.model import py2vec_docstring_model  
from src.data_prep import git_vectorize as gv
from src.algorithms import content_based
from src.algorithms import performance_metrics
from src.utils import save_load as sl

In [7]:
py2vecDocstringModel = py2vec_docstring_model.Py2VecDocstringModel(codeLinesDF)
model_dict = py2vecDocstringModel.get_model_dict()

NameError: global name 'word2vec' is not defined

In [None]:
vectorizer = gv.git_vectorize(codeLinesDF, "any_interact", "py2vec", sc, model=model_dict)
user_vector = vectorizer.get_user_vector()
content_vector = vectorizer.get_content_vector()

In [None]:
# Split into a test and training set
train_ratings, test_ratings = user_vector.randomSplit(weights=[90, 10], seed=41)
test_ratings.cache()

In [None]:
predicted = content_based.predict(train_ratings, item_content)
predicted.cache()

In [None]:
rmse = performance_metrics.calculate_rmse_using_rdd(test_ratings, predicted)
print rmse

In [None]:
mae = performance_metrics.calculate_mae_using_rdd(test_ratings, predicted)
print mae

In [None]:
coverage = performance_metrics.calculate_user_coverage(test_ratings, train_ratings, predicted)
print coverage

In [None]:
pred_coverage = performance_metrics.calculate_prediction_coverage(test_ratings, predicted)
print pred_coverage

In [None]:
item_coverage = performance_metrics.calculate_item_coverage(test_ratings, train_ratings, predicted)
print item_coverage

# Test
From here on out, this is the testing phase so that it can be executed like in the "Run" section above.

# Extract docstring

Check out AST: https://github.com/python-git/python/blob/715a6e5035bb21ac49382772076ec4c630d6e960/Lib/test/test_ast.py

We will just extract the docstring from FunctionDef, ClassDef, Import and ImportFrom.

```
exec_tests = [
    # FunctionDef
    "def f(): pass",
    # ClassDef
    "class C:pass",
    # Return
    "def f():return 1",
    # Delete
    "del v",
    # Assign
    "v = 1",
    # AugAssign
    "v += 1",
    # Print
    "print >>f, 1, ",
    # For
    "for v in v:pass",
    # While
    "while v:pass",
    # If
    "if v:pass",
    # Raise
    "raise Exception, 'string'",
    # TryExcept
    "try:\n  pass\nexcept Exception:\n  pass",
    # TryFinally
    "try:\n  pass\nfinally:\n  pass",
    # Assert
    "assert v",
    # Import
    "import sys",
    # ImportFrom
    "from sys import v",
    # Exec
    "exec 'v'",
    # Global
    "global v",
    # Expr
    "1",
    # Pass,
    "pass",
    # Break
    "break",
    # Continue
    "continue",
]
```

In [19]:
# extract all lines in files
# codeLines == [((reponame, filename), (line_num, line))]
codeLines = codeLinesDF.map(
    lambda (
        author,
        author_mail,
        author_time,
        author_timezone,
        comment,
        commit_id,
        committer,
        committer_mail,
        committer_time,
        committer_timezone,
        filename,
        line,
        line_num,
        reponame,
        ):
        ((reponame, filename), (line_num, line))
).cache()

print codeLines.count()
codeLines.take(10)

179637


[((u'numpy', u'setup.py'), (30, u'    import builtins')),
 ((u'numpy', u'setup.py'), (60, u'MINOR               = 12')),
 ((u'numpy', u'setup.py'),
  (90,
   u'# BEFORE importing setuptools, remove MANIFEST. Otherwise it may not be')),
 ((u'numpy', u'setup.py'), (120, u'    if not ISRELEASED:')),
 ((u'numpy', u'setup.py'), (150, u'')),
 ((u'numpy', u'setup.py'), (180, u'')),
 ((u'numpy', u'setup.py'), (210, u'def parse_setuppy_commands():')),
 ((u'numpy', u'setup.py'), (240, u'')),
 ((u'numpy', u'setup.py'),
  (270,
   u'            into a bug, please report it at https://github.com/numpy/numpy/issues.')),
 ((u'numpy', u'setup.py'),
  (300,
   u"              - `git clean -Xdf` (cleans all versioned files, doesn't touch"))]

In [20]:
print "number of code lines: ", codeLines.values().count()

number of code lines:  179637


In [21]:
print "number of files: ", codeLines.keys().distinct().count()

number of files:  356


In [22]:
# append each file's code lines
# fileLines == [((reponame, filename), [(line_num_1, line_1), ..., (line_num_n, line_n)])]
fileLines = codeLines.mapValues(lambda val: [val]).reduceByKey(lambda a, b: a + b)

# sort each file's code lines by line number and extract only the lines from a file
def sortByLineNumberAndExtractLines(listOfCodeLines):
    sortedListOfCodeLines = sorted(listOfCodeLines, key=lambda (line_num, line): line_num)
    fileLines = ""
    for lineNum, line in sortedListOfCodeLines:
        fileLines += line + "\n"
    return fileLines

# fileLines == [((reponame, filename), filelines)]
fileLines = fileLines.mapValues(lambda listOfCodeLines: sortByLineNumberAndExtractLines(listOfCodeLines))
print "number of code lines: ", fileLines.count()
print "fileLines format: ", fileLines.take(1)

number of code lines:  356
fileLines format:  [((u'numpy', u'numpy/f2py/tests/test_callback.py'), u'from __future__ import division, absolute_import, print_function\n\nimport math\nimport textwrap\n\nfrom numpy import array\nfrom numpy.testing import run_module_suite, assert_, assert_equal, dec\nimport util\n\n\nclass TestF77Callback(util.F2PyTest):\n    code = """\n       subroutine t(fun,a)\n       integer a\ncf2py  intent(out) a\n       external fun\n       call fun(a)\n       end\n\n       subroutine func(a)\ncf2py  intent(in,out) a\n       integer a\n       a = a + 11\n       end\n\n       subroutine func0(a)\ncf2py  intent(out) a\n       integer a\n       a = 11\n       end\n\n       subroutine t2(a)\ncf2py  intent(callback) fun\n       integer a\ncf2py  intent(out) a\n       external fun\n       call fun(a)\n       end\n\n       subroutine string_callback(callback, a)\n       external callback\n       double precision callback\n       double precision a\n       character*1 r\ncf

In [23]:
# take one of the files and see the file's lines
"""
teststr = fileLines.values().take(10)[7]
print teststr
"""

'\nteststr = fileLines.values().take(10)[7]\nprint teststr\n'

In [24]:
all_files = fileLines.collect()

In [94]:
import ast
import inspect
import imp
import os

def getModule(parent_module_name, this_module_name):
    # this implementation only works on python 2.7
    parent_module = __import__(parent_module_name, globals(), locals(), this_module_name)
    if this_module_name is None:
        return parent_module
    else:
        this_module = getattr(parent_module_name, this_module_name)
        return this_module
    
"""
import importlib
def getModule(parent_module_name, this_module_name):
    # this implementation only works on python 3
    parent_module_name = importlib.import_module(parent_module_name)
    if this_module_name is None:
        return parent_module
    else: 
        this_module = getattr(parent_module_name, this_module_name)
        return this_module
"""

def __get_repo_docstring(repo_name):
    try:
        repo_module = getModule(repo_name, None)
        docstring = inspect.getdoc(repo_module)
        return docstring
    except Exception:
        return ""


def __get_file_docstring(file_path):
    # this function does not grab the docstring of intermediary modules
    # ie. grandparent_module.parent_module.child_module.granchild_module
    # this function will only grab the docstring of child_module and grandchild_module
    # but not grandparent_module or parent_module
    try:
        parent_module_name = os.path.dirname(file_path).replace("/", ".")
        this_module_name = os.path.splitext(os.path.basename(file_path))[0]
        docstring = inspect.getdoc(getModule(str(parent_module_name), None))
        docstring += inspect.getdoc(getModule(str(parent_module_name), str(this_module_name)))
        return docstring
    except Exception:
        return ""

def __get_import_docstring(ast_module):
    # this function does not grab the docstring of 
    # import libraries within the same project
    
    try:
        # get import library docstring
        import_definitions =  [node for node in ast_module.body if isinstance(node, ast.Import)]
        docstring = ""
        for import_definition in import_definitions:
            import_alias = import_definition.names[0]
            import_module_name = import_alias.name
            import_module = getModule(import_module_name, None)
            docstring += inspect.getdoc(import_module)
        return docstring
    except Exception:
        return ""
    
def __get_import_from_docstring(ast_module):
    # this function does not grab the docstring of 
    # import libraries within the same project
    
    try:
        # get import library docstring
        import_definitions =  [node for node in ast_module.body if isinstance(node, ast.ImportFrom)]
        docstring = ""
        for import_definition in import_definitions:
            import_alias = import_definition.names[0]
            import_module_name = import_alias.name
            import_module = getModule(import_module_name, None)
            tmp_docstring = inspect.getdoc(import_module)
            if tmp_docstring is not None:
                docstring += tmp_docstring
        return docstring
    except Exception:
        return ""

def __get_function_docstring(ast_module):
    try:
        function_definitions = [node for node in ast_module.body if isinstance(node, ast.FunctionDef)]
        docstring = ""
        for function_definition in function_definitions:
            #function_name = function_definition.name
            function_docstring = ast.get_docstring(function_definition)
            if function_docstring is not None:
                docstring += function_docstring
        return docstring
    except Exception:
        return ""

def __get_class_docstring(ast_module):
    try:
        class_definitions = [node for node in ast_module.body if isinstance(node, ast.ClassDef)]
        docstring = ""
        for class_definition in class_definitions:
            #class_name = class_definition.name
            class_docstring = ast.get_docstring(class_definition)
            if class_docstring is not None:
                docstring += class_docstring
            # add the class's functions' docstrings too!
            docstring += __get_class_function_docstring(class_definition.body)
        return docstring
    except Exception:
        return ""
        
def __get_class_function_docstring(function_definitions):
    try:
        docstring = ""
        for function_definition in function_definitions:
            if isinstance(function_definition, ast.FunctionDef):
                #function_name = function_definition.name
                function_docstring = ast.get_docstring(function_definition)
                if function_docstring is not None:
                    docstring += function_docstring
        return docstring
    except Exception:
        return ""

def get_docstring(((repo_name, file_path), file_lines)):
    # returns [((repo_name, file_path), file_docstrings)]
    docstring = ""
    docstring = __get_repo_docstring(repo_name)
    docstring += __get_file_docstring(file_path)
    try:
        # get ast's module from file's lines
        ast_module = ast.parse(file_lines)
    except Exception:
        pass
    else:
        docstring += __get_import_docstring(ast_module)        
        docstring += __get_import_from_docstring(ast_module)
        docstring += __get_function_docstring(ast_module)
        docstring += __get_class_docstring(ast_module)
        
    return ((repo_name, file_path), docstring)

fileDocstrings = fileLines.map(
    lambda ((repo_name, file_path), file_lines): get_docstring(((repo_name, file_path), file_lines))
)

print fileLines.count()
print fileDocstrings.count()
print fileDocstrings.take(1)

356
356
[((u'numpy', u'numpy/f2py/tests/test_callback.py'), 'NumPy\n=====\n\nProvides\n  1. An array object of arbitrary homogeneous items\n  2. Fast mathematical operations over arrays\n  3. Linear Algebra, Fourier Transforms, Random Number Generation\n\nHow to use the documentation\n----------------------------\nDocumentation is available in two forms: docstrings provided\nwith the code, and a loose standing reference guide, available from\n`the NumPy homepage <http://www.scipy.org>`_.\n\nWe recommend exploring the docstrings using\n`IPython <http://ipython.scipy.org>`_, an advanced Python shell with\nTAB-completion and introspection capabilities.  See below for further\ninstructions.\n\nThe docstring examples assume that `numpy` has been imported as `np`::\n\n  >>> import numpy as np\n\nCode snippets are indicated by three greater-than signs::\n\n  >>> x = 42\n  >>> x = x + 1\n\nUse the built-in ``help`` function to view a function\'s docstring::\n\n  >>> help(np.sort)\n  ... # doct

In [97]:
docstrings = fileDocstrings.map(lambda ((repo_name, file_path), docstring): docstring)
print docstrings.count()
print docstrings.take(1)

356
['NumPy\n=====\n\nProvides\n  1. An array object of arbitrary homogeneous items\n  2. Fast mathematical operations over arrays\n  3. Linear Algebra, Fourier Transforms, Random Number Generation\n\nHow to use the documentation\n----------------------------\nDocumentation is available in two forms: docstrings provided\nwith the code, and a loose standing reference guide, available from\n`the NumPy homepage <http://www.scipy.org>`_.\n\nWe recommend exploring the docstrings using\n`IPython <http://ipython.scipy.org>`_, an advanced Python shell with\nTAB-completion and introspection capabilities.  See below for further\ninstructions.\n\nThe docstring examples assume that `numpy` has been imported as `np`::\n\n  >>> import numpy as np\n\nCode snippets are indicated by three greater-than signs::\n\n  >>> x = 42\n  >>> x = x + 1\n\nUse the built-in ``help`` function to view a function\'s docstring::\n\n  >>> help(np.sort)\n  ... # doctest: +SKIP\n\nFor some objects, ``np.info(obj)`` may pr

In [99]:
import re
wordstrings = docstrings.map(lambda docstring: re.sub("[^\w]", " ", docstring).split())
print wordstrings.count()
print wordstrings.take(1)

356
[['NumPy', 'Provides', '1', 'An', 'array', 'object', 'of', 'arbitrary', 'homogeneous', 'items', '2', 'Fast', 'mathematical', 'operations', 'over', 'arrays', '3', 'Linear', 'Algebra', 'Fourier', 'Transforms', 'Random', 'Number', 'Generation', 'How', 'to', 'use', 'the', 'documentation', 'Documentation', 'is', 'available', 'in', 'two', 'forms', 'docstrings', 'provided', 'with', 'the', 'code', 'and', 'a', 'loose', 'standing', 'reference', 'guide', 'available', 'from', 'the', 'NumPy', 'homepage', 'http', 'www', 'scipy', 'org', '_', 'We', 'recommend', 'exploring', 'the', 'docstrings', 'using', 'IPython', 'http', 'ipython', 'scipy', 'org', '_', 'an', 'advanced', 'Python', 'shell', 'with', 'TAB', 'completion', 'and', 'introspection', 'capabilities', 'See', 'below', 'for', 'further', 'instructions', 'The', 'docstring', 'examples', 'assume', 'that', 'numpy', 'has', 'been', 'imported', 'as', 'np', 'import', 'numpy', 'as', 'np', 'Code', 'snippets', 'are', 'indicated', 'by', 'three', 'greater',

# Create Py2Vec Model

In [28]:
from pyspark.mllib.feature import Word2Vec

word2vec = Word2Vec()
word2vec.setMinCount(20) # Default 100
word2vec.setSeed(41)
word2vec.setLearningRate(0.025) # Default 0.025
word2vec.setVectorSize(50) # Default 100
model = word2vec.fit(wordstrings)

In [29]:
# Prep the Word2Vec model for broadcast by converting it to a Python dictionary,
# then ship it to all the nodes in the cluster
model_dict = {k:np.array(list(v)) for k,v in dict(model.getVectors()).iteritems()}

In [30]:
print model_dict

{u'Fortran': array([ 0.02297839,  0.27328795,  0.17778988, -1.1985925 , -0.7691386 ,
       -0.22725247, -0.04331338,  0.5520691 ,  0.52358437, -0.10084201,
       -1.5504944 , -0.2613396 ,  0.23142007, -0.24758275, -0.95215905,
       -0.7983866 , -0.08122747,  0.28283873, -0.01520761, -1.1688166 ,
        0.47566125,  0.2133843 ,  0.65748984,  1.0203304 ,  0.24432151,
       -0.49718478, -0.38549268,  0.27712834, -0.08792873, -1.3555474 ,
       -0.15897965, -0.45683077,  0.7146918 , -0.843222  , -0.51765454,
       -0.03813894, -0.13249576, -0.00496246,  0.5132668 , -0.09224562,
       -0.51973057,  0.44389376,  0.10183978, -0.5817857 ,  0.3730536 ,
        0.09081241,  0.39427316,  0.6154644 , -0.13310893, -0.48543847]), u'prefix': array([ 0.63307995, -1.6539733 ,  0.32624993,  0.35372183,  0.5430198 ,
        0.15663859, -0.30488077,  0.28085494,  0.45687878, -0.54641986,
       -1.5352894 , -0.3396193 ,  0.31436175, -0.19218118, -0.99662834,
       -1.2450931 , -0.9856934 , -0.39

# Set function_map for CV

No need to implement this section. It is already implemented in git_vectorize.py.

In [46]:
def lines_to_file(lines):
    """Construct a file from the lines that make it up.

    Args:
        vals: an iterator over rows of the following form:
            (author, line_num, line_text)

    Returns:
        str: The full text of the file
    """
    output = ""
    for (author, line_num, line) in sorted(lines):
        output += line + '\n'

    return output

class Lexer(ast.NodeVisitor):
    """Parse a node from a AST and return a tuple of the content.

    The returned tuple is of the form:

        (line_number, node_type, name)

    node_type can be either "Import" or "Call".

    """

    def visit_Import(self, node):
        """Called for "import library" statements."""
        items = []
        for item in node.names:
            items.append((node.lineno, "Import", item.name))
        self.generic_visit(node)
        return items

    def visit_ImportFrom(self, node):
        """Called for "from library import object" statements."""
        self.generic_visit(node)
        return [(node.lineno, "Import", node.module)]

    def visit_Call(self, node):
        """Called for function and method calls."""
        id = None
        # Some nodes have their name in the function object
        try:
            id = node.func.id
        except AttributeError:
            pass
        # Others (those called as methods, or with a library name leading) have
        # the name in the attr block
        try:
            id = node.func.value.id + '.' + node.func.attr
        except AttributeError:
            pass

        self.generic_visit(node)

        if id:
            return [(node.lineno, "Call", id)]
        
def run_lexer(row):
    """Run the lexer over the text of a Python file.

    Args:
        row: A row from an RDD of the form ((repo, file), file_text)

    Returns:
        list of tuples: each tuples is of the form (object_type, object_name).
            Returns an empty list on parsing failure.
    """
    uniq_id, content = row
    new_content = []
    # There is a bug in Python when the first line is a coding statement:
    # https://bugs.python.org/issue22221
    for line in content.splitlines():
        if not (line.startswith("#") and "coding:" in line):
            new_content.append(line)

    try:  # Sometimes Python3 files sneak in that can not be parsed
        tree = ast.parse('\n'.join(new_content))
    except Exception as e:
        return []

    output = []
    for node in ast.walk(tree):
        ret = Lexer().visit(node)
        if ret:
            for item in ret:
                output.append((uniq_id, item))

    if output:
        return output
    else:
        return []
    
def clean_names(function):
    """Clean function names.

    Args:
        function (str): a function name.

    Returns:
        str: A string with all characters lowered and only the last word if
            there are multiple period joined words.

    """
    if function is not None:
        out = function.lower()
        out = out.split('.')[-1]
        return out
    
def function_to_vector(function, model):
    """Assigns a vector to a function.

    Args:
        function (str): the function name.
        model (dict): A mapping that words as follows:
            model[function] == vector

    Returns:
        vector: A numpy array, or None if the function was not found in the
            model.
    """
    try:
        return model[function]
    except:
        return None


In [41]:
# set lines_to_functions
loaded_data = codeLinesDF.map(
    lambda (
        author,
        author_mail,
        author_time,
        author_timezone,
        comment,
        commit_id,
        committer,
        committer_mail,
        committer_time,
        committer_timezone,
        filename,
        line,
        line_num,
        repo_name,
    ):
    ((repo_name, filename), ((author, author_mail), line_num, line))
)
# Group data by file so that we can reconstruct the file
grouped_lines = loaded_data.groupByKey()
# Reconstruct the files from their lines so we can run them through the lexer
reconstructed_files = grouped_lines.map(lambda (key, lines): (key, lines_to_file(lines)))
# Make a map of files to functions
file_to_functions = reconstructed_files.flatMap(lambda row: run_lexer(row))
# Map lines to functions
lines_to_functions = file_to_functions.map(
    lambda ((repo, file), (line_num, cont_type, cont)): ((repo, file, line_num), clean_names(cont))
)
print lines_to_functions.count()
print lines_to_functions.take(1)

1385
[((u'numpy', u'numpy/matrixlib/__init__.py', 4), '__future__')]


In [45]:
# set function_map
functions = lines_to_functions\
    .map(lambda (line, function): function)\
    .distinct()

function_map = {k: v for v, k in enumerate(functions.collect())} 
print function_map

{'chebyshev': 108, 'enumerate': 270, 'savez': 153, 'six': 123, 'func': 219, 'distutils': 195, 'arrayterator': 88, 'walk': 225, 'assert_raises': 230, 'random': 210, 'try_compile': 172, 'unixccompiler': 39, '__future__': 162, 'fftpack': 71, 'defmatrix': 20, 'cutdeg': 57, 'isinstance': 191, 'arange': 300, 'save_hashes': 206, 'cython': 74, 'copy': 291, 'group': 12, 'struct': 163, 'fit': 232, 'type_check': 18, 'subprocess': 289, 'fromroots': 157, 'hstack': 294, 'load_hashes': 134, 'add': 175, 'dict': 8, 'runmodule': 220, 'load_cache': 122, 'main': 150, 'match': 183, 'numpydoctest': 310, 'info': 231, 'legendre': 50, 'get': 63, 'read': 51, 'copy_file': 21, 'end': 303, 'repr': 62, 'util': 138, 'tile': 72, 'process_pyx': 69, 'ones': 166, 'function_base': 148, 're': 246, 'sorted': 128, 'run_module_suite': 28, 'floatint': 209, 'common': 106, 'assert_almost_equal': 306, 'as_series': 263, 'search': 218, 'numpyversion': 215, 'extint_mul_64_64': 268, 'zeros': 282, 'makesuite': 144, 'list': 280, 'inte

# Set authorid_functionid for UV

No need to implement this section. It is already implemented in git_vectorize.py.

In [51]:
# get author_map
authors = codeLinesDF.map(
    lambda (
        author,
        author_mail,
        author_time,
        author_timezone,
        comment,
        commit_id,
        committer,
        committer_mail,
        committer_time,
        committer_timezone,
        filename,
        line,
        line_num,
        repo_name,
    ):
    (author, author_mail)
).distinct()

author_map = {k: v for v, k in enumerate(authors.collect())}

In [57]:
# get lines_to_authorid
lines_to_authorid = codeLinesDF.map(
    lambda (
        author,
        author_mail,
        author_time,
        author_timezone,
        comment,
        commit_id,
        committer,
        committer_mail,
        committer_time,
        committer_timezone,
        filename,
        line,
        line_num,
        repo_name,
    ):
    ((repo_name, filename, line_num), author_map[(author, author_mail)])
)

In [58]:
# get lines_to_functions
# already implemented in the previous section

In [59]:
joined_authors_and_functions = lines_to_authorid.join(lines_to_functions)
authorid_functionid = joined_authors_and_functions.map(
        lambda (line, (author_id, function)): (author_id, function_map[function])
)

In [60]:
print authorid_functionid.count()
print authorid_functionid.take(1)

1385
[(158, 210)]


# Get UV & CV

## 1. Get UV & CV Manually

In [31]:
# zip up source_dir located in GitHub remote_url's remote_branch and add it to Spark's source context
remote_url = "https://github.com/lab41/hermes.git"
remote_branch = "master"
source_dir = "src"
debug = True

# helper functions
import os
import functools

def _list_all_in_dir(dir_path):
    for path, subdirs, files in os.walk(dir_path):
        for filename in files:
            print os.path.join(path, filename)
            
def _zip_dir(srcdir_path, zipfile_handler):
    try:
        zipfile_handler.writepy(srcdir_path)
    finally:
        zipfile_handler.close()
            
def trackcalls(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        wrapper.has_been_called = True
        return func(*args, **kwargs)
    wrapper.has_been_called = False
    return wrapper

@trackcalls
def _add_zipfile_to_sc(zipfile_path):
    sc.addPyFile(zipfile_path) 
    
import git
import os
import tempfile
import shutil
import zipfile    

# create a temporary directory
tmpdir_path = tempfile.mkdtemp()
if debug: print "temporary directory: %s\n" % tmpdir_path

# ensure file is read/write by creator only
saved_umask = os.umask(0077)

# create a zipfile handler to zip the necessary files
ziptmpdir_path = tempfile.mkdtemp()
if debug: print "temporary directory for zip file: %s\n" % ziptmpdir_path
zipfile_path = ziptmpdir_path + "/hermes_src_2.zip"
if debug: print "zip file's path: %s\n" % zipfile_path
zipfile_handler = zipfile.PyZipFile(zipfile_path, "w")

# make zipfile handler verbose for debugging
zipfile_handler.debug = 3

try:
    # clone "framework" branch from GitHub into temporary directory
    local_branch = git.Repo.clone_from(remote_url, tmpdir_path, branch=remote_branch)
    if debug: print "current branch: %s\n" % local_branch.head.ref
    if debug: print "list all in %s:" % tmpdir_path; _list_all_in_dir(tmpdir_path); print "\n"
        
    # zip "hermes" directory
    if debug: print "zipping: %s\n" % os.path.join(tmpdir_path, source_dir)
    _zip_dir(os.path.join(tmpdir_path, source_dir), zipfile_handler)
    
    # check zip file
    if debug: print "Is zip file %s valid? %s\n" % (zipfile_path, zipfile.is_zipfile(zipfile_path))
    
    # add zip to SparkContext 
    # note: you can only add zip to SparkContext one time
    if not _add_zipfile_to_sc.has_been_called:
        if debug: print "add zip file %s into spark context\n" % zipfile_path
        _add_zipfile_to_sc(zipfile_path)
    else:
        if debug: print "zip file %s is already added into spark context; will not re-add\n" % zipfile_path
    
except IOError as e:
    raise e
else:
    os.remove(zipfile_path)
finally:
    os.umask(saved_umask)
    shutil.rmtree(tmpdir_path)
    shutil.rmtree(ziptmpdir_path)


temporary directory: /tmp/tmpIbkRmP

temporary directory for zip file: /tmp/tmpn2lztO

zip file's path: /tmp/tmpn2lztO/hermes_src_2.zip

current branch: master

list all in /tmp/tmpIbkRmP:
/tmp/tmpIbkRmP/.gitignore
/tmp/tmpIbkRmP/README.md
/tmp/tmpIbkRmP/LICENSE
/tmp/tmpIbkRmP/.git/config
/tmp/tmpIbkRmP/.git/packed-refs
/tmp/tmpIbkRmP/.git/index
/tmp/tmpIbkRmP/.git/description
/tmp/tmpIbkRmP/.git/HEAD
/tmp/tmpIbkRmP/.git/hooks/applypatch-msg.sample
/tmp/tmpIbkRmP/.git/hooks/pre-rebase.sample
/tmp/tmpIbkRmP/.git/hooks/update.sample
/tmp/tmpIbkRmP/.git/hooks/post-commit.sample
/tmp/tmpIbkRmP/.git/hooks/commit-msg.sample
/tmp/tmpIbkRmP/.git/hooks/prepare-commit-msg.sample
/tmp/tmpIbkRmP/.git/hooks/post-update.sample
/tmp/tmpIbkRmP/.git/hooks/pre-applypatch.sample
/tmp/tmpIbkRmP/.git/hooks/post-receive.sample
/tmp/tmpIbkRmP/.git/hooks/pre-commit.sample
/tmp/tmpIbkRmP/.git/refs/remotes/origin/HEAD
/tmp/tmpIbkRmP/.git/refs/heads/master
/tmp/tmpIbkRmP/.git/logs/HEAD
/tmp/tmpIbkRmP/.git/logs/r

In [32]:
# import the required modules from Hermes
from src.data_prep import git_vectorize as gv
from src.utils import save_load as sl

In [76]:
# git_vectorize's get_content_vector() for "py2vec"
from copy import deepcopy

my_model = sc.broadcast(deepcopy(model_dict))
my_function_map = sc.broadcast(deepcopy(function_map))

my_functions = sc.parallelize(function_map.keys())
cv1 = my_functions\
    .map(lambda function: (my_function_map.value[function], function_to_vector(function, my_model.value)))\
    .filter(lambda (functionid, vector): vector is not None)\
    .filter(lambda (functionid, vector): vector.any())

In [77]:
# git_vectorize's get_user_vector() for "py2vec"
uv1 = authorid_functionid.distinct().map(lambda (user, item): (user, item, 1))

In [None]:
user_vector = uv1
content_vector = cv1

## 2. Get UV & CV using the git_vectorize()

In [78]:
vectorizer = gv.git_vectorize(codeLinesDF, "any_interact", "py2vec", sc, model=model_dict)
uv2 = vectorizer.get_user_vector()
cv2 = vectorizer.get_content_vector()

In [79]:
user_vector = uv2
content_vector = cv2

# Prediction & Performance Metrics
When obtaining UV & CV manually and without TF-IDF, 
* RMSE: 0.589768396263
* MAE: 0.569945074146
* coverage: 68.1818181818
* predicted coverage: 38.8888888889
* item coverage: 33.8658146965

When obtaining UV & CV using git_vectorize() and without TF-IDF,
* RMSE: 
* MAE: 
* coverage: 
* predicted coverage: 
* item coverage: 

In [80]:
# Split into a test and training set
train_ratings, test_ratings = user_vector.randomSplit(weights=[90, 10], seed=41)
test_ratings.cache()

PythonRDD[299] at RDD at PythonRDD.scala:43

In [81]:
from src.algorithms import content_based
from src.algorithms import performance_metrics
from src.utils import save_load as sl

In [82]:
predicted = content_based.predict(train_ratings, content_vector)
predicted.cache()

PythonRDD[320] at RDD at PythonRDD.scala:43

In [83]:
rmse = performance_metrics.calculate_rmse_using_rdd(test_ratings, predicted)
print rmse

0.568112795906


In [84]:
mae = performance_metrics.calculate_mae_using_rdd(test_ratings, predicted)
print mae

0.535798927772


In [85]:
coverage = performance_metrics.calculate_user_coverage(test_ratings, train_ratings, predicted)
print coverage

72.7272727273


In [86]:
pred_coverage = performance_metrics.calculate_prediction_coverage(test_ratings, predicted)
print pred_coverage

42.3076923077


In [87]:
item_coverage = performance_metrics.calculate_item_coverage(test_ratings, train_ratings, content_vector, predicted)
print item_coverage

34.4155844156


## Playing with AST #2

In [82]:
help(ast.alias)

Help on class alias in module _ast:

class alias(AST)
 |  Method resolution order:
 |      alias
 |      AST
 |      __builtin__.object
 |  
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  _fields = ('name', 'asname')
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from AST:
 |  
 |  __delattr__(...)
 |      x.__delattr__('name') <==> del x.name
 |  
 |  __getattribute__(...)
 |      x.__getattribute__('name') <==> x.name
 |  
 |  __init__(...)
 |      x.__init__(...) initializes x; see help(type(x)) for signature
 |  
 |  __reduce__(...)
 |  
 |  __setattr__(...)
 |      x.__setattr__('name', value) <==> x.name = value
 |  
 |  ------------------------------------

In [14]:
import inspect
import importlib

module = ast.parse(u"import numpy as np\nfrom numpy.testing import (assert_array_equal, assert_raises, assert_allclose,TestCase)\nfrom numpy.lib import pad")
print module.body

import_definitions =  [node for node in module.body if isinstance(node, ast.Import)]
print import_definitions
print "\n"
for each_import in import_definitions:
    a = each_import.names[0]
    print "each_import.names[0]: ", a.name
    print "getdoc: ", inspect.getdoc(a.name)
    
# TODO: convert a.name into an import library so that inspect can get the docs

[<_ast.Import object at 0x7f2b8c103290>, <_ast.ImportFrom object at 0x7f2b8c103090>, <_ast.ImportFrom object at 0x7f2b8c103650>]
[<_ast.Import object at 0x7f2b8c103290>]


each_import.names[0]:  numpy
getdoc:  str(object='') -> string

Return a nice string representation of the object.
If the argument is a string, the return value is the same object.


In [None]:
importfrom_definitions = [node for node in module.body if isinstance(node, ast.ImportFrom)]
print importfrom_definitions
print "\n"
for each_import in importfrom_definitions:
    print each_import, ast.get_docstring(each_import)


In [66]:
import ast, numpy
module = ast.parse(teststr)
print "\nmodule: ", module
print "\nmodule.body: ", module.body

function_definitions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
print "\nfunction_definitions: ", function_definitions
print "\nfunction_definitions: ", [f.name for f in function_definitions]

print "\n"
for f in function_definitions:
    print f.name, ast.get_docstring(f)
    
class_definitions = [node for node in module.body if isinstance(node, ast.ClassDef)]
print "\nclass_definitions: ", class_definitions
print "\nclass_definitions: ", [each_class.name for each_class in class_definitions]

print "\n"
for each_class in class_definitions:
    print each_class.name, ast.get_docstring(each_class)

print "\n"
for each_class in class_definitions:
    for fn in each_class.body:
        if isinstance(fn, ast.FunctionDef):
            print fn.name, ast.get_docstring(fn)


module:  <_ast.Module object at 0x7f97946bd6d0>

module.body:  [<_ast.Expr object at 0x7f97946bd5d0>, <_ast.ImportFrom object at 0x7f97a42b3410>, <_ast.Import object at 0x7f97946bd4d0>, <_ast.ImportFrom object at 0x7f97946bd310>, <_ast.ImportFrom object at 0x7f97946bd7d0>, <_ast.ClassDef object at 0x7f97946bd8d0>, <_ast.ClassDef object at 0x7f97942c0250>, <_ast.ClassDef object at 0x7f9794518610>, <_ast.ClassDef object at 0x7f97940aa890>, <_ast.ClassDef object at 0x7f9794248710>, <_ast.ClassDef object at 0x7f979431c790>, <_ast.ClassDef object at 0x7f979444ab10>, <_ast.ClassDef object at 0x7f979441de10>, <_ast.ClassDef object at 0x7f97943af910>, <_ast.ClassDef object at 0x7f97943bb8d0>, <_ast.ClassDef object at 0x7f97943c0410>, <_ast.ClassDef object at 0x7f97943c7f50>, <_ast.ClassDef object at 0x7f97943d2fd0>, <_ast.ClassDef object at 0x7f97943d8a90>, <_ast.ClassDef object at 0x7f97943e4b10>, <_ast.ClassDef object at 0x7f97943ea610>, <_ast.ClassDef object at 0x7f97943379d0>, <_ast.If ob

## Playing with AST #1

In [69]:
help(ast.Import)

Help on class Import in module _ast:

class Import(stmt)
 |  Method resolution order:
 |      Import
 |      stmt
 |      AST
 |      __builtin__.object
 |  
 |  Data and other attributes defined here:
 |  
 |  _fields = ('names',)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from stmt:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from stmt:
 |  
 |  _attributes = ('lineno', 'col_offset')
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from AST:
 |  
 |  __delattr__(...)
 |      x.__delattr__('name') <==> del x.name
 |  
 |  __getattribute__(...)
 |      x.__getattribute__('name') <==> x.name
 |  
 |  __init__(...)
 |      x.__init__(...) initi

In [136]:
import ast
mod = ast.parse("a = 3 * (b + c)")
print "mod: ", mod
print "mod.body: ", mod.body
assignment = mod.body[0]
print "assignment: ", assignment
print "assignment.targets: ", assignment.targets
print "assignment.value: ", assignment.value

 mod:  <_ast.Module object at 0x7fde77c64e50>
mod.body:  [<_ast.Assign object at 0x7fde77c69490>]
assignment:  <_ast.Assign object at 0x7fde77c69490>
assignment.targets:  [<_ast.Name object at 0x7fde77c690d0>]
assignment.value:  <_ast.BinOp object at 0x7fde77c69410>


In [29]:
"""
[((u'numpy', u'setup.py'), (30, u'    import builtins')),
 ((u'numpy', u'setup.py'), (93, u"if os.path.exists('MANIFEST'):")),
 ((u'numpy', u'setup.py'),
  (156, u'    config.set_options(ignore_setup_xxx_py=True,')),
 ((u'numpy', u'setup.py'), (219, u'')),
 ((u'numpy', u'setup.py'), (282, u'            instead:')),
 ((u'numpy', u'setup.py'), (345, u'        description = DOCLINES[0],')),
 ((u'numpy', u'runtests.py'), (22, u'')),
 ((u'numpy', u'runtests.py'),
  (85, u'                              "HTML output goes to build/lcov/"))')),
 ((u'numpy', u'runtests.py'),
  (148, u'        import warnings; warnings.filterwarnings("always")')),
 ((u'numpy', u'runtests.py'),
  (211, u'                commit_a, commit_b = commits'))]
"""

import ast
ast.__file__
import os
ast_filename = os.path.splitext(ast.__file__)[0] + ".py"

with open("/home/cdh_data/anaconda/lib/python2.7/ast.py") as fd:
    print fd
    file_contents = fd.read()
    print file_contents

print ""




<open file '/home/cdh_data/anaconda/lib/python2.7/ast.py', mode 'r' at 0x7f979472d390>
# -*- coding: utf-8 -*-
"""
    ast
    ~~~

    The `ast` module helps Python applications to process trees of the Python
    abstract syntax grammar.  The abstract syntax itself might change with
    each Python release; this module helps to find out programmatically what
    the current grammar looks like and allows modifications of it.

    An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as
    a flag to the `compile()` builtin function or by using the `parse()`
    function from this module.  The result will be a tree of objects whose
    classes all inherit from `ast.AST`.

    A modified abstract syntax tree can be compiled into a Python code object
    using the built-in `compile()` function.

    Additionally various helper functions are provided that make working with
    the trees simpler.  The main intention of the helper functions and this
    module in general is

In [141]:
module = ast.parse(file_contents)
function_definitions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
function_definitions

[<_ast.FunctionDef at 0x7fde7cbe0510>,
 <_ast.FunctionDef at 0x7fde7cbe0910>,
 <_ast.FunctionDef at 0x7fde7cc009d0>,
 <_ast.FunctionDef at 0x7fde7cc06c10>,
 <_ast.FunctionDef at 0x7fde7cc38550>,
 <_ast.FunctionDef at 0x7fde7cc19650>,
 <_ast.FunctionDef at 0x7fde7cc19d90>,
 <_ast.FunctionDef at 0x7fde7cc35310>,
 <_ast.FunctionDef at 0x7fde7cc35c10>,
 <_ast.FunctionDef at 0x7fde7cc3bd50>]

In [142]:
[f.name for f in function_definitions]

['parse',
 'literal_eval',
 'dump',
 'copy_location',
 'fix_missing_locations',
 'increment_lineno',
 'iter_fields',
 'iter_child_nodes',
 'get_docstring',
 'walk']

In [143]:
for f in function_definitions:
    print f.name, ast.get_docstring(f)

parse Parse the source into an AST node.
Equivalent to compile(source, filename, mode, PyCF_ONLY_AST).
literal_eval Safely evaluate an expression node or a string containing a Python
expression.  The string or node provided may only consist of the following
Python literal structures: strings, numbers, tuples, lists, dicts, booleans,
and None.
dump Return a formatted dump of the tree in *node*.  This is mainly useful for
debugging purposes.  The returned string will show the names and the values
for fields.  This makes the code impossible to evaluate, so if evaluation is
wanted *annotate_fields* must be set to False.  Attributes such as line
numbers and column offsets are not dumped by default.  If this is wanted,
*include_attributes* can be set to True.
copy_location Copy source location (`lineno` and `col_offset` attributes) from
*old_node* to *new_node* if possible, and return *new_node*.
fix_missing_locations When you compile a node tree with compile(), the compiler expects lineno a

In [154]:
class_definitions = [node for node in module.body if isinstance(node, ast.ClassDef)]
print class_definitions
print type(class_definitions)
print [each_class.name for each_class in class_definitions]

[<_ast.ClassDef object at 0x7fde7cc22510>, <_ast.ClassDef object at 0x7fde7cc026d0>]
<type 'list'>
['NodeVisitor', 'NodeTransformer']


In [160]:
for each_class in class_definitions:
    print each_class.name, ast.get_docstring(each_class)

NodeVisitor A node visitor base class that walks the abstract syntax tree and calls a
visitor function for every node found.  This function may return a value
which is forwarded by the `visit` method.

This class is meant to be subclassed, with the subclass adding visitor
methods.

Per default the visitor functions for the nodes are ``'visit_'`` +
class name of the node.  So a `TryFinally` node visit function would
be `visit_TryFinally`.  This behavior can be changed by overriding
the `visit` method.  If no visitor function exists for a node
(return value `None`) the `generic_visit` visitor is used instead.

Don't use the `NodeVisitor` if you want to apply changes to nodes during
traversing.  For this a special visitor exists (`NodeTransformer`) that
allows modifications.
NodeTransformer A :class:`NodeVisitor` subclass that walks the abstract syntax tree and
allows modification of nodes.

The `NodeTransformer` will walk the AST and use the return value of the
visitor methods to replace

In [162]:
for each_class in class_definitions:
    for fn in each_class.body:
        if isinstance(fn, ast.FunctionDef):
            print fn.name, ast.get_docstring(fn)

 visit Visit a node.
generic_visit Called if no explicit visitor function exists for a node.
generic_visit None


## Examples with numpy

In [37]:
# docstring of np
np.__doc__

'\nNumPy\n=====\n\nProvides\n  1. An array object of arbitrary homogeneous items\n  2. Fast mathematical operations over arrays\n  3. Linear Algebra, Fourier Transforms, Random Number Generation\n\nHow to use the documentation\n----------------------------\nDocumentation is available in two forms: docstrings provided\nwith the code, and a loose standing reference guide, available from\n`the NumPy homepage <http://www.scipy.org>`_.\n\nWe recommend exploring the docstrings using\n`IPython <http://ipython.scipy.org>`_, an advanced Python shell with\nTAB-completion and introspection capabilities.  See below for further\ninstructions.\n\nThe docstring examples assume that `numpy` has been imported as `np`::\n\n  >>> import numpy as np\n\nCode snippets are indicated by three greater-than signs::\n\n  >>> x = 42\n  >>> x = x + 1\n\nUse the built-in ``help`` function to view a function\'s docstring::\n\n  >>> help(np.sort)\n  ... # doctest: +SKIP\n\nFor some objects, ``np.info(obj)`` may provi

In [25]:
# docstring of np
help(np)

Help on package numpy:

NAME
    numpy

FILE
    /home/cdh_data/anaconda/lib/python2.7/site-packages/numpy/__init__.py

DESCRIPTION
    NumPy
    =====
    
    Provides
      1. An array object of arbitrary homogeneous items
      2. Fast mathematical operations over arrays
      3. Linear Algebra, Fourier Transforms, Random Number Generation
    
    How to use the documentation
    ----------------------------
    Documentation is available in two forms: docstrings provided
    with the code, and a loose standing reference guide, available from
    `the NumPy homepage <http://www.scipy.org>`_.
    
    We recommend exploring the docstrings using
    `IPython <http://ipython.scipy.org>`_, an advanced Python shell with
    TAB-completion and introspection capabilities.  See below for further
    instructions.
    
    The docstring examples assume that `numpy` has been imported as `np`::
    
      >>> import numpy as np
    
    Code snippets are indicated by three greater-than signs

In [39]:
# list constants and functions of np
dir(np)

['ALLOW_THREADS',
 'BUFSIZE',
 'CLIP',
 'DataSource',
 'ERR_CALL',
 'ERR_DEFAULT',
 'ERR_IGNORE',
 'ERR_LOG',
 'ERR_PRINT',
 'ERR_RAISE',
 'ERR_WARN',
 'FLOATING_POINT_SUPPORT',
 'FPE_DIVIDEBYZERO',
 'FPE_INVALID',
 'FPE_OVERFLOW',
 'FPE_UNDERFLOW',
 'False_',
 'Inf',
 'Infinity',
 'MAXDIMS',
 'MachAr',
 'NAN',
 'NINF',
 'NZERO',
 'NaN',
 'PINF',
 'PZERO',
 'PackageLoader',
 'RAISE',
 'SHIFT_DIVIDEBYZERO',
 'SHIFT_INVALID',
 'SHIFT_OVERFLOW',
 'SHIFT_UNDERFLOW',
 'ScalarType',
 'Tester',
 'True_',
 'UFUNC_BUFSIZE_DEFAULT',
 'UFUNC_PYVALS_NAME',
 'WRAP',
 '__NUMPY_SETUP__',
 '__all__',
 '__builtins__',
 '__config__',
 '__doc__',
 '__file__',
 '__git_revision__',
 '__name__',
 '__package__',
 '__path__',
 '__version__',
 '_import_tools',
 '_mat',
 'abs',
 'absolute',
 'absolute_import',
 'add',
 'add_docstring',
 'add_newdoc',
 'add_newdoc_ufunc',
 'add_newdocs',
 'alen',
 'all',
 'allclose',
 'alltrue',
 'alterdot',
 'amax',
 'amin',
 'angle',
 'any',
 'append',
 'apply_along_axis',
 'a

In [32]:
np.lib.arraysetops.__doc__

'\nSet operations for 1D numeric arrays based on sorting.\n\n:Contains:\n  ediff1d,\n  unique,\n  intersect1d,\n  setxor1d,\n  in1d,\n  union1d,\n  setdiff1d\n\n:Notes:\n\nFor floating point arrays, inaccurate results may appear due to usual round-off\nand floating point comparison issues.\n\nSpeed could be gained in some operations by an implementation of\nsort(), that can provide directly the permutation vectors, avoiding\nthus calls to argsort().\n\nTo do: Optionally return indices analogously to unique for all functions.\n\n:Author: Robert Cimrman\n\n'