diff --git a/docs/source/_images/scalability.png b/docs/source/_images/scalability.png new file mode 100644 index 000000000..ca926a4c1 Binary files /dev/null and b/docs/source/_images/scalability.png differ diff --git a/docs/source/_images/scalability.pptx b/docs/source/_images/scalability.pptx new file mode 100644 index 000000000..34749aa90 Binary files /dev/null and b/docs/source/_images/scalability.pptx differ diff --git a/docs/source/_images/scalability_with_logo.png b/docs/source/_images/scalability_with_logo.png new file mode 100644 index 000000000..27eb387dc Binary files /dev/null and b/docs/source/_images/scalability_with_logo.png differ diff --git a/docs/source/_images/scalability_with_logo.pptx b/docs/source/_images/scalability_with_logo.pptx new file mode 100644 index 000000000..106ca3ff3 Binary files /dev/null and b/docs/source/_images/scalability_with_logo.pptx differ diff --git a/docs/source/_templates/_api_ref.pandas.series_templ.rst b/docs/source/_templates/_api_ref.pandas.series_templ.rst new file mode 100644 index 000000000..6bf9981f2 --- /dev/null +++ b/docs/source/_templates/_api_ref.pandas.series_templ.rst @@ -0,0 +1,492 @@ +.. _api_ref.pandas.series: +.. include:: ./../ext_links.txt + +Pandas Series +============= +.. currentmodule:: pandas + +This is basic `Pandas*`_ data structure representing a dataframe column. In `NumPy*`_ terms this is +one-dimensional ndarray with axis labels. + +Constructor +----------- + +.. sdc_toctree + Series + +Attributes/Operators +-------------------- + +.. sdc_toctree + Series.index + Series.array + Series.values + Series.dtype + Series.shape + Series.nbytes + Series.ndim + Series.size + Series.T + Series.memory_usage + Series.hasnans + Series.empty + Series.dtypes + Series.name + Series.put + +Type Conversions +---------------- + +.. 
sdc_toctree + Series.astype + Series.infer_objects + Series.copy + Series.bool + Series.to_numpy + Series.to_period + Series.to_timestamp + Series.to_list + Series.get_values + Series.__array__ + +Indexing and Iteration +---------------------- + +.. sdc_toctree + Series.get + Series.at + Series.iat + Series.loc + Series.iloc + Series.__iter__ + Series.items + Series.iteritems + Series.keys + Series.pop + Series.item + Series.xs + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +------------------------- + +.. sdc_toctree + Series.add + Series.sub + Series.mul + Series.div + Series.truediv + Series.floordiv + Series.mod + Series.pow + Series.radd + Series.rsub + Series.rmul + Series.rdiv + Series.rtruediv + Series.rfloordiv + Series.rmod + Series.rpow + Series.combine + Series.combine_first + Series.round + Series.lt + Series.gt + Series.le + Series.ge + Series.ne + Series.eq + Series.product + Series.dot + +User-Defined Functions, GroupBy, Window +--------------------------------------- + +.. sdc_toctree + Series.apply + Series.agg + Series.aggregate + Series.transform + Series.map + Series.groupby + Series.rolling + Series.expanding + Series.ewm + Series.pipe + +.. _api_ref.pandas.series.stats: + +Computations, Descriptive Statistics +------------------------------------ + +.. 
sdc_toctree + Series.abs + Series.all + Series.any + Series.autocorr + Series.between + Series.clip + Series.corr + Series.count + Series.cov + Series.cummax + Series.cummin + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.factorize + Series.kurt + Series.mad + Series.max + Series.mean + Series.median + Series.min + Series.mode + Series.nlargest + Series.nsmallest + Series.pct_change + Series.prod + Series.quantile + Series.rank + Series.sem + Series.skew + Series.std + Series.sum + Series.var + Series.kurtosis + Series.unique + Series.nunique + Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing + Series.value_counts + Series.compound + +Re-Indexing, Selection, Label Manipulation +------------------------------------------ + +.. sdc_toctree + Series.align + Series.drop + Series.droplevel + Series.drop_duplicates + Series.duplicated + Series.equals + Series.first + Series.head + Series.idxmax + Series.idxmin + Series.isin + Series.last + Series.reindex + Series.reindex_like + Series.rename + Series.rename_axis + Series.reset_index + Series.sample + Series.set_axis + Series.take + Series.tail + Series.truncate + Series.where + Series.mask + Series.add_prefix + Series.add_suffix + Series.filter + +Missing Data Handling +--------------------- + +.. sdc_toctree + Series.isna + Series.notna + Series.dropna + Series.fillna + Series.interpolate + +Re-Shaping, Sorting +------------------- + +.. sdc_toctree + Series.argsort + Series.argmin + Series.argmax + Series.reorder_levels + Series.sort_values + Series.sort_index + Series.swaplevel + Series.unstack + Series.explode + Series.searchsorted + Series.ravel + Series.repeat + Series.squeeze + Series.view + +Combining, Joining, Merging +----------------------------- + +.. sdc_toctree + Series.append + Series.replace + Series.update + +Time Series +----------- + +.. 
sdc_toctree + Series.asfreq + Series.asof + Series.shift + Series.first_valid_index + Series.last_valid_index + Series.resample + Series.tz_convert + Series.tz_localize + Series.at_time + Series.between_time + Series.tshift + Series.slice_shift + +Accessors +--------- + +Pandas provides dtype-specific methods under various accessors. +These are separate namespaces within :class:`Series` that only apply +to specific data types. + +=========================== ================================= +Data Type Accessor +=========================== ================================= +Datetime, Timedelta, Period :ref:`dt ` +String :ref:`str ` +Categorical :ref:`cat ` +Sparse :ref:`sparse ` +=========================== ================================= + +.. _api_ref.pandas.series.dt: + +Datetimelike properties +~~~~~~~~~~~~~~~~~~~~~~~ + +``Series.dt`` can be used to access the values of the series as +datetimelike and return several properties. +These can be accessed like ``Series.dt.``. + +Datetime properties +^^^^^^^^^^^^^^^^^^^ + +.. sdc_toctree + Series.dt.date + Series.dt.time + Series.dt.timetz + Series.dt.year + Series.dt.month + Series.dt.day + Series.dt.hour + Series.dt.minute + Series.dt.second + Series.dt.microsecond + Series.dt.nanosecond + Series.dt.week + Series.dt.weekofyear + Series.dt.dayofweek + Series.dt.weekday + Series.dt.dayofyear + Series.dt.quarter + Series.dt.is_month_start + Series.dt.is_month_end + Series.dt.is_quarter_start + Series.dt.is_quarter_end + Series.dt.is_year_start + Series.dt.is_year_end + Series.dt.is_leap_year + Series.dt.daysinmonth + Series.dt.days_in_month + Series.dt.tz + Series.dt.freq + +Datetime methods +^^^^^^^^^^^^^^^^ + +.. sdc_toctree + Series.dt.to_period + Series.dt.to_pydatetime + Series.dt.tz_localize + Series.dt.tz_convert + Series.dt.normalize + Series.dt.strftime + Series.dt.round + Series.dt.floor + Series.dt.ceil + Series.dt.month_name + Series.dt.day_name + +Period properties +^^^^^^^^^^^^^^^^^ + +.. 
sdc_toctree + Series.dt.qyear + Series.dt.start_time + Series.dt.end_time + +Timedelta properties +^^^^^^^^^^^^^^^^^^^^ + +.. sdc_toctree + Series.dt.days + Series.dt.seconds + Series.dt.microseconds + Series.dt.nanoseconds + Series.dt.components + +Timedelta methods +^^^^^^^^^^^^^^^^^ + +.. sdc_toctree + Series.dt.to_pytimedelta + Series.dt.total_seconds + +.. _api_ref.pandas.series.str: + +String handling +~~~~~~~~~~~~~~~ + +``Series.str`` can be used to access the values of the series as +strings and apply several methods to it. These can be accessed like +``Series.str.``. + +.. sdc_toctree + Series.str.capitalize + Series.str.casefold + Series.str.cat + Series.str.center + Series.str.contains + Series.str.count + Series.str.decode + Series.str.encode + Series.str.endswith + Series.str.extract + Series.str.extractall + Series.str.find + Series.str.findall + Series.str.get + Series.str.index + Series.str.join + Series.str.len + Series.str.ljust + Series.str.lower + Series.str.lstrip + Series.str.match + Series.str.normalize + Series.str.pad + Series.str.partition + Series.str.repeat + Series.str.replace + Series.str.rfind + Series.str.rindex + Series.str.rjust + Series.str.rpartition + Series.str.rstrip + Series.str.slice + Series.str.slice_replace + Series.str.split + Series.str.rsplit + Series.str.startswith + Series.str.strip + Series.str.swapcase + Series.str.title + Series.str.translate + Series.str.upper + Series.str.wrap + Series.str.zfill + Series.str.isalnum + Series.str.isalpha + Series.str.isdigit + Series.str.isspace + Series.str.islower + Series.str.isupper + Series.str.istitle + Series.str.isnumeric + Series.str.isdecimal + Series.str.get_dummies + +.. _api_ref.pandas.series.cat: + +Categorical Accessor +~~~~~~~~~~~~~~~~~~~~ + +Categorical-dtype specific methods and attributes are available under +the ``Series.cat`` accessor. + +.. 
sdc_toctree + Series.cat.categories + Series.cat.ordered + Series.cat.codes + Series.cat.rename_categories + Series.cat.reorder_categories + Series.cat.add_categories + Series.cat.remove_categories + Series.cat.remove_unused_categories + Series.cat.set_categories + Series.cat.as_ordered + Series.cat.as_unordered + + +.. _api_ref.pandas.series.sparse: + +Sparse Accessor +~~~~~~~~~~~~~~~ + +Sparse-dtype specific methods and attributes are provided under the +``Series.sparse`` accessor. + +.. sdc_toctree + Series.sparse.npoints + Series.sparse.density + Series.sparse.fill_value + Series.sparse.sp_values + Series.sparse.from_coo + Series.sparse.to_coo + +.. _api_ref.pandas.series.metadata: + +Plotting +-------- +``Series.plot`` is both a callable method and a namespace attribute for +specific plotting methods of the form ``Series.plot.``. + +.. sdc_toctree + Series.plot + Series.plot.area + Series.plot.bar + Series.plot.barh + Series.plot.box + Series.plot.density + Series.plot.hist + Series.plot.kde + Series.plot.line + Series.plot.pie + Series.hist + +Serialization, Input-Output, Conversion +--------------------------------------- + +.. sdc_toctree + Series.to_pickle + Series.to_csv + Series.to_dict + Series.to_excel + Series.to_frame + Series.to_xarray + Series.to_hdf + Series.to_sql + Series.to_msgpack + Series.to_json + Series.to_string + Series.to_clipboard + Series.to_latex diff --git a/docs/source/apireference.rst b/docs/source/apireference.rst index c8d7ec81e..1857d152a 100644 --- a/docs/source/apireference.rst +++ b/docs/source/apireference.rst @@ -1,11 +1,9 @@ .. _apireference: API Reference -~~~~~~~~~~~~~ +************* -This page gives an overview of all Pandas APIs supported currently by Intel® Scalable Dataframe Compiler +.. toctree:: + :maxdepth: 2 -.. 
autosummary:: - :toctree: _autosummary - - pandas.Series + Series: Columnar Data Structure <./_api_ref/api_ref.pandas.series.rst> diff --git a/docs/source/buildscripts/__pycache__/__init__.cpython-37.pyc b/docs/source/buildscripts/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 000000000..a2db886b3 Binary files /dev/null and b/docs/source/buildscripts/__pycache__/__init__.cpython-37.pyc differ diff --git a/docs/source/buildscripts/__pycache__/apiref_generator.cpython-37.pyc b/docs/source/buildscripts/__pycache__/apiref_generator.cpython-37.pyc new file mode 100644 index 000000000..ba7861f33 Binary files /dev/null and b/docs/source/buildscripts/__pycache__/apiref_generator.cpython-37.pyc differ diff --git a/docs/source/buildscripts/__pycache__/sdc_doc_utils.cpython-37.pyc b/docs/source/buildscripts/__pycache__/sdc_doc_utils.cpython-37.pyc new file mode 100644 index 000000000..bd04a7c3d Binary files /dev/null and b/docs/source/buildscripts/__pycache__/sdc_doc_utils.cpython-37.pyc differ diff --git a/docs/source/buildscripts/__pycache__/sdc_object_utils.cpython-37.pyc b/docs/source/buildscripts/__pycache__/sdc_object_utils.cpython-37.pyc new file mode 100644 index 000000000..3e3909df5 Binary files /dev/null and b/docs/source/buildscripts/__pycache__/sdc_object_utils.cpython-37.pyc differ diff --git a/docs/source/buildscripts/apiref_generator.py b/docs/source/buildscripts/apiref_generator.py new file mode 100644 index 000000000..892311485 --- /dev/null +++ b/docs/source/buildscripts/apiref_generator.py @@ -0,0 +1,657 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def reformat_remove_unresolved_references(text):
    """
    Removes numeric footnote references left unresolved after the References sections were dropped.

    Every occurrence of the pattern ``[<digits>]_`` (for example ``[1]_`` or ``[12]_``) is deleted from ``text``.
    Intel SDC references do not use this pattern, so nothing else is affected: bracketed fragments without the
    trailing underscore (``[1]``) or with non-numeric content (``[abc]_``) are left untouched.

    The substitution is done in a single pass, so a reference that immediately follows another bracket
    (``[0][1]_``) is removed as well.

    :param text: Original text
    :return: Text with all ``[<digits>]_`` references removed
    """
    import re  # Local import keeps this function self-contained

    # At least one digit is required so that an empty ``[]_`` is not mistaken for a footnote reference
    return re.sub(r'\[\d+\]_', '', text)
def reformat_explicit_markup(text):
    """
    Fixes Pandas docstring warning about explicit markup not followed by a blank line.

    The text is processed line by line:

    - ``.. versionchanged``, ``.. versionadded`` and ``.. deprecated`` lines are copied as is and a blank line
      is inserted after them when the following line is not blank. The following line itself is kept.
    - ``.. note`` and ``.. warning`` lines are copied without their leading/trailing whitespace, and a single
      blank line directly after them (if any) is removed.
    - Lines starting with ``.. [`` (reference definitions) are replaced with an empty line.
    - Any other line is copied unchanged.

    Every output line is terminated with a newline character.

    :param text: Original text with warnings
    :return: Modified text that fixes warnings
    """
    needs_blank_after = ('.. versionchanged', '.. versionadded', '.. deprecated')
    needs_no_blank_after = ('.. note', '.. warning')

    lines = text.split('\n')
    out_lines = []
    idx = 0
    while idx < len(lines):
        line = lines[idx]
        stripped = line.strip()
        has_next = idx + 1 < len(lines)

        if stripped.startswith(needs_blank_after):
            out_lines.append(line)
            # ``strip`` must be called: comparing the bound method itself to '' is always unequal
            if has_next and lines[idx + 1].strip() != '':
                out_lines.append('')  # No empty line after explicit markup. Add one
        elif stripped.startswith(needs_no_blank_after):
            out_lines.append(stripped)
            if has_next and lines[idx + 1].strip() == '':
                idx += 1  # Empty line after explicit markup. Skip it
        elif stripped.startswith('.. ['):
            out_lines.append('')  # Remove references
        else:
            out_lines.append(line)

        idx += 1  # Each input line is consumed exactly once

    return ''.join(out_line + '\n' for out_line in out_lines)
def reformat_pandas_params(title, text):
    """
    Re-formats ``text`` written in NumPy style documenting Parameters, Returns, Raises sections into
    explicit field-list style (``:param <name>:``, ``:return:``, ``:raises:``).

    For the Parameters section the algorithm searches for lines of the form ``<name> : <type description>``
    followed by description lines, and emits ``:param <name>:`` followed by the type description and the
    description lines. A line only starts a new parameter when the text before the first `` : `` looks like
    a parameter name (identifiers, optionally prefixed with ``*``/``**`` and separated by commas). Other lines
    containing `` : `` are treated as a continuation of the current description.

    Sections with any other title are returned unchanged.

    :param title: Section title (``Parameters``, ``Returns``, ``Return`` or ``Raises``; surrounding whitespace
        is ignored)
    :param text: Section text
    :return: Reformatted text
    """

    # Internal function. Returns correct markup for :param <name>:, :return:, and :raises:
    def _get_param_text(title, param):
        title = title.strip()
        if title == 'Parameters':
            return ':param ' + param + ':'
        elif title == 'Return' or title == 'Returns':
            return ':return:'
        elif title == 'Raises':
            return ':raises:'

    # Internal function. Tells whether ``name`` looks like a NumPy-style parameter name,
    # e.g. ``axis``, ``min_count`` or ``*args, **kwargs``
    def _is_param_name(name):
        parts = [part.strip().lstrip('*') for part in name.split(',')]
        return all(part.isidentifier() for part in parts)

    # Internal function. Returns correct markup for Parameters section
    def _reformat_parameters(title, text):
        indent = get_indent(text)
        new_text = ''
        param = ''
        description = ''
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            idx = line.find(' : ')
            # ``and`` is required here: ``idx >= 0 & <bool>`` binds as ``idx >= (0 & <bool>)``
            if idx >= 0 and _is_param_name(line[0:idx]):
                # Check if previous parameter existed. If so, need to add it to reformatted text
                if param != '':
                    new_text += _get_param_text(title, param) + '\n' + reindent(description, indent+4) + '\n'

                # Found parameter. Extract the description (can be multi-line)
                param = line[0:idx]
                description = line[idx+3:] + '\n'
            elif param != '':
                # It is continuation of multi-line parameter description
                description += reindent(line, indent+4) + '\n'
            else:
                # This is not the description of parameter. Copy as is
                new_text += reindent(line, indent) + '\n'

        if param != '' and description != '':
            new_text += _get_param_text(title, param) + '\n' + reindent(description, indent+4) + '\n'
        return new_text

    # Internal function. Returns correct markup for Raises section
    def _reformat_raises(title, text):
        indent = get_indent(text)
        new_text = ''
        param = ''
        description = ''
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if param != '':
                # It is continuation of multi-line description
                description += reindent(line, indent + 8) + '\n'
            else:
                # This is the first line of ``raises`` description
                param = _get_param_text(title, '') + '\n' + reindent(line, indent + 4)
                new_text += param + '\n'

        if param != '' and description != '':
            new_text += reindent(description, indent + 8) + '\n'
        return new_text + '\n'

    # Internal function. Returns correct markup for Returns section
    def _reformat_returns(title, text):
        indent = get_indent(text)
        new_text = ''
        param = ''
        description = ''
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if param != '':
                # It is continuation of multi-line description
                description += reindent(line, indent + 4) + '\n'
            else:
                # This is the first line of ``return`` description
                param = _get_param_text(title, '') + ' ' + line
                new_text += reindent(param, indent) + '\n'

        if param != '' and description != '':
            new_text += reindent(description, indent + 4) + '\n'
        return new_text + '\n'

    section = title.strip()
    if section == 'Parameters':
        return _reformat_parameters(title, text)
    elif section == 'Returns' or section == 'Return':
        return _reformat_returns(title, text)
    elif section == 'Raises':
        return _reformat_raises(title, text)
    else:
        return text
add_sdc_sections=True, + unsupported_warning=True, reformat_pandas=True): + """ + Generates documentation for Pandas object obj according to flags. + + For complex objects such as modules and classes the function does not go to sub-objects, + i.e. to class attributes and sub-modules of the module. + + :param pandas_obj: Pandas object for which documentation to be generated. + :param short_doc_flag: Flag to indicate that only short description for the object is needed. + :param doc_from_pandas_flag: Flag to indicate that the documentation must be taken from Pandas docstring. + This docstring can be extended with Intel SDC specific sections. These are See Also, Examples, + Notes, Warning, Limitations, etc. if ``add_sdc_sections`` flag is set. + :param add_sdc_sections: Flag to indicate that extra sections of the documentation need to be taken from Intel SDC. + If ``doc_from_pandas_flag==False`` then the description section is taken from Intel SDC too. Otherwise + Intel SDC description section will be cut and Pandas API description will be used instead. + :param unsupported_warning: Flag, if ``True`` includes warning message if corresponding Intel SDC object is not + found. This indicates that given SDC method is unsupported. + :param reformat_pandas: Flag, if ``True`` re-formats Parameters section to :param: style. Needed to work around + Sphinx generator issues for Pandas Parameters section written in NumPy style + :return: Generated docstring. 
+ """ + + doc = '' + if pandas_obj is None: + return doc # Empty documentation for no-object + + if doc_from_pandas_flag: # Check if documentation needs to be generated from Pandas docstring + if short_doc_flag: # Check if only short description is needed + doc = get_short_description(pandas_obj) # Short description is requested + else: + # Exclude Examples, Notes, See Also, References sections + sections = split_in_sections(reindent(get_docstring(pandas_obj), 0)) + while len(sections) > 0: + title, text = sections[0] + if title.strip() == '': # Description sections + doc += text + '\n\n' + sections.pop(0) + elif title.strip() == 'Examples': # Exclude Examples section + sections.pop(0) + elif title.strip() == 'Notes': # Exclude Notes section (may be too specific to Pandas) + sections.pop(0) + elif title.strip().lower() == 'see also': # Exclude See Also section (may be too specific to Pandas) + sections.pop(0) + elif title.strip() == 'References': # Exclude References section (may be too specific to Pandas) + sections.pop(0) + elif title.strip() == 'Parameters' or title.strip() == 'Raises' or title.strip() == 'Return' or \ + title.strip() == 'Returns': + if reformat_pandas: + doc += reformat_pandas_params(title, text) + sections.pop(0) + else: + doc += create_heading_str(title) + '\n\n' + text + '\n\n' + sections.pop(0) + else: + doc += create_heading_str(title) + '\n\n' + text + '\n\n' + sections.pop(0) + + if not add_sdc_sections: + if reformat_pandas: + return reformat(doc) + else: + return doc + + # Here if additional sections from Intel SDC object needs to be added to pandas_obj docstring + sdc_obj = get_sdc_object(pandas_obj) + if sdc_obj is None: + if unsupported_warning: + if reformat_pandas: + doc = reformat(doc) + + if short_doc_flag: + return doc + ' **Unsupported by Intel SDC**.' + else: + return doc + '\n\n.. 
def open_file_for_write(file_name):
    """
    Opens file ``file_name`` for writing. If necessary, creates file directories on the path.

    :param file_name: Absolute or relative path that includes file name being created.
    :return: Text file object opened in write mode with UTF-8 encoding. The caller is responsible for
        closing it.
    """
    directory = os.path.dirname(file_name)

    if directory:
        # ``exist_ok`` avoids the race between checking for the directory and creating it
        os.makedirs(directory, exist_ok=True)

    return open(file_name, 'w', encoding='utf-8')
def parse_templ_rst(fname_templ):
    """
    Parses input template rst file and writes the final rst files.

    The output file is created under ``APIREF_REL_PATH``. Its name is the template file name with ``_templ``
    and then the first ``_`` removed. Template lines are handled as follows:

    - ``.. currentmodule:: <module>`` lines are copied to the output and set the module name used to qualify
      the API names that follow.
    - A ``.. sdc_toctree`` line is not copied. The non-blank lines after it, up to the first blank line, are
      treated as API names. For each of them a ``:ref:`` entry with the short description is written to the
      output, and a separate ``<module>.<name>.rst`` file with the full description is written under
      ``APIREF_REL_PATH``.
    - All other lines are copied to the output unchanged.

    :param fname_templ: Path to the template rst file
    :raises ValueError: If an API name is listed before any ``.. currentmodule::`` line
    """
    fname_out = os.path.basename(fname_templ).replace('_templ', '').replace('_', '', 1)

    with open(fname_templ, 'r', encoding='utf-8') as fin:
        template_lines = fin.readlines()

    current_module_name = None
    in_api_list = False  # True while reading the API names that follow ``.. sdc_toctree``

    # ``with`` guarantees the output file is closed on every exit path
    with open_file_for_write(APIREF_REL_PATH + fname_out) as fout:
        for raw_line in template_lines:
            api_name = raw_line.strip()

            if in_api_list and api_name != '':
                if current_module_name is None:
                    raise ValueError('`.. sdc_toctree` entry `' + api_name + '` in ' + fname_templ +
                                     ' is not preceded by `.. currentmodule::`')

                indent = get_indent(raw_line)
                full_name = current_module_name + '.' + api_name
                obj = get_obj(full_name)
                short_description = generate_simple_object_doc(obj, short_doc_flag=True).strip()
                fout.write(reindent(':ref:`', indent) + api_name + ' <' + full_name + '>`\n' +
                           reindent(short_description, indent+4) + '\n')

                full_description = generate_simple_object_doc(obj, short_doc_flag=False)
                with open_file_for_write(APIREF_REL_PATH + full_name + '.rst') as fapi:
                    fapi.write('.. _' + full_name + ':\n\n:orphan:\n\n')
                    fapi.write(create_heading_str(full_name, '*') + '\n\n')
                    fapi.write(full_description)
                continue

            # A blank line ends the API list; the blank line itself is copied below
            in_api_list = False

            if raw_line.startswith('.. sdc_toctree'):
                in_api_list = True  # Skipping ``.. sdc_toctree``
                continue

            if raw_line.startswith('.. currentmodule::'):
                current_module_name = raw_line[len('.. currentmodule::'):].strip()
            fout.write(raw_line)
+ attr_name, + short_doc_flag, doc_from_pandas_flag, add_sdc_sections) + + +def generate_api_reference(): + init_pandas_structure() + init_sdc_structure() + init_pandas_sdc_dict() + + parse_templ_rst('./_templates/_api_ref.pandas.series_templ.rst') + + +if __name__ == "__main__": + generate_api_reference() diff --git a/docs/source/buildscripts/module_info.py b/docs/source/buildscripts/module_info.py new file mode 100644 index 000000000..98460b268 --- /dev/null +++ b/docs/source/buildscripts/module_info.py @@ -0,0 +1,443 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *****************************************************************************


from inspect import getmembers, ismodule, isclass, isfunction
import logging
import sys


# -- Debug logging --------------------------------------------------------------------------------------------------
ENABLE_LOGGING = False


# Logging information about attribute parsing
def _attribute_logging(s):
    if ENABLE_LOGGING:
        logging.debug('[ATTRIBUTE]' + s)


# Logging information about method parsing
def _method_logging(s):
    if ENABLE_LOGGING:
        logging.debug('[METHOD]' + s)


# Logging information about function parsing
def _function_logging(s):
    if ENABLE_LOGGING:
        logging.debug('[FUNCTION]' + s)


# Logging information about class parsing
def _class_logging(s):
    if ENABLE_LOGGING:
        logging.debug('[CLASS]' + s)


# Logging information about module parsing
def _module_logging(s):
    if ENABLE_LOGGING:
        logging.debug('[MODULE]' + s)


# -- Returns all classes and respective methods of the module -------------------------------------------------------
def get_submodules_of(module, inspected, module_list, skip_module_test, skip_class_test,
                      skip_method_test, skip_attribute_test, skip_function_test):
    """
    Recursively traverses ``module`` and its submodules and records their functions and classes.

    For every module that is not skipped a dictionary with the keys ``'module_name'``, ``'module_object'``,
    ``'classes'`` and ``'functions'`` is appended to ``module_list``. Each class entry holds ``'class_name'``,
    ``'class_object'``, ``'class_methods'`` and ``'class_attributes'`` (lists of names); each function entry holds
    ``'function_name'`` and ``'function_object'``.

    Names starting with ``_`` (and modules/classes with ``._`` in the full name) are always skipped.

    :param module: module object to be inspected
    :param inspected: set of already traversed module objects; updated in place
    :param module_list: list receiving one dictionary per traversed module; updated in place
    :param skip_module_test: callable ``(mod, mod_name)`` returning True if the module must be skipped
    :param skip_class_test: callable ``(cls, class_name)`` returning True if the class must be skipped
    :param skip_method_test: callable ``(cls, method_name)`` returning True if the method must be skipped
    :param skip_attribute_test: callable ``(cls, attr_name)`` returning True if the attribute must be skipped
    :param skip_function_test: callable ``(function, function_name)`` returning True if the function must be skipped
    """

    # Returns True if the mod module will not be included in API Reference
    def _skip_module(mod):
        mod_name = mod.__name__  # Get new submodule name
        sk_mod = False

        if mod in inspected:  # Ignore already traversed modules
            sk_mod = True
            _module_logging('`' + mod_name + '` already traversed. Ignoring')
            return sk_mod

        if '._' in mod_name or mod_name.startswith('_'):  # Ignore internal module
            sk_mod = True
            _module_logging('`' + mod_name + '` is internal (starts with _). Ignoring')
            return sk_mod

        if skip_module_test(mod, mod_name):
            sk_mod = True
            return sk_mod

        return sk_mod

    # Returns True if the cls class will not be included in API Reference
    def _skip_class(cls):
        sk_class = False
        class_name = repr(cls)[8:-2]  # Get full class name out of ``<class 'full.name'>``

        if '._' in class_name:  # We are interested only in public classes
            sk_class = True
            _class_logging('`' + class_name + '` is internal. Ignoring')
            return sk_class

        if skip_class_test(cls, class_name):
            sk_class = True
            return sk_class

        return sk_class

    # Returns True if the method method_name will not be included in API Reference
    def _skip_method(cls, method_name):
        sk_method = False

        if method_name.startswith('_'):  # Ignore internal methods
            sk_method = True
            _method_logging('`' + method_name + '` is internal (starts with __). Ignoring')
            return sk_method

        if skip_method_test(cls, method_name):
            sk_method = True
            return sk_method

        return sk_method

    # Returns True if the function function_name will not be included in API Reference
    def _skip_function(function, function_name):
        sk_function = False

        if function_name.startswith('_'):  # Ignore internal function
            sk_function = True
            _function_logging('`' + function_name + '` is internal (starts with __). Ignoring')
            return sk_function

        if skip_function_test(function, function_name):
            sk_function = True
            return sk_function

        return sk_function

    # Returns True if the attribute attr_name will not be included in API Reference
    def _skip_attribute(cls, attr_name):
        sk_attr = False

        if attr_name.startswith('_'):  # Ignore internal attributes
            sk_attr = True
            _attribute_logging('`' + attr_name + '` is internal (starts with __). Ignoring')
            return sk_attr

        if skip_attribute_test(cls, attr_name):
            sk_attr = True
            return sk_attr

        return sk_attr

    # Creates the list of methods for the class
    def _generate_class_methods(cls):
        meths = [func for func in dir(cls) if callable(getattr(cls, func)) and not _skip_method(cls, func)]
        for meth in meths:
            _method_logging('Adding method `' + meth + '` to the list')
        return meths

    # Creates the list of class's attributes
    def _generate_class_attributes(cls):
        attrs = [func for func in dir(cls) if not callable(getattr(cls, func)) and not _skip_attribute(cls, func)]
        for att in attrs:
            _attribute_logging('Adding attribute `' + att + '` to the list')
        return attrs

    # -- get_submodules_of() implementation begins
    if _skip_module(module):
        return

    inspected.add(module)  # Add module to the set of traversed modules
    module_name = module.__name__
    module_list.append({'module_name': module_name, 'module_object': module, 'classes': []})

    _module_logging('********************** Inspecting module `' + module_name + '`')

    class_list = []
    module_list[-1]['classes'] = class_list
    function_list = []
    module_list[-1]['functions'] = function_list

    # Traverses the module classes, functions and submodules
    for (name, obj) in getmembers(module):  # Iterate through members of the submodule
        if isclass(obj):  # We are interested in objects, which are classes
            if not _skip_class(obj):
                _class_logging('********************** Inspecting class `' + name + '`')
                methods = _generate_class_methods(obj)  # Inspect methods of the class of interest only
                attributes = _generate_class_attributes(obj)  # Inspect attributes of the class of interest only
                class_list.append({'class_name': name, 'class_object': obj, 'class_methods': methods,
                                   'class_attributes': attributes})

        if isfunction(obj):  # We are interested in objects, which are functions
            if not _skip_function(obj, name):
                function_list.append({'function_name': name, 'function_object': obj})

        if ismodule(obj):
            if not _skip_module(obj):
                get_submodules_of(obj, inspected, module_list, skip_module_test, skip_class_test,
                                  skip_method_test, skip_attribute_test, skip_function_test)

    return


# -- Prints all modules with their functions, classes, methods and attributes ---------------------------------------
def print_modules_classes_methods_attributes(modules):
    """
    Prints the content of the list created by :func:`get_submodules_of`.

    :param modules: list of dictionaries, each characterizing one (sub)module
    """
    for the_module in modules:  # modules is the list, each element represents dictionary characterizing the sub-module
        print(the_module['module_name'])
        print(' FUNCTIONS:')
        for the_function in the_module['functions']:
            print(' - ' + the_function['function_name'])

        print(' CLASSES:')
        for the_class in the_module['classes']:
            print(' - ' + the_class['class_name'])
            print(' METHODS:')
            for the_method in the_class['class_methods']:
                print(' ' + the_method)
            print(' ATTRIBUTES:')
            for the_attribute in the_class['class_attributes']:
                print(' ' + the_attribute)
    return


# -- Trimming docstring --------------------------------------------------------------------------------------------
def trim(docstring):
    # Copyright 2015: Mirantis Inc.
    # All Rights Reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License"); you may
    # not use this file except in compliance with the License. You may obtain
    # a copy of the License at
    #
    # http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
    # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
    # License for the specific language governing permissions and limitations
    # under the License.

    """trim function from PEP-257"""
    if not docstring:
        return ""
    # Convert tabs to spaces (following the normal Python rules)
    # and split into a list of lines:
    lines = docstring.expandtabs().splitlines()
    # Determine minimum indentation (first line doesn't count):
    indent = sys.maxsize
    for line in lines[1:]:
        stripped = line.lstrip()
        if stripped:
            indent = min(indent, len(line) - len(stripped))
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        for line in lines[1:]:
            trimmed.append(line[indent:].rstrip())
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)

    # Current code/unittests expects a line return at
    # end of multiline docstrings
    # workaround expected behavior from unittests
    if "\n" in docstring:
        trimmed.append("")

    # Return a single string:
    return "\n".join(trimmed)


# -- String formatting ----------------------------------------------------------------------------------------------
def reindent(string):
    # Copyright 2015: Mirantis Inc.
    # All Rights Reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License"); you may
    # not use this file except in compliance with the License. You may obtain
    # a copy of the License at
    #
    # http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
    # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
    # License for the specific language governing permissions and limitations
    # under the License.

    # Strips the whole text and then every individual line of it
    return "\n".join(l.strip() for l in string.strip().split("\n"))


# -- These symbols can be used to underline section title -----------------------------------------------------------
UNDERLINE_SYMBOLS = ['~', '#', '@', '^', '*', '-', '_', '+', '=']


# -- Split section into section title and remaining text ------------------------------------------------------------
def split_title(section):
    """
    Splits ``section`` into the title and the remaining text.

    The first line is a title if it is not empty and the second line starts with at least as many
    identical underline symbols (see ``UNDERLINE_SYMBOLS``) as the title has characters. Only sections with
    three or more lines can have a title.

    :param section: string, the section text; one leading newline (if any) is ignored
    :return: tuple ``(title, text)``; ``title`` is ``''`` and ``text`` is the whole section if there is no title
    """
    def _is_section_title(title_line, underscore_line):
        n = len(title_line)
        if n == 0:
            # Empty line cannot be a title. Without this check ``startswith('')`` matches any second line,
            # which was then silently dropped from the text
            return False

        return any(underscore_line.startswith(c * n) for c in UNDERLINE_SYMBOLS)

    if section.startswith('\n'):
        section = section.replace('\n', '', 1)

    lines = section.split('\n', 2)
    # Only sections with number of lines>2 can start with a title
    if len(lines) > 2 and _is_section_title(lines[0].strip(), lines[1].strip()):
        return lines[0], lines[2]

    return '', section


# -- Parse docstring by forming the list of sections, where each section is dictionary with title and text ----------
def split_in_sections(doc, sdc_header_section_flag=False):
    """
    Splits docstring ``doc`` into the list of sections.

    Sections are separated by empty lines. Leading sections without a title (short and long description) are kept
    as separate entries. After the first titled section every paragraph without a title is merged into
    the preceding section.

    :param doc: string, the docstring
    :param sdc_header_section_flag: if True, the topmost section (Pandas API name for SDC API Reference) is always
        kept as a separate entry
    :return: list of dictionaries ``{'title': ..., 'text': ...}``
    """
    sections = doc.split('\n\n')  # Always has at least one element
    titled_sections = []
    pos = 0

    # For SDC API Reference documentation the topmost section gives Pandas API name
    if sdc_header_section_flag:
        title, text = split_title(sections[0])
        titled_sections.append({'title': title, 'text': text})
        pos = 1

    # Special processing for short and long description sections, if any.
    # Header may be the only section, hence the bounds check
    while pos < len(sections):
        title, text = split_title(sections[pos])
        if title.strip() != '':
            break
        titled_sections.append({'title': title, 'text': text})
        pos += 1

    # Other sections. Merge those which are just separated by blank lines.
    # The first of them (if any) is titled, so ``titled_sections[-1]`` always exists when merging
    for section in sections[pos:]:
        title, text = split_title(section)
        if title.strip() == '':
            titled_sections[-1]['text'] += '\n\n' + text
        else:
            titled_sections.append({'title': title, 'text': text})

    return titled_sections


def get_function_doc(func, sdc_header_flag=False):
    """
    Returns the docstring of ``func`` split into sections by :func:`split_in_sections`.

    :param func: object with ``__doc__`` attribute; missing docstring is treated as empty string
    :param sdc_header_flag: passed to :func:`split_in_sections`
    :return: list of dictionaries ``{'title': ..., 'text': ...}``
    """
    doc = func.__doc__

    if doc is None:
        doc = ''

    return split_in_sections(doc, sdc_header_flag)


def get_function_short_description(func, sdc_header_flag=False):
    """
    Returns the short description (first section of the docstring) of ``func`` as a single line.

    :param func: object with ``__doc__`` attribute
    :param sdc_header_flag: if True, the first section of the docstring is ignored
    :return: string; every line of the description is stripped and followed by a space.
        Empty string if there is no section left after ignoring the first one
    """
    titled_sections = get_function_doc(func, sdc_header_flag)
    if sdc_header_flag:  # Ignore the first section
        titled_sections.pop(0)

    if not titled_sections:
        return ''  # Docstring consists of the header only (or is absent)

    short_description = titled_sections[0]['text']

    # Make it single line in case it is multi-line
    return ''.join(s.strip() + ' ' for s in short_description.split('\n'))


def create_header_str(s, underlying_symbol='*'):
    """Returns ``s`` followed by a line of ``underlying_symbol`` of the same length."""
    n = len(s)
    return s + '\n' + underlying_symbol*n


def get_function(func_name, modules):
    """
    Searches for the function func_name in the modules list. Name can or cannot be given fully qualified

    :param func_name: string, the function name being searched
    :param modules: the list of modules created by :func:`get_submodules_of`
    :return: function object or None
    """

    # Check if fully qualified name given
    if '.' in func_name:
        module_name, func_name = func_name.rsplit('.', 1)

        the_module = next((e for e in modules if e['module_name'] == module_name), None)
        if not the_module:
            return None

        return getattr(the_module['module_object'], func_name, None)

    # Short name: the first function with this name wins
    for the_module in modules:
        for func_dict in the_module['functions']:
            if func_name == func_dict['function_name']:
                return func_dict['function_object']

    return None


def get_method_attr(name, modules):
    """
    Searches for the method/attribute name in the modules list. Name is fully qualified

    :param name: string, the method/attribute being searched, in the form ``module.Class.member``
    :param modules: the list of modules created by :func:`get_submodules_of`
    :return: method/attribute object or None if the name is not fully qualified or the module, the class
        or the member is not found
    """
    split_name = name.rsplit('.', 2)
    if len(split_name) < 3:
        return None  # Not of the form ``module.Class.member``

    module_name, class_name, member_name = split_name

    the_module = next((e for e in modules if e['module_name'] == module_name), None)
    if the_module is None:
        return None

    the_class = next((e for e in the_module['classes'] if e['class_name'] == class_name), None)
    if the_class is None:
        return None

    return getattr(the_class['class_object'], member_name, None)
module_info import get_submodules_of, print_modules_classes_methods_attributes, ENABLE_LOGGING # -- Debug logging -------------------------------------------------------------------------------------------------- log_file_name = '../build/pandas_info.log' # -- Submodules, classes, and methods to be excluded from API Reference --------------------------------------------- exclude_modules = [ - 'pandas.core', # This is PRIVATE submodule 'pandas.compat', # This is PRIVATE submodule - 'pandas.util' # This is PRIVATE submodule + 'pandas.util', # This is PRIVATE submodule + 'pandas.api.extensions', # This is extension for library developers extending Pandas. Not current interest to SDC + 'pandas.testing', # Utility functions for testing. Not a priority for SDC + 'pandas.plotting', # Plotting functions. Not a priority for compiling with SDC + 'pandas.errors', # Error handling functionality. Not a priority for SDC + 'pandas.api.types', # Not a priority for SDC + 'pandas.io.formats.style', # Helps to style dataframes with HTML and CSS. Not a priority for SDC + 'pandas.arrays', # Array extensions for Numpy. 
We do not explicitly cover in SDC documentation now + 'pandas.tseries', # SDC does not yet support Time Series objects + 'pandas.core.dtypes.dtypes', ] exclude_classes = [ @@ -50,35 +58,88 @@ exclude_attributes = [ ] +exclude_functions = [ +] + # -- Implements custom skip functions for the parser ---------------------------------------------------------------- def _skip_pandas_module(mod, mod_name): - return mod_name in exclude_modules or not mod_name.startswith('pandas') + for excl_mname in exclude_modules: + if mod_name.startswith(excl_mname): + return True + return not mod_name.startswith('pandas') def _skip_pandas_class(cls, cls_name): return cls_name in exclude_classes -def _skip_pandas_method(method_name): - return method_name in exclude_methods +def _skip_pandas_method(cls, method_name): + # Exclude the method if in the exclude_methods list + if method_name in exclude_methods: # Explicit exclusion of the method + return True + + # Exclude the method without docstring + try: + doc = getattr(cls, method_name).__doc__ + return len(doc) < 1 + except AttributeError: + return True + except TypeError: + return True + + +def _skip_pandas_function(func, function_name): + # Exclude the function if in the exclude_functions list + if function_name in exclude_functions: # Explicit exclusion of the method + return True + + # Exclude the function without docstring + try: + doc = func.__doc__ + return len(doc) < 1 + except AttributeError: + return True + except TypeError: + return True + + +def _skip_pandas_attribute(cls, attr_name): + # Exclude the attribute if in the exclude_methods list + if attr_name in exclude_attributes: # Explicit exclusion of the attribute + return True + + # Exclude the attribute without docstring + try: + doc = getattr(cls, attr_name).__doc__ + return len(doc) < 1 + except AttributeError: + return True + except TypeError: + return True + + +def get_pandas_modules(): + inspected_modules = set() + modules = [] + get_submodules_of(pandas, 
inspected_modules, modules, _skip_pandas_module, _skip_pandas_class, + _skip_pandas_method, _skip_pandas_attribute, _skip_pandas_function) + return modules -def _skip_pandas_attribute(attr_name): - return attr_name in exclude_attributes +def init_pandas_logging(): + if ENABLE_LOGGING: + logging.basicConfig(filename=log_file_name, level=logging.DEBUG) + logging.debug('****************** STARTING THE LOG *************************') + logging.debug(datetime.now().strftime("%d/%m/%Y %H:%M:%S")) if __name__ == "__main__": # Initialize logging - logging.basicConfig(filename=log_file_name, level=logging.DEBUG) - logging.debug('****************** STARTING THE LOG *************************') - logging.debug(datetime.now().strftime("%d/%m/%Y %H:%M:%S")) + init_pandas_logging() # Execute parser for Pandas - inspected_modules = set() - modules = [] - get_submodules_of(pandas, inspected_modules, modules, _skip_pandas_module, _skip_pandas_class, - _skip_pandas_method, _skip_pandas_attribute) + modules = get_pandas_modules() # You may uncomment this line in case you want to print out generated methods and attributes -# print_modules_classes_methods_attributes(modules) + print_modules_classes_methods_attributes(modules) diff --git a/docs/source/sdc2pd_name.py b/docs/source/buildscripts/sdc2pd_name.py similarity index 100% rename from docs/source/sdc2pd_name.py rename to docs/source/buildscripts/sdc2pd_name.py diff --git a/docs/source/buildscripts/sdc_doc_utils.py b/docs/source/buildscripts/sdc_doc_utils.py new file mode 100644 index 000000000..c01a4da9b --- /dev/null +++ b/docs/source/buildscripts/sdc_doc_utils.py @@ -0,0 +1,393 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +UNDERLINE_CHARS = ['-', '`', ':', '~', '^', '_', '*', '+', '#', '<', '>'] # Characters that can underline title + +SDC_USR_GUIDE_HEADING_STR = 'Intel Scalable Dataframe Compiler User Guide' + +SDC_USER_GUIDE_PANDAS_STR = 'Pandas API:' + +SDC_DEV_GUIDE_HEADING_STR = 'Intel Scalable Dataframe Compiler Developer Guide' + + +def get_indent(text): + """ + Returns indentation for a given ``text``. + + :param text: String, can be multi-line. 
Only first non-empty line is used to determine the indentation + :return: Indentation (the number of whitespace characters) + """ + lines = text.split('\n') + while len(lines) > 0 and lines[0] == '': + lines.pop(0) + + if len(lines) == 0: + return 0 # Text was empty, indentation for empty text is 0 + + n_stripped = len(lines[0].lstrip()) # Length of the string after stripping whitespaces on the left + return len(lines[0]) - n_stripped + + +def reindent(old_text, new_indent): + """ + Perform re-indentation of the text ``old_text`` with new indent ``new_indent``. + + :param old_text: Multi-line string for which re-indentation is performed + :param new_indent: New indent + :return: New multi-line text + """ + + if old_text == '': + return ' '*new_indent + + old_indent = get_indent(old_text) + lines = old_text.split('\n') + new_text = '' + for line in lines: + if line.strip() == '': + new_text += '\n' + else: + line = line[old_indent:] + new_text += ' '*new_indent + line + '\n' + + # If ``old_text`` has no ``'\n'`` in the end, remove it too from the ``new_text`` + if old_text[-1] != '\n': + new_text = new_text[:-1] + + return new_text + + +def create_heading_str(title, underlying_symbol='-'): + """ + Creates heading string for a given ``title``. Second line under title is decorated with ``underlying_symbol`` + + Heading is created taking into account of ``title`` indentation. + + :param title: + :param underlying_symbol: + :return: resulting heading string + """ + indent = get_indent(title) + n = len(title.strip()) + return title + '\n' + ' '*indent + underlying_symbol*n + + +def get_docstring(obj): + """ + Returns docstring for a given object or empty string if no-object is provided or there is no docstring for it. 
+ + :param obj: Object for which the docstring to be provided + :return: Docstring + """ + if obj is None: + return '' + + doc = obj.__doc__ + if doc is None: + return '' + else: + return doc + + +def is_section_title(line, underline): + """ + Checks whether line and consecutive underline form valid section title. + + .. note:: + Function expects leading and trailing whitespaces removed for both strings prior to the call. + + :param line: String, title text + :param underline: String, underlying characters + :return: True if line and underline form valid section title + """ + + if line is None: + return False + + if underline is None: + return False + + if line == '': + return False + + if underline == '': + return False + + n = len(line) + for c in UNDERLINE_CHARS: + s = c * n + if underline == s: + return True + + return False + + +def is_sdc_user_guide_header(sdc_header): + """ + Checks whether a given title-text tuple forms valid Intel SDC header for User Guide. + + The header is expected to be 4 lines long, where the first three lines are of the form: + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: + The fourth line must be empty + + :param sdc_header: Tuple (title, text) + :return: True if sdc_header forms valid Intel SDC User Guide docstring header + """ + title, text = sdc_header + return title.strip() == SDC_USR_GUIDE_HEADING_STR and text.strip().startswith(SDC_USER_GUIDE_PANDAS_STR) + + +def is_sdc_dev_guide_header(sdc_header): + """ + Checks whether a given title-text tuple forms valid Intel SDC header for Developer Guide. 
+ + The header is expected to be 3 lines long, where the first two lines are of the form: + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + The third line must be empty + + :param sdc_header: Tuple (title, text) + :return: True if sdc_header forms valid Intel SDC Developer Guide docstring header + """ + title, text = sdc_header + return title.strip() == SDC_DEV_GUIDE_HEADING_STR + + +def extract_pandas_name_from(text): + """ + Extracts Pandas API from ``text``. + + This function is used in conjunction with :func:`split_title`, which returns the tuple (title, text). + The ``title`` must contain valid Intel SDC header. The ``text`` is expected to be in the form + ``Pandas API: *fully qualified Pandas name*`` + + :param text: + :return: Pandas API name as a string + """ + line = text.strip().split('\n', 1)[0] # Pandas API is in the first line. Ignore whitespaces + return line.replace(SDC_USER_GUIDE_PANDAS_STR, '').strip() # Name begins right after ``Pandas API:`` + + +def split_title(section): + """ + Split section into title and remaining text. + + :param section: String, documented section + :return: Tuple (title, text) + """ + + if section is None: + return '', '' + + section = section.lstrip('\n') # Remove leading empty lines + + lines = section.split('\n', 2) + if len(lines) > 1: + # Only sections with number of lines >= 2 can be a title + if is_section_title(lines[0].strip(), lines[1].strip()): + if len(lines) > 2: + return lines[0], lines[2] # First line is title, second is underline, remaining is text + else: + return lines[0], '' # First line is title, second line is underline, but the text is empty string + else: + return '', section # First two lines do not form valid heading + else: + return '', section # When section is less than 3 lines we consider it having no title + + +def _merge_paragraphs_within_section(sections): + """ + Internal utility function that merges paragraphs into a single section. 
+ + This function call is required after initial splitting of the docstring into sections. The initial split + is based on the presence of ``'\n\n'``, which separates sections and paragraphs. The difference between + section and paragraph is that section starts with the title of the form: + + This is title + ------------- + This is the first paragraph. It may be multi-line. + This is the second line of the paragraph. + + This is another multi-line paragraph. + This is the second line of the paragraph. + + Special treatment is required for Intel SDC header section and the following description section. Intel SDC + header section must the the first one in the docstring. It consists of exactly 3 lines: + + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: *pandas_api_fully_qualified_name* + + Right after the Intel SDC header section the description section (if any) goes. It generally consists of two + or more paragraphs. The first paragraph represents short description, which is typically single line. + The following paragraphs provide full description. In rare cases documentation does not have description section, + and this must be treated accordingly. + + + :param sections: List of tuples ``(title, text)``. + :return: Reformatted list of tuples ``(title, text)`, where paragraphs belonging to one section are merged in + single ``text`` item. 
+ """ + if len(sections) == 0: + return sections + + merged_sections = [] + # Check if the very first section is Intel SDC header + section_title, section_text = sections[0] + if is_sdc_user_guide_header((section_title, section_text)): + merged_sections.append(sections[0]) + sections.pop(0) + + # Check if the next section is the short description + section_title, section_text = sections[0] + if section_title.strip() == '': + merged_sections.append(sections[0]) + sections.pop(0) + + if len(sections) == 0: + return merged_sections + + # Merge next sections with empty title into a single section representing full description + section_title, section_text = sections[0] + if section_title.strip() == '': + sections.pop(0) + while len(sections) > 0: + title, text = sections[0] + if title.strip() == '': + section_text += '\n\n' + text + sections.pop(0) + else: + break + merged_sections.append((section_title, section_text)) + + # Now merge paragraphs of remaining titled sections + while len(sections) > 0: + section_title, section_text = sections[0] + sections.pop(0) + while len(sections) > 0: + title, text = sections[0] + if title.strip() == '': + section_text += '\n\n' + text + sections.pop(0) + else: + break + merged_sections.append((section_title, section_text)) + + return merged_sections + + +def split_in_sections(doc): + """ + Splits the doc string into sections + + Each section is separated by empty line. Sections can start with headers or without. Each header follows NumPy + style: + + Section Title + ------------- + + Other permitted characters can be used to underline section title + + :param doc: Docstring to be split into sections + :return: List, sections of the doc. 
Each section is a tuple of strings (title, text) + + :seealso: NumPy style `example + `_ + """ + sections = doc.split('\n\n') # Sections are separated by empty lines + titled_sections = [] + + while len(sections) > 0: + title, text = split_title(sections[0]) + sections.pop(0) + titled_sections.append((title, text)) + + return _merge_paragraphs_within_section(titled_sections) + + +def get_short_description(obj, sdc_header_flag=False): + """ + Returns short description for a given object obj + + :param obj: Object for which short description needs to be returned + :param sdc_header_flag: Flag indicating that the first three lines must be considered as Intel SDC header + :return: String, short description + :raises: NameError, when ``sdc_header_flag==True`` and no Intel SDC header section found. + The header is expected to be 4 lines long, where the first three lines are of the form: + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: + The fourth line must be empty + + """ + doc = get_docstring(obj) + if doc == '': + return doc + + sections = split_in_sections(doc) # tuple (title, text) + + if sdc_header_flag: + if len(sections) > 1: # There must be at least one more section after Intel SDC header section + if not is_sdc_user_guide_header(sections[0]): + raise NameError('No Intel SDC header section found') + + sections.pop(0) # Ignore Intel SDC header section + + if len(sections) == 0: + return '' # Docstring has no sections, i.e. short description is absent + + title, text = sections[0] # Short description is the first section of the docstring + text = text.strip() + lines = text.split('\n') + lines = [line.strip() for line in lines] + lines = ' '.join(lines) + + return lines + + +def cut_sdc_dev_guide(doc): + """ + Removes Intel SDC Developer Guide related sections from the docstring. + + It is assumed that Developer Guide docstring follows the User Guide related sections of the docstring. 
+ Everything after the section titled *Intel Scalable Dataframe Compiler Developer Guide* is cut + + :param doc: Docstring that includes User Guide and the following Developer Guide sections + :return: List of sections, each a tuple (title, text), that precede the Developer Guide header + """ + sections = split_in_sections(doc) # tuple (title, text) + trimmed_sections = [] + + while len(sections) > 0: + if is_sdc_dev_guide_header(sections[0]): + break + trimmed_sections.append(sections[0]) + sections.pop(0) + + return trimmed_sections diff --git a/docs/source/buildscripts/sdc_info.py b/docs/source/buildscripts/sdc_info.py new file mode 100644 index 000000000..cb6c1ba03 --- /dev/null +++ b/docs/source/buildscripts/sdc_info.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import logging +import sdc +from datetime import datetime +from module_info import get_submodules_of, print_modules_classes_methods_attributes, ENABLE_LOGGING, trim +from module_info import get_function, get_function_doc + +# -- String for pattern matching that indicates that the docstring belongs to Intel SDC API Reference --------------- +SDC_USR_GUIDE_HEADING_STR = \ + 'Intel Scalable Dataframe Compiler User Guide********************************************' +SDC_DEV_GUIDE_HEADING_STR = \ + 'Intel Scalablle Dataframe Compiler Developer Guide**************************************************' + +# -- Debug logging -------------------------------------------------------------------------------------------------- +log_file_name = '../build/sdc_info.log' + + +# -- Submodules, classes, and methods to be excluded from API Reference --------------------------------------------- +exclude_modules = [ + 'sdc.chiframes', + 'sdc.compiler', + 'sdc.config', + 'sdc.io.pio', + 'sdc.io.pio_api', + 'sdc.io.pio_lower', + 'sdc.utils', + 'sdc.hstr_ext', + 'sdc.datatypes.common_functions', + 'sdc.datatypes.hpat_pandas_dataframe_pass', + 'sdc.decorators', + 'sdc.dict_ext', + 'sdc.hdict_ext', + 'sdc.distributed', + 'sdc.distributed_api', + 'sdc.transport_seq', + 'sdc.distributed_lower', + 'sdc.hdist', + 'sdc.distributed_analysis', + 'sdc.hdatetime_ext', + 'sdc.hiframes', + 
'sdc.io.csv_ext', + 'sdc.hio', + 'sdc.hiframes.join', + 'sdc.io.parquet_pio', + 'sdc.parquet_cpp', + 'sdc.shuffle_utils', + 'sdc.str_arr_ext', + 'sdc.str_ext', + 'sdc.timsort', +] + +exclude_classes = [ +] + +exclude_methods = [ +] + +exclude_attributes = [ +] + +exclude_functions = [ +] + + +# -- Implements custom skip functions for the parser ---------------------------------------------------------------- +def _skip_sdc_module(mod, mod_name): + return mod_name in exclude_modules or (not mod_name.startswith('sdc') and not mod_name.startswith('hpat')) + + +def _skip_sdc_class(cls, cls_name): + return True # Exclude all classes +# return cls_name in exclude_classes # Explicit exclusion of the class + + +def _skip_sdc_method(cls, method_name): + # Exclude the method if in the exclude_methods list + if method_name in exclude_methods: + return True + + # Exclude the method without docstring + try: + doc = getattr(cls, method_name).__doc__ + if len(doc) < 1: + return True + except AttributeError: + return True + except TypeError: + return True + + # Exclude the method that does not have docstring aimed for API Reference + return not doc.startswith(SDC_USR_GUIDE_HEADING_STR) + + +def _skip_sdc_function(func, function_name): + # Exclude the function if in the exclude_functions list + if function_name in exclude_functions: + return True + + # Exclude the function without docstring + try: + doc = func.__doc__ + if len(doc) < 1: + return True + except AttributeError: + return True + except TypeError: + return True + + # Include the function that has docstring aimed for API Reference + doc = ''.join(trim(doc).splitlines()) + return not doc.startswith(SDC_USR_GUIDE_HEADING_STR) + + +def _skip_sdc_attribute(cls, attr_name): + # Exclude the attribute if in the exclude_attributes list + if attr_name in exclude_attributes: + return True + + # Exclude the attribute without docstring + try: + doc = getattr(cls, attr_name).__doc__ + if len(doc) < 1: + return True + except AttributeError: +
return True + except TypeError: + return True + + # Include the attribute that has docstring aimed for API Reference + doc = ''.join(trim(doc).splitlines()) + return not doc.startswith(SDC_USR_GUIDE_HEADING_STR) + + +def get_sdc_modules(): + inspected_modules = set() + modules = [] + get_submodules_of(sdc, inspected_modules, modules, _skip_sdc_module, _skip_sdc_class, + _skip_sdc_method, _skip_sdc_attribute, _skip_sdc_function) + return modules + + +def init_sdc_logging(): + if ENABLE_LOGGING: + logging.basicConfig(filename=log_file_name, level=logging.DEBUG) + logging.debug('****************** STARTING THE LOG *************************') + logging.debug(datetime.now().strftime("%d/%m/%Y %H:%M:%S")) + + +if __name__ == "__main__": + # Initialize logging + init_sdc_logging() + + # Execute parser for SDC + + # You may uncomment this line in case you want to print out generated methods and attributes + # print_modules_classes_methods_attributes(modules) + modules = get_sdc_modules() + + func = get_function('hpat_pandas_series_at', modules) + if func: + titled_sections = get_function_doc(func) + print(titled_sections) diff --git a/docs/source/buildscripts/sdc_object_utils.py b/docs/source/buildscripts/sdc_object_utils.py new file mode 100644 index 000000000..31e335b21 --- /dev/null +++ b/docs/source/buildscripts/sdc_object_utils.py @@ -0,0 +1,393 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from inspect import getmembers, ismodule, isclass, isfunction +import sys +import pandas +import sdc +from sdc_doc_utils import is_sdc_user_guide_header, get_docstring, split_title, extract_pandas_name_from + +# -- Pandas submodules to be excluded from API Reference --------------------------------------------- +exclude_pandas_submodules = [ + 'pandas.compat', # This is PRIVATE submodule + 'pandas.util', # This is PRIVATE submodule + 'pandas.api.extensions', # This is extension for library developers extending Pandas + 'pandas.testing', # Utility functions for testing. Not a priority for SDC + 'pandas.plotting', # Plotting functions. Not a priority for compiling with SDC + 'pandas.errors', # Error handling functionality. Not a priority for SDC + 'pandas.api.types', # Not a priority for SDC + 'pandas.io.formats.style', # Helps to style dataframes with HTML and CSS. 
Not a priority for SDC + 'pandas.arrays', # Array extensions for Numpy. We do not explicitly cover them in SDC documentation now + 'pandas.tseries', # SDC does not yet support Time Series objects + 'pandas.core.dtypes.dtypes', +] + +# -- Intel SDC submodules to be excluded from API Reference ------------------------------------------- +exclude_sdc_submodules = [ + 'sdc.chiframes', + 'sdc.compiler', + 'sdc.config', + 'sdc.io.pio', + 'sdc.io.pio_api', + 'sdc.io.pio_lower', + 'sdc.utils', + 'sdc.hstr_ext', + 'sdc.datatypes.common_functions', + 'sdc.datatypes.hpat_pandas_dataframe_pass', + 'sdc.decorators', + 'sdc.dict_ext', + 'sdc.hdict_ext', + 'sdc.distributed', + 'sdc.distributed_api', + 'sdc.transport_seq', + 'sdc.distributed_lower', + 'sdc.hdist', + 'sdc.distributed_analysis', + 'sdc.hdatetime_ext', + 'sdc.hiframes', + 'sdc.io.csv_ext', + 'sdc.hio', + 'sdc.hiframes.join', + 'sdc.io.parquet_pio', + 'sdc.parquet_cpp', + 'sdc.shuffle_utils', + 'sdc.str_arr_ext', + 'sdc.str_ext', + 'sdc.timsort', +] + +pandas_modules = dict() # Dictionary of pandas submodules and their classes and functions +sdc_modules = dict() # Dictionary of Intel SDC submodules and their classes and functions +pandas_sdc_dict = dict() # Dictionary {pandas_obj: sdc_obj} that maps Pandas API to respective Intel SDC API + + +def get_sdc_object(pandas_obj): + """ + Returns corresponding Intel SDC object for a given Pandas object pandas_obj. + + :param pandas_obj: Pandas object to be matched with Intel SDC object + :return: Intel SDC object corresponding to pandas_obj, or None if there is no match + """ + if pandas_obj in pandas_sdc_dict: + return pandas_sdc_dict[pandas_obj] + else: + return None # There is no match in Intel SDC to pandas_obj + + +def init_pandas_sdc_dict(): + """ + Initializes global dictionary that performs mapping between Pandas objects and SDC objects.
+ + To function correctly this function must be called after initialization of ``sdc_modules`` and ``pandas_modules`` + dictionaries by :func:`init_sdc_structure` and :func:`init_pandas_structure` functions respectively. + """ + + def _map_sdc_to_pandas(sdc_obj): + if isfunction(sdc_obj): + doc = get_docstring(sdc_obj) + + # The very first section of Intel SDC documentation is expected to start with + # the User Guide header followed by the name of respective Pandas API. + # The following code extracts respective Pandas API + title, text = split_title(doc) + if is_sdc_user_guide_header((title, text)): + pandas_name = extract_pandas_name_from(text) + pandas_obj = get_obj(pandas_name) + pandas_sdc_dict[pandas_obj] = sdc_obj + return False + + global pandas_sdc_dict + pandas_sdc_dict = {} + + traverse(sdc_modules, _map_sdc_to_pandas, True) + + +def get_obj(obj_name): + """ + Retrieves object corresponding to fully qualified name obj_name. + + The fully qualified name starts with the imported module name visible by sys.modules followed by + submodules and then classes and finally by class attributes + :param obj_name: Fully qualified object name string + :return: If found, returns the object corresponding to obj_name.
Otherwise raises exception + :raises AttributeError: If submodule or attribute does not exists + """ + split_name = obj_name.split('.') + split_obj = sys.modules[split_name[0]] + + # Iterate through submodules + while ismodule(split_obj) and len(split_name) > 1: + split_name.pop(0) + not_found = True + for (name, obj) in getmembers(split_obj): # Go through members of split_obj + if split_name[0] == name: + not_found = False + break + + if not_found: + raise AttributeError('Member `' + split_name[0] + '` for `' + obj_name + '` does not exists') + + split_obj = obj + + split_name.pop(0) + for name in split_name: + split_obj = getattr(split_obj, name) + + return split_obj + + +def get_class_methods(cls): + """ + Returns the list of class methods, accessible by both names and as objects. + + Function ignores internal methods starting with ``_``. + + :param cls: The class object + :return: List of class methods, each item is the tuple ``(method_name, method_object)`` + """ + return [(func, getattr(cls, func)) for func in dir(cls) + if callable(getattr(cls, func)) and not func.startswith('_')] + + +def get_class_attributes(cls): + """ + Returns the list of class attributes, accessible by both names and as objects. + + Function ignores internal attributes starting with ``_``. + + :param cls: The class object + :return: List of class attributes, each item is the tuple ``(attribute_name, attribute_object)`` + """ + return [(func, getattr(cls, func)) for func in dir(cls) + if not callable(getattr(cls, func)) and not func.startswith('_')] + + +def get_fully_qualified_name(cls): + """ + Returns fully qualified name of the class. + + :param cls: The class object + :return: String, fully qualified name + """ + return repr(cls)[8:-2] + + +def init_module_structure(module_obj, the_module, inspected, skip_test): + """ + Initializes hierarchical structure ``the_module``. + + :param module_obj: Module object being traversed. 
+ :param the_module: Dictionary ``{'module_obj': module_obj, 'submodules': submodules, + 'classes': classes, 'functions': functions}``. The ``submodules`` is the list of + submodules that belong to ``module_obj``. Each submodule has the same structure as ``the_module``. + The ``classes`` is the list of classes that belong to ``module_obj``. + The ``functions`` is the list of functions that belong to ``module_obj``. + :param inspected: Set of already traversed module objects. This set is needed to avoid circular traversal of + the same module, which may be returned by ``getmembers`` function multiple times. + :param skip_test: Function that takes an object (a module or a module member) as an argument and returns True + if this object needs to be skipped, or False if it needs to be included in the module structure hierarchy. This function is used as + a mechanism to customize the structure of modules, classes, and functions. This in turn reduces the cost of subsequent + structure traversals. + """ + + # Returns True if the mod module needs to be ignored + def _is_skip_module(mod): + mod_name = mod.__name__ + return '._' in mod_name or mod_name.startswith('_') + + # Returns True if the class cls needs to be ignored + def _is_skip_class(cls): + class_name = get_fully_qualified_name(cls) + return '._' in class_name + + # Returns True if the object obj needs to be ignored + def _is_internal(obj): + obj_name = obj.__name__ + return obj_name.startswith('_') + + # ************ The init_module_structure implementation starts here ******************************************* + if _is_skip_module(module_obj) or module_obj in inspected or skip_test(module_obj): + return + + inspected.add(module_obj) + + # Traverse submodules, classes, and functions + submodules = [] + classes = [] + functions = [] + for (name, obj) in getmembers(module_obj): # Iterate through members of the submodule + if skip_test(obj): + continue # Customizable test for skipping objects as needed + + if ismodule(obj) and obj not in inspected and not
_is_skip_module(obj): + the_submodule = dict() + init_module_structure(obj, the_submodule, inspected, skip_test) + submodules.append(the_submodule) + + if isclass(obj) and not _is_skip_class(obj): + classes.append(obj) + + if isfunction(obj) and not _is_internal(obj): + functions.append(obj) + + the_module['module_obj'] = module_obj + the_module['submodules'] = submodules + the_module['classes'] = classes + the_module['functions'] = functions + + +def _print_module(the_module, print_submodules_flag=True): + """ + Recursively prints ``the_module`` content. Internal utility function for debugging purposes + + :param the_module: Dictionary ``{'module_obj': module_obj, 'submodules': submodules, + 'classes': classes, 'functions': functions}``. The ``submodules`` is the list of + submodules that belong to ``module_obj``. Each submodule has the same structure as ``the_module``. + The ``classes`` is the list of classes that belong to ``module_obj``. + The functions is the list of functions that belong ``to module_obj``. + """ + print(the_module['module_obj'].__name__) + + print(' CLASSES:') + for the_class in the_module['classes']: + print(' - ' + the_class.__name__) + + print(' FUNCTIONS:') + for the_func in the_module['functions']: + print(' - ' + the_func.__name__) + + if print_submodules_flag: + print(' SUBMODULES:') + for submodule in the_module['submodules']: + _print_module(submodule, print_submodules_flag) + + +def traverse(the_module, do_action, traverse_submodules_flag=True): + """ + Traverses ``the_module`` and performs action :func:`do_action` on each of the objects of the structure. + + :param the_module: Dictionary ``{'module_obj': module_obj, 'submodules': submodules, + 'classes': classes, 'functions': functions}``. The ``submodules`` is the list of + submodules that belong to ``module_obj``. Each submodule has the same structure as ``the_module``. + The ``classes`` is the list of classes that belong to ``module_obj``. 
+ The functions is the list of functions that belong to ``module_obj``. + :param do_action: Function that takes one parameter ``module_obj`` as input. It returns ``True`` if + traversal needs to be stopped. + :param traverse_submodules_flag: True if function must recursively traverse submodules too + :return: Returns tuple ``(the_module, obj)`` where ``obj`` is the object identified by :func:`do_action` and + ``the_module`` is the corresponding dictionary structure to which the object belongs. It returns ``None`` + if no object has been identified by the :func:`do_action` + """ + if do_action(the_module['module_obj']): + return the_module, the_module['module_obj'] + + # Traverse classes of the_module + for the_class in the_module['classes']: + if do_action(the_class): + return the_module, the_class + + # Traverse functions of the_module + for the_func in the_module['functions']: + if do_action(the_func): + return the_module, the_func + + # Recursively traverse submodules of the_module + if traverse_submodules_flag: + for submodule in the_module['submodules']: + the_tuple = traverse(submodule, do_action, traverse_submodules_flag) + if the_tuple is not None: + return the_tuple + + return None + + +def get_pandas_module_structure(pandas_obj): + """ + Returns corresponding ``the_module`` dictionary structure to which ``pandas_obj`` belongs to. + + This function is typically used in conjunction with :func:`traverse` + + :param pandas_obj: + :return: ``the_module`` dictionary structure + """ + + def _find(obj): + return obj == pandas_obj + + the_module, the_object = traverse(pandas_modules, _find) + return the_module + + +def init_pandas_structure(): + """ + Initializes ``pandas_modules`` global dictionary representing the structure of Pandas. 
+ """ + + # Test that allows to ignore certain Pandas submodules, classes, or attributes + def _skip_pandas_test(obj): + if ismodule(obj): + name = obj.__name__ + for mod_name in exclude_pandas_submodules: + if name.startswith(mod_name): + return True + return not name.startswith('pandas') + + global pandas_modules + pandas_modules = dict() + inspected_mods = set() + init_module_structure(pandas, pandas_modules, inspected_mods, _skip_pandas_test) + + +def init_sdc_structure(): + """ + Initializes ``sdc_modules`` global dictionary representing the structure of Intel SDC. + """ + + # Test that allows to ignore certain Intel SDC submodules, classes, or attributes + def _skip_sdc_test(obj): + if ismodule(obj): + name = obj.__name__ + for mod_name in exclude_sdc_submodules: + if name.startswith(mod_name): + return True + return not name.startswith('sdc') and not name.startswith('hpat') + + global sdc_modules + sdc_modules = dict() + inspected_mods = set() + init_module_structure(sdc, sdc_modules, inspected_mods, _skip_sdc_test) + + +if __name__ == "__main__": + init_pandas_structure() + _print_module(pandas_modules) + + init_sdc_structure() + _print_module(sdc_modules) + + init_pandas_sdc_dict() + print(pandas_sdc_dict) diff --git a/docs/source/buildscripts/user_guide_gen.py b/docs/source/buildscripts/user_guide_gen.py new file mode 100644 index 000000000..312f589e1 --- /dev/null +++ b/docs/source/buildscripts/user_guide_gen.py @@ -0,0 +1,280 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from module_info import get_function, get_method_attr, get_function_doc, get_function_short_description +from module_info import create_header_str +from pandas_info import get_pandas_modules, init_pandas_logging +from sdc_info import get_sdc_modules, init_sdc_logging +from texttable import Texttable +import os + +PANDAS_API_STR = 'Pandas API: ' # This substring prepends Pandas API name in the documentation +APIREF_RELPATH = r'./_api_ref/' # Relative path to API Reference folder +RST_MODULES = { + 'api_reference.rst': ['pandas'], + 'io.rst': ['pandas.io.api', 'pandas.io.clipboards', 'pandas.io.common', 'pandas.io.excel', + 'pandas.io.feather_format', 'pandas.io.formats.console', 'pandas.io.formats.format', + 'pandas.io.formats.printing', 'pandas.io.gbq', 'pandas.io.html', 'pandas.io.json', + 'pandas.io.msgpack', 'pandas.io.msgpack.exceptions', 'pandas.io.packers', 'pandas.io.parquet', + 'pandas.io.parsers', 'pandas.io.pickle', 'pandas.io.pytables', 'pandas.io.sas', + 'pandas.io.sas.sasreader', 'pandas.io.spss', 'pandas.io.sql', 'pandas.io.stata'], + 'series.rst': ['pandas.Series'], + 'dataframe.rst': ['pandas.DataFrame'], + '' + 'general_functions.rst': [], +} + +pandas_modules = [] # List of Pandas submodules along with its functions and classes +sdc_modules = [] # List of Intel SDC submodules along with its functions and classes + + +def generate_module_doc(the_module): + module_doc = None + module_name = the_module['module_name'] + + # First, look up if there is RST file documenting particular module + for rst in RST_MODULES: + for mod in RST_MODULES[rst]: + if mod == module_name: + return module_doc # If there is a documentation for a given module then just return + + # If there is no RST file then we create the documentation based on module's docstring + module_obj = the_module['module_object'] + module_description = get_function_short_description(module_obj).strip() + if 
module_description is None: + module_description = '' + + module_doc = module_description + '\n\nFor details please refer to Pandas API Reference for :py:mod:`' + \ + module_name + '`\n\n' + return module_doc + + +def generate_api_index_for_module(the_module): + module_description = generate_module_doc(the_module) + if module_description is None: + module_description = '' + module_doc = '' + + module_header_flag = False + # Document functions first, if any + tab = Texttable() + for func in the_module['functions']: # Iterate through the module functions + name = func['function_name'] + obj = getattr(the_module['module_object'], name) # Retrieve the function object + description = get_function_short_description(obj).strip() + tab.add_rows([[name, description]], header=False) + + module_name = '' + func_doc = tab.draw() + if func_doc and func_doc != '': # If the function list is not empty then add module name to the document + module_name = the_module['module_name'] + module_doc += create_header_str(module_name, '~') + '\n\n' + module_description + '\n\n' + \ + create_header_str('Functions:', '-') + \ + '\n\n' + func_doc + '\n\n' + module_header_flag = True + + # Document classes + classes_header_flag = False + for the_class in the_module['classes']: # Iterate through the module classes + tab.reset() + class_name = the_class['class_name'] + class_obj = the_class['class_object'] + class_description = class_obj.__doc__ + if not class_description: + class_description = '' + class_doc = '' + class_header_flag = False + + # Document class attributes first, if any + for attr in the_class['class_attributes']: # Iterate through the class attributes + name = attr + obj = getattr(the_class['class_object'], name) # Retrieve the attribute object + description = get_function_short_description(obj).strip() + tab.add_rows([[name, description]], header=False) + + attr_doc = tab.draw() + if attr_doc and attr_doc != '': # If the attribute list is not empty then add class name to the 
document + class_header_flag = True + class_doc += create_header_str(class_name, '^') + '\n\n' + class_description + '\n\n' + \ + create_header_str('Attributes:', '+') + \ + '\n\n' + attr_doc + '\n\n' + + # Document class methods, if any + for method in the_class['class_methods']: # Iterate through the class methods + name = method + obj = getattr(the_class['class_object'], name) # Retrieve the method object + description = get_function_short_description(obj).strip() + tab.add_rows([[name, description]], header=False) + + method_doc = tab.draw() + if method_doc and method_doc != '': # If the method list is not empty then add class name to the document + if not class_header_flag: + class_doc += create_header_str(class_name, '^') + '\n\n' + class_description + '\n\n' + \ + create_header_str('Methods:', '+') + \ + '\n\n' + method_doc + '\n\n' + class_header_flag = True + else: + class_doc += create_header_str('Methods:', '+') + \ + '\n\n' + method_doc + '\n\n' + + if not module_header_flag: # There is no module header yet + if class_header_flag: # There were methods/attributes for the class + module_doc += create_header_str(module_name, '~') + '\n\n' + module_description + '\n\n' + \ + create_header_str('Classes:', '-') + \ + '\n\n' + class_doc + '\n\n' + module_header_flag = True + classes_header_flag = True + else: # The module header has been added + if class_header_flag: # There are new methods/attributes for the class + if not classes_header_flag: # First class of the module description + module_doc += create_header_str('Classes:', '-') + '\n\n' + module_doc += '\n\n' + class_doc + '\n\n' + return module_doc + + +def get_module_rst_fname(the_module): + file_name = the_module['module_name'] + file_name = file_name.replace('.', '/') + file_name = APIREF_RELPATH + file_name + '.rst' + return file_name + + +def generate_api_index(): + doc = '.. _apireference::\n\nAPI Reference\n*************\n\n' \ + '.. 
toctree::\n :maxdepth: 1\n\n' + + for the_module in pandas_modules: # Iterate through pandas_modules + module_doc = generate_api_index_for_module(the_module) + if len(module_doc) > 0: + file_name = get_module_rst_fname(the_module) + write_rst(file_name, module_doc) + doc += ' ' + file_name + '\n' + return doc + + +def generate_sdc_object_doc(sdc_func): + sdc_titled_sections = get_function_doc(sdc_func, True) + sdc_see_also_text = next((sec['text'] for sec in sdc_titled_sections + if sec['title'].lower().strip() == 'see also'), '') + sdc_limitations_text = next((sec['text'] for sec in sdc_titled_sections + if sec['title'].lower().strip() == 'limitations'), '') + sdc_examples_text = next((sec['text'] for sec in sdc_titled_sections + if sec['title'].lower().strip() == 'examples'), '') + + # Get respective Pandas API name + pandas_name = sdc_titled_sections[0]['text'].strip() + pandas_name = pandas_name.replace(PANDAS_API_STR, '') + pandas_name = pandas_name.replace('\n', '') + + # Find respective Pandas API + doc_object = get_method_attr(pandas_name, pandas_modules) + if not doc_object: + doc_object = get_function(pandas_name, pandas_modules) + if not doc_object: + raise NameError('Pandas API:' + pandas_name + 'does not exist') + + # Extract Pandas API docstring as the list of sections + pandas_titled_sections = [] + if doc_object: + pandas_titled_sections = get_function_doc(doc_object, False) + + # Form final docstring which is a combination of Pandas docstring for the description, Parameters section, + # Raises section, Returns section. 
See Also, Limitations and Examples sections (if any) are taken from SDC docstring + short_description_section = pandas_titled_sections[0]['text'] + '\n\n' + pandas_titled_sections.pop(0) + + long_description_section = '' + while pandas_titled_sections[0]['title'] == '': + long_description_section += pandas_titled_sections[0]['text'] + '\n\n' + pandas_titled_sections.pop(0) + + raises_section = parameters_section = returns_section = see_also_section = \ + limitations_section = examples_section = '' + for section in pandas_titled_sections: + title = section['title'].lower().strip() + if title == 'raises': + raises_section = 'Raises\n------\n\n' + section['text'] + '\n\n' + elif title == 'parameters': + parameters_section = 'Parameters\n----------\n\n' + section['text'] + '\n\n' + elif title == 'return' or title == 'returns': + returns_section = 'Returns\n-------\n\n' + section['text'] + '\n\n' + + if sdc_see_also_text: + see_also_section = '\n.. seealso::\n\n' + sdc_see_also_text + '\n\n' + + if sdc_limitations_text: + limitations_section = 'Limitations\n-----------\n\n' + sdc_limitations_text + '\n\n' + + if sdc_examples_text: + examples_section = 'Examples\n-----------\n\n' + sdc_examples_text + '\n\n' + + rst_label = pandas_name.replace('.', '_') + + n = len(pandas_name) + docstring = \ + '.. 
_' + rst_label + ':\n\n' + \ + pandas_name + '\n' + '*'*n + '\n' + \ + short_description_section + \ + long_description_section + \ + parameters_section + \ + returns_section + \ + raises_section + \ + limitations_section + \ + examples_section + \ + see_also_section + + file_name = rst_label + '.rst' + + return file_name, docstring + + +def write_rst(file_name, docstring): + directory = os.path.dirname(file_name) + + if len(directory) > 0 and not os.path.exists(directory): + os.makedirs(directory) + + file = open(file_name, 'w') + file.write(docstring) + file.close() + + +if __name__ == "__main__": + init_pandas_logging() + pandas_modules = get_pandas_modules() + + init_sdc_logging() + sdc_modules = get_sdc_modules() + + for the_module in sdc_modules: + if the_module['module_name'] == 'sdc.datatypes.hpat_pandas_series_functions': + for func in the_module['functions']: + file_name, doc = generate_sdc_object_doc(func['function_object']) + write_rst(APIREF_RELPATH + file_name, doc) + + doc = generate_api_index() + write_rst('apireference.rst', doc) diff --git a/docs/source/compilation.rst b/docs/source/compilation.rst index 8935cb4f0..a3622a343 100644 --- a/docs/source/compilation.rst +++ b/docs/source/compilation.rst @@ -1,8 +1,8 @@ .. _compilation: .. include:: ./ext_links.txt -Compiling With Intel® Scalable Dataframe Compiler -================================================= +Compiling With Intel® SDC +========================= .. todo:: Basic compilation controls. What can be compiled and what cannot. How to work around compilation issues. @@ -85,9 +85,9 @@ should also be deterministic. 
The below example is not supported since the funct Discuss other typical scenarios when Numba or hpat cannot perform type inference Dealing With Integer NaN Values -================================= +------------------------------- -The :py:class:`pandas.Series` are built upon :py:class:`numpy.array`, which does not support +The :py:class:`pandas.Series` are built upon :py:class:`numpy.ndarray`, which does not support ``NaN`` values for integers. For that reason `Pandas*`_ dynamically converts integer columns to floating point ones when ``NaN`` values are needed. Intel SDC can perform such a conversion only if enough information about ``NaN`` values is available at compilation time. When it is impossible the user is responsible for manual @@ -98,7 +98,7 @@ conversion of integer data to floating point data. ``NaN`` cannot be known at compile time and show how it can be worked around Type Inference In I/O Operations -================================= +-------------------------------- If the filename is constant, the Intel SDC may be able to determine file schema at compilation time. It will allow to perform type inference of columns in respective `Pandas*`_ dataframe. diff --git a/docs/source/conf.py b/docs/source/conf.py index a2faa29b9..98ca74306 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -41,24 +41,37 @@ # -- Import sdc package to build API Reference ------------------------------- import os +import sys +import shutil SDC_DOC_NO_API_REF_STR = 'SDC_DOC_NO_API_REF' +SDC_DOC_APIREF_DIR = '_api_ref' +sys.path.insert(0, os.path.relpath('buildscripts')) sdc_doc_no_api_ref = False # Generate API Reference by default if SDC_DOC_NO_API_REF_STR in os.environ: sdc_doc_no_api_ref = os.environ[SDC_DOC_NO_API_REF_STR] == '1' if not sdc_doc_no_api_ref: + if os.path.exists(SDC_DOC_APIREF_DIR): + shutil.rmtree(SDC_DOC_APIREF_DIR) + try: import sdc except ImportError: - print('IMPORT EXCEPTION: Cannot import SDC. 
') - print('Documentation generator for API Reference for a given module expects that module ' - 'to be installed. Use conda/pip install SDC to install it prior to using API Reference generation') - print('If you want to disable API Reference generation, set the environment variable SDC_DOC_NO_API_REF=1') + raise ImportError('Cannot import sdc.\n' + 'Documentation generator for API Reference for a given module expects that module ' + 'to be installed. Use conda/pip install SDC to install it prior to using API Reference ' + 'generation. If you want to disable API Reference generation, set the environment ' + 'variable SDC_DOC_NO_API_REF=1') - raise + try: + from apiref_generator import generate_api_reference + except ImportError: + raise ImportError('Cannot import apiref_generator', os.getcwd()) + + generate_api_reference() # -- Project information ----------------------------------------------------- @@ -77,15 +90,10 @@ # ones. extensions = [ 'sphinx.ext.todo', -# 'sphinx.ext.autosummary', 'sphinx.ext.intersphinx', -# 'sphinx.ext.autodoc', 'sphinx.ext.extlinks', 'sphinx.ext.githubpages', 'sphinx.ext.napoleon', -# 'sphinx.ext.autosectionlabel', -# 'sphinx.ext.graphviz', -# 'sphinx.ext.coverage' ] @@ -130,8 +138,10 @@ # Each entry of the dictionary has the following format: # 'class name': ('link to object.inv file for that class', None) intersphinx_mapping = { - 'pandas.Series': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'numpy.array': ('https://docs.scipy.org/doc/numpy', None), + 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), + 'python': ('http://docs.python.org/2', None), + 'numpy': ('http://docs.scipy.org/doc/numpy', None), + 'scipy': ('http://docs.scipy.org/doc/scipy/reference', None), } # -- Napoleon extension configuration (Numpy and Google docstring options) ------- @@ -147,17 +157,5 @@ napoleon_use_param = True napoleon_use_rtype = True - -# -- Auto-section label configuration 
----------------------------------------------- -#autosectionlabel_prefix_document = True - - -# -- Autodoc configuration ---------------------------------------------------------- -#autodoc_docstring_signature = True - - -# -- Auto-summary configuration ----------------------------------------------------- -#autosummary_generate = True - # -- Prepend module name to an object name or not ----------------------------------- add_module_names = False diff --git a/docs/source/examples.rst b/docs/source/examples.rst index f008a93c5..75e9de504 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -1,7 +1,8 @@ -.. examples: +.. _examples: +.. include:: ./ext_links.txt List of examples ================ .. todo:: - Austoenerate the list of examples from respective docstrings in examples + Auto-generate the list of examples from respective docstrings in examples diff --git a/docs/source/for_developers.rst b/docs/source/for_developers.rst index f2d9bb430..e5675b827 100644 --- a/docs/source/for_developers.rst +++ b/docs/source/for_developers.rst @@ -1,4 +1,5 @@ -.. _developers: +.. _for_developers: +.. include:: ./ext_links.txt Contribute to Intel® Scalable Dataframe Compiler Development ============================================================ diff --git a/docs/source/index.rst b/docs/source/index.rst index 24016075a..fe2b9c0fc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,12 +11,11 @@ This is Intel® Scalable Dataframe Compiler (Intel® SDC), which is an extension `Pandas*`_ operations. It automatically vectorizes and parallelizes the code by leveraging modern hardware instructions and by utilizing all available cores. -.. seealso:: - Intel SDC is just one of technologies that enable native speeds for Python. Learn about other technologies - here. +.. image:: ./_images/scalability.png + :width: 800px + :align: center + :alt: Intel® Scalable Dataframe Compiler scalability -.. 
todo:: - Insert performance chart illustrating speedups vs. stock `Pandas*`_ User Manual =========== diff --git a/docs/source/info.py b/docs/source/info.py deleted file mode 100644 index 76a1ee145..000000000 --- a/docs/source/info.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2015: Mirantis Inc. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import re -import sys - -PARAM_OR_RETURNS_REGEX = re.compile(":(?:param|returns)") -RETURNS_REGEX = re.compile(":returns: (?P.*)", re.S) -PARAM_REGEX = re.compile(":param (?P[\*\w]+): (?P.*?)" - "(?:(?=:param)|(?=:return)|(?=:raises)|\Z)", re.S) - -def trim(docstring): - """trim function from PEP-257""" - if not docstring: - return "" - # Convert tabs to spaces (following the normal Python rules) - # and split into a list of lines: - lines = docstring.expandtabs().splitlines() - # Determine minimum indentation (first line doesn't count): - indent = sys.maxsize - for line in lines[1:]: - stripped = line.lstrip() - if stripped: - indent = min(indent, len(line) - len(stripped)) - # Remove indentation (first line is special): - trimmed = [lines[0].strip()] - if indent < sys.maxsize: - for line in lines[1:]: - trimmed.append(line[indent:].rstrip()) - # Strip off trailing and leading blank lines: - while trimmed and not trimmed[-1]: - trimmed.pop() - while trimmed and not trimmed[0]: - trimmed.pop(0) - - # Current code/unittests expects a line return at - # end of multiline docstrings - # workaround expected 
behavior from unittests - if "\n" in docstring: - trimmed.append("") - - # Return a single string: - return "\n".join(trimmed) - - -def reindent(string): - return "\n".join(l.strip() for l in string.strip().split("\n")) - - -def parse_docstring(docstring): - """Parse the docstring into its components. - :returns: a dictionary of form - { - "short_description": ..., - "long_description": ..., - "params": [{"name": ..., "doc": ...}, ...], - "returns": ... - } - """ - - short_description = long_description = returns = "" - params = [] - - if docstring: - docstring = trim(docstring) - - lines = docstring.split("\n", 1) - short_description = lines[0] - - if len(lines) > 1: - long_description = lines[1].strip() - - params_returns_desc = None - - match = PARAM_OR_RETURNS_REGEX.search(long_description) - if match: - long_desc_end = match.start() - params_returns_desc = long_description[long_desc_end:].strip() - long_description = long_description[:long_desc_end].rstrip() - - if params_returns_desc: - params = [ - {"name": name, "doc": trim(doc)} - for name, doc in PARAM_REGEX.findall(params_returns_desc) - ] - - match = RETURNS_REGEX.search(params_returns_desc) - if match: - returns = reindent(match.group("doc")) - - return { - "short_description": short_description, - "long_description": long_description, - "params": params, - "returns": returns - } - - -class InfoMixin(object): - - @classmethod - def _get_doc(cls): - """Return documentary of class - By default it returns docstring of class, but it can be overridden - for example for cases like merging own docstring with parent - """ - return cls.__doc__ - - @classmethod - def get_info(cls): - doc = parse_docstring(cls._get_doc()) - - return { - "name": cls.get_name(), - "platform": cls.get_platform(), - "platform": cls.get_platform(), - "module": cls.__module__, - "title": doc["short_description"], - "description": doc["long_description"], - "parameters": doc["params"], - "schema": getattr(cls, "CONFIG_SCHEMA", None), - 
"returns": doc["returns"] - } diff --git a/docs/source/module_info.py b/docs/source/module_info.py deleted file mode 100644 index d192ef2c6..000000000 --- a/docs/source/module_info.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2019, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - - -from inspect import getmembers, ismodule, isclass -from info import trim -import logging - - -# -- Debug logging -------------------------------------------------------------------------------------------------- -# Logging information about attribute parsing -def _attribute_logging(s): - logging.debug('[ATTRIBUTE]' + s) - return - - -# Logging information about method parsing -def _method_logging(s): - logging.debug('[METHOD]' + s) - return - - -# Logging information about class parsing -def _class_logging(s): - logging.debug('[CLASS]' + s) - return - - -# Logging information about module parsing -def _module_logging(s): - logging.debug('[MODULE]' + s) - return - - -# -- Returns all classes and respective methods of the module ------------------------------------------------------- -def get_submodules_of(module, inspected, module_list, skip_module_test, skip_class_test, - skip_method_test, skip_attribute_test): - - # Returns True if the mod module will not be included in API Reference - def _skip_module(mod): - mod_name = mod.__name__ # Get new submodule name - sk_mod = False - - if mod in inspected: # Ignore already traversed modules - sk_mod = True - _module_logging('`' + mod_name + '` already traversed. Ignoring') - return sk_mod - - if '._' in mod_name or mod_name.startswith('_'): # Ignore internal module - sk_mod = True - _module_logging('`' + mod_name + '` is internal (starts with _). Ignoring') - return sk_mod - - if skip_module_test(mod, mod_name): - sk_mod = True - return sk_mod - - return sk_mod - - # Returns True if the cls class will not be included in API Reference - def _skip_class(cls): - sk_class = False - class_name = repr(cls)[8:-2] # Get full class name - - if '._' in class_name: # We are interested only in public classes - sk_class = True - _class_logging('`' + class_name + '` is internal. 
Ignoring') - return sk_class - - if skip_class_test(cls, class_name): - sk_class = True - return sk_class - - return sk_class - - # Returns True if the method method_name will not be included in API Reference - def _skip_method(method_name): - sk_method = False - - if method_name.startswith('_'): # Ignore internal methods - sk_method = True - _method_logging('`' + method_name + '` is internal (starts with __). Ignoring') - return sk_method - - if skip_method_test(method_name): - sk_method = True - return sk_method - - return sk_method - - # Returns True if the attribute attr_name will not be included in API Reference - def _skip_attribute(attr_name): - sk_attr = False - - if attr_name.startswith('_'): # Ignore internal methods - sk_attr = True - _attribute_logging('`' + attr_name + '` is internal (starts with __). Ignoring') - return sk_attr - - if skip_attribute_test(attr_name): - sk_attr = True - return sk_attr - - return sk_attr - - # Creates the list of methods for the class - def _generate_class_methods(cls): - meths = [func for func in dir(cls) if callable(getattr(cls, func)) and not _skip_method(func)] - for meth in meths: - _method_logging('Adding method `' + meth + '` to the list') - return meths - - # Creates the list of class's attributes - def _generate_class_attributes(cls): - attrs = [func for func in dir(cls) if not callable(getattr(cls, func)) and not _skip_attribute(func)] - for att in attrs: - _attribute_logging('Adding attribute `' + att + '` to the list') - return attrs - - # -- get_classes_of() implementation begins - if _skip_module(module): - return - - inspected.add(module) # Add module to the set of traversed modules - module_name = module.__name__ - module_list.append({'module_name': module_name, 'module_object': module, 'classes': []}) - - _module_logging('********************** Inspecting module `' + module_name + '`') - - class_list = [] - # Traverses the mod module classes and submodules - for (name, obj) in getmembers(module): # 
Iterate through members of the submodule - if isclass(obj): # We are interested in members, which are classes - if not _skip_class(obj): - _class_logging('********************** Inspecting class `' + name + '`') - methods = _generate_class_methods(obj) # Inspect methods of the class of interest only - attributes = _generate_class_attributes(obj) # Inspect attributes of the class of interest only - class_list.append({'class_name': name, 'class_object': obj, 'class_methods': methods, - 'class_attributes': attributes}) - module_list[-1]['classes'] = class_list - - if ismodule(obj): - if not _skip_module(obj): - get_submodules_of(obj, inspected, module_list, skip_module_test, skip_class_test, - skip_method_test, skip_attribute_test) - - return - - -# -- Returns all classes and respective methods of the module ------------------------------------------------------- -def print_modules_classes_methods_attributes(modules): - for the_module in modules: # modules is the list, each element represents dictionary characterizing the sub-module - print(the_module['module_name']) - for the_class in the_module['classes']: - print('- ' + the_class['class_name']) - print(' METHODS:') - for the_method in the_class['class_methods']: - print(' ' + the_method) - print(' ATTRIBUTES:') - for the_attribute in the_class['class_attributes']: - print(' ' + the_attribute) - return - - -# -- These symbols can be used to underline section title ----------------------------------------------------------- -UNDERLINE_SYMBOLS = ['~', '#', '@', '^', '*', '-', '_', '+', '='] - - -# -- Split section into section title and remaining text ------------------------------------------------------------ -def split_title(section): - def _is_section_title(title_line, underscore_line): - n = len(title_line) - for c in UNDERLINE_SYMBOLS: - s = c * n - if underscore_line.startswith(s): - return True - - return False - - trimmed = trim(section) - lines = trimmed.split('\n', 2) - if len(lines) > 2: - # Only sections 
with number of lines>2 can start with a title - if _is_section_title(lines[0], lines[1]): - return lines[0], lines[2] - else: - return '', section - else: - return '', section - - -# -- Parse docstring by forming the list of sections, where each section is dictionary with title and text ---------- -def parse_docstring(doc): - sections = doc.split('\n\n') - titled_sections = [] - - # The first and the second sections are to be Short and Long description - section = sections[0] - title, text = split_title(section) - titled_sections.append({'title': title, 'text': text}) - - section = sections[1] - title, text = split_title(section) - titled_sections.append({'title': title, 'text': text}) - - # Other sections. Merge those which are just separated by blank lines - for i in range(2, len(sections)): - section = sections[i] - title, text = split_title(section) - if title == '': - titled_sections[-1]['text'] += '\n\n' + text - else: - titled_sections.append({'title': title, 'text': text}) - - return titled_sections - - -# -- Get full documentation for the class cls ----------------------------------------------------------------------- -def get_doc(cls): - obj = cls['class_object'] - doc = parse_docstring(obj.__doc__) - short_description_doc = doc[0]['text'] - long_description_doc = doc[1]['text'] - parameters_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'Parameters']) - returns_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'Returns']) - raises_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'Raises']) - seealso_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'See also']) - notes_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'Notes']) - examples_doc = ''.join([sec['text'] for sec in doc if sec['title'] == 'Examples']) - - return { - "name": cls['class_name'], - "module": obj.__module__, - "title": short_description_doc, - "description": long_description_doc, - "parameters": parameters_doc, - "returns": 
returns_doc, - "raises": raises_doc, - "seealso": seealso_doc, - "notes": notes_doc, - "examples": examples_doc - } diff --git a/docs/source/overview.rst b/docs/source/overview.rst index 28421b42f..34ee0c0a2 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -1,8 +1,8 @@ .. _overview: .. include:: ./ext_links.txt -What is Intel® Scalable Dataframe Compiler? -=========================================== +What is Intel® SDC? +=================== Intel® Scalable Dataframe Compiler (Intel® SDC) is an extension of `Numba*`_ that allows just-in-time and ahead-of-time diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 2edd1f86f..f2e7405d3 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -1,4 +1,5 @@ .. _performance: +.. include:: ./ext_links.txt Getting Performance With Intel® SDC =================================== diff --git a/docs/source/sdc-sphinx-theme/static/sdc.css b/docs/source/sdc-sphinx-theme/static/sdc.css index 4c9476732..6a9123d3a 100644 --- a/docs/source/sdc-sphinx-theme/static/sdc.css +++ b/docs/source/sdc-sphinx-theme/static/sdc.css @@ -11,6 +11,12 @@ @import url("basic.css"); +/* Caption text --------------------------------------------------------------*/ +span-caption-text { + font-weight: bold; +} + +/* Body text -----------------------------------------------------------------*/ body { background-color: #ffffff; margin: 0; @@ -43,11 +49,11 @@ h1,h2,h3,h4,h5,h6 { line-height: 1.5em; } h1 { - font-size: 24px; + font-size: 26px; margin: 0; } h2 { - font-size: 21px; + font-size: 20px; line-height: 1.2em; margin: 1em 0 0.5em 0; border-bottom: 1px solid #0070c5; diff --git a/docs/source/sdc_info.py b/docs/source/sdc_info.py deleted file mode 100644 index 745870aa0..000000000 --- a/docs/source/sdc_info.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2019, Intel Corporation All 
rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - - -import sdc -import logging -from datetime import datetime -from module_info import get_submodules_of, print_modules_classes_methods_attributes, get_doc - -# -- Debug logging -------------------------------------------------------------------------------------------------- -log_file_name = '../build/sdc_info.log' - - -# -- Submodules, classes, and methods to be excluded from API Reference --------------------------------------------- -exclude_modules = [ -] - -exclude_classes = [ -] - -exclude_methods = [ -] - -exclude_attributes = [ -] - - -# -- Implements custom skip functions for the parser ---------------------------------------------------------------- -def _skip_sdc_module(mod, mod_name): - return mod_name in exclude_modules or (not mod_name.startswith('sdc') and not mod_name.startswith('hpat')) - - -def _skip_sdc_class(cls, cls_name): - return cls_name in exclude_classes - - -def _skip_sdc_method(method_name): - return method_name in exclude_methods - - -def _skip_sdc_attribute(attr_name): - return attr_name in exclude_attributes - - -if __name__ == "__main__": - # Initialize logging - logging.basicConfig(filename=log_file_name, level=logging.DEBUG) - logging.debug('****************** STARTING THE LOG *************************') - logging.debug(datetime.now().strftime("%d/%m/%Y %H:%M:%S")) - - # Execute parser for SDC - inspected_modules = set() - modules = [] - get_submodules_of(sdc, inspected_modules, modules, _skip_sdc_module, _skip_sdc_class, - _skip_sdc_method, _skip_sdc_attribute) - - # You may uncomment this line in case you want to print out generated methods and attributes - print_modules_classes_methods_attributes(modules) diff --git a/docs/source/series.rst b/docs/source/series.rst deleted file mode 100644 index 666df3140..000000000 --- a/docs/source/series.rst +++ /dev/null @@ -1,13 +0,0 @@ -Pandas.Series -========================================= -This section 
provides a list of all operations of Pandas Series supported in Intel® SDC. - -.. currentmodule:: API_Doc.hpat_pandas_series_functions - -.. autosummary:: - :toctree:api/ - -.. todo:: - ne - append - iloc \ No newline at end of file diff --git a/docs/source/workflow.rst b/docs/source/workflow.rst index 9c5b95c6a..999d3415c 100644 --- a/docs/source/workflow.rst +++ b/docs/source/workflow.rst @@ -1,7 +1,8 @@ -.. workflow: +.. _workflow: +.. include:: ./ext_links.txt -Typical Intel® SDC workflow -=========================== +Step by step on typical data analysis workflow +============================================== .. todo:: Discuss high-level issues related to typical data analytics workflow, starting from I/O issues to dataframe processing to machine learning. Discuss each of these aspects on some examples.