diff --git a/hpat/_hpat_common.h b/hpat/_hpat_common.h index 376137df9..5e1f93c9b 100644 --- a/hpat/_hpat_common.h +++ b/hpat/_hpat_common.h @@ -17,9 +17,10 @@ // Float32 = 6 // Float64 = 7 - -struct HPAT_CTypes { - enum HPAT_CTypeEnum { +struct HPAT_CTypes +{ + enum HPAT_CTypeEnum + { INT8 = 0, UINT8 = 1, INT32 = 2, diff --git a/hpat/distributed.py b/hpat/distributed.py index a21fdae54..089cefc3a 100644 --- a/hpat/distributed.py +++ b/hpat/distributed.py @@ -149,7 +149,7 @@ def _run_dist_pass(self, blocks): if isinstance(rhs, ir.Expr): out_nodes = self._run_expr(inst, namevar_table) elif isinstance(rhs, ir.Var) and (self._is_1D_arr(rhs.name) - and not is_array_container(self.typemap, rhs.name)): + and not is_array_container(self.typemap, rhs.name)): self._array_starts[lhs] = self._array_starts[rhs.name] self._array_counts[lhs] = self._array_counts[rhs.name] self._array_sizes[lhs] = self._array_sizes[rhs.name] @@ -192,9 +192,9 @@ def _run_dist_pass(self, blocks): # TODO: use Parfor loop blocks when replacing funcs in # parfor loop body inline_closure_call(self.func_ir, rp_func.glbls, - block, len(new_body), rp_func.func, self.typingctx, - rp_func.arg_types, - self.typemap, self.calltypes, work_list) + block, len(new_body), rp_func.func, self.typingctx, + rp_func.arg_types, + self.typemap, self.calltypes, work_list) replaced = True break else: @@ -221,7 +221,7 @@ def _run_expr(self, inst, namevar_table): self._array_sizes[lhs] = self._array_sizes[arr_name] if (rhs.op == 'getattr' and (self._is_1D_arr(rhs.value.name) - or self._is_1D_Var_arr(rhs.value.name)) + or self._is_1D_Var_arr(rhs.value.name)) and rhs.attr == 'size'): return self._run_array_size(inst.target, rhs.value) if (rhs.op == 'static_getitem' @@ -261,7 +261,7 @@ def _run_expr(self, inst, namevar_table): return self._run_getsetitem(rhs.value, index, rhs, inst) if (rhs.op == 'getattr' and (self._is_1D_arr(rhs.value.name) - or self._is_1D_Var_arr(rhs.value.name)) + or self._is_1D_Var_arr(rhs.value.name)) and rhs.attr == 'shape'): # XXX: return a new tuple using sizes here? 
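Most hunks in distributed.py reindent call sites of one recurring IR-splicing idiom: compile a small Python kernel to Numba IR, rewire its argument nodes onto live IR variables, and splice the kernel body (minus the return tail) into the current block. A minimal sketch of that idiom under the same assumptions this file makes (Numba's compile_to_numba_ir/replace_arg_nodes helpers; the kernel _kernel and the method name _splice_kernel are hypothetical, for illustration only):

from numba.ir_utils import compile_to_numba_ir, replace_arg_nodes

def _splice_kernel(self, in_var):
    # hypothetical kernel whose IR replaces a call site in-place
    def _kernel(a):  # pragma: no cover
        return a + 1

    # compile the kernel against the caller's typing context and arg types
    f_block = compile_to_numba_ir(
        _kernel, {}, self.typingctx,
        (self.typemap[in_var.name],),
        self.typemap, self.calltypes).blocks.popitem()[1]
    # point the kernel's argument nodes at the caller's live variable
    replace_arg_nodes(f_block, [in_var])
    # drop the return tail: body[:-2] keeps the result assignment last,
    # so nodes[-1].target is the result variable (the file uses body[:-3]
    # instead when the kernel returns None)
    nodes = f_block.body[:-2]
    return nodes, nodes[-1].target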
self._shape_attrs[lhs] = rhs.value.name @@ -436,7 +436,7 @@ def _run_call(self, assign): self.oneDVar_len_vars[assign.target.name] = arr_var if (hpat.config._has_h5py and (func_mod == 'hpat.io.pio_api' - and func_name in ('h5read', 'h5write', 'h5read_filter')) + and func_name in ('h5read', 'h5write', 'h5read_filter')) and self._is_1D_arr(rhs.args[5].name)): # TODO: make create_dataset/create_group collective arr = rhs.args[5].name @@ -462,7 +462,7 @@ def _run_call(self, assign): self._file_open_set_parallel(file_varname) if hpat.config._has_h5py and (func_mod == 'hpat.io.pio_api' - and func_name == 'get_filter_read_indices'): + and func_name == 'get_filter_read_indices'): # out += self._gen_1D_Var_len(assign.target) size_var = out[-1].target @@ -485,7 +485,7 @@ def _run_call(self, assign): def f(fname, cindex, arr, out_dtype, start, count): # pragma: no cover return hpat.io.parquet_pio.read_parquet_parallel(fname, cindex, - arr, out_dtype, start, count) + arr, out_dtype, start, count) return self._replace_func(f, rhs.args) @@ -505,7 +505,7 @@ def f(fname, cindex, arr, out_dtype, start, count): # pragma: no cover def f(fname, cindex, start, count): # pragma: no cover return hpat.io.parquet_pio.read_parquet_str_parallel(fname, cindex, - start, count) + start, count) f_block = compile_to_numba_ir(f, {'hpat': hpat}, self.typingctx, (self.typemap[rhs.args[0].name], types.intp, @@ -517,7 +517,7 @@ def f(fname, cindex, start, count): # pragma: no cover # TODO: fix numba.extending if hpat.config._has_xenon and (fdef == ('read_xenon_col', 'numba.extending') - and self._is_1D_arr(rhs.args[3].name)): + and self._is_1D_arr(rhs.args[3].name)): arr = rhs.args[3].name assert len(self._array_starts[arr]) == 1, "only 1D arrs in Xenon" start_var = self._array_starts[arr][0] @@ -525,12 +525,13 @@ def f(fname, cindex, start, count): # pragma: no cover rhs.args += [start_var, count_var] def f(connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start, count): # pragma: no cover - return hpat.io.xenon_ext.read_xenon_col_parallel(connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start, count) + return hpat.io.xenon_ext.read_xenon_col_parallel( + connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start, count) return self._replace_func(f, rhs.args) if hpat.config._has_xenon and (fdef == ('read_xenon_str', 'numba.extending') - and self._is_1D_arr(lhs)): + and self._is_1D_arr(lhs)): arr = lhs size_var = rhs.args[3] assert self.typemap[size_var.name] == types.intp @@ -544,13 +545,20 @@ def f(connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start, count): rhs.args.append(count_var) def f(connect_tp, dset_tp, col_id_tp, schema_arr_tp, start_tp, count_tp): # pragma: no cover - return hpat.io.xenon_ext.read_xenon_str_parallel(connect_tp, dset_tp, col_id_tp, schema_arr_tp, start_tp, count_tp) - - - f_block = compile_to_numba_ir(f, {'hpat': hpat}, self.typingctx, - (hpat.io.xenon_ext.xe_connect_type, hpat.io.xenon_ext.xe_dset_type, types.intp, - self.typemap[rhs.args[3].name], types.intp, types.intp), - self.typemap, self.calltypes).blocks.popitem()[1] + return hpat.io.xenon_ext.read_xenon_str_parallel( + connect_tp, dset_tp, col_id_tp, schema_arr_tp, start_tp, count_tp) + + f_block = compile_to_numba_ir(f, + {'hpat': hpat}, + self.typingctx, + (hpat.io.xenon_ext.xe_connect_type, + hpat.io.xenon_ext.xe_dset_type, + types.intp, + self.typemap[rhs.args[3].name], + types.intp, + types.intp), + self.typemap, + self.calltypes).blocks.popitem()[1] replace_arg_nodes(f_block, rhs.args) out += 
f_block.body[:-2] out[-1].target = assign.target @@ -644,8 +652,8 @@ def f(arr, bag, start, count): # pragma: no cover out.append(assign) if fdef == ('rolling_fixed', 'hpat.hiframes.rolling') and ( - self._is_1D_arr(rhs.args[0].name) - or self._is_1D_Var_arr(rhs.args[0].name)): + self._is_1D_arr(rhs.args[0].name) + or self._is_1D_Var_arr(rhs.args[0].name)): in_arr = rhs.args[0].name if self._is_1D_arr(in_arr): self._array_starts[lhs] = self._array_starts[in_arr] @@ -658,8 +666,8 @@ def f(arr, bag, start, count): # pragma: no cover out = [ir.Assign(ir.Const(True, loc), true_var, loc), assign] if fdef == ('rolling_variable', 'hpat.hiframes.rolling') and ( - self._is_1D_arr(rhs.args[0].name) - or self._is_1D_Var_arr(rhs.args[0].name)): + self._is_1D_arr(rhs.args[0].name) + or self._is_1D_Var_arr(rhs.args[0].name)): in_arr = rhs.args[0].name if self._is_1D_arr(in_arr): self._array_starts[lhs] = self._array_starts[in_arr] @@ -672,9 +680,9 @@ def f(arr, bag, start, count): # pragma: no cover out = [ir.Assign(ir.Const(True, loc), true_var, loc), assign] if (func_mod == 'hpat.hiframes.rolling' - and func_name in ('shift', 'pct_change') - and (self._is_1D_arr(rhs.args[0].name) - or self._is_1D_Var_arr(rhs.args[0].name))): + and func_name in ('shift', 'pct_change') + and (self._is_1D_arr(rhs.args[0].name) + or self._is_1D_Var_arr(rhs.args[0].name))): in_arr = rhs.args[0].name if self._is_1D_arr(in_arr): self._array_starts[lhs] = self._array_starts[in_arr] @@ -687,7 +695,7 @@ def f(arr, bag, start, count): # pragma: no cover out = [ir.Assign(ir.Const(True, loc), true_var, loc), assign] if fdef == ('quantile', 'hpat.hiframes.api') and (self._is_1D_arr(rhs.args[0].name) - or self._is_1D_Var_arr(rhs.args[0].name)): + or self._is_1D_Var_arr(rhs.args[0].name)): arr = rhs.args[0].name if arr in self._array_sizes: assert len(self._array_sizes[arr] @@ -735,7 +743,7 @@ def f(arr, bag, start, count): # pragma: no cover # always rebalance returned distributed arrays # TODO: need different flag for 1D_Var return (distributed_var)? # TODO: rebalance strings? - #return [assign] # self._run_call_rebalance_array(lhs, assign, rhs.args) + # return [assign] # self._run_call_rebalance_array(lhs, assign, rhs.args) assign.value = rhs.args[0] return [assign] @@ -756,16 +764,15 @@ def f(arr, bag, start, count): # pragma: no cover out += div_nodes # XXX: get sizes in lower dimensions - self._array_starts[lhs] = [-1]*ndim - self._array_counts[lhs] = [-1]*ndim - self._array_sizes[lhs] = [-1]*ndim + self._array_starts[lhs] = [-1] * ndim + self._array_counts[lhs] = [-1] * ndim + self._array_sizes[lhs] = [-1] * ndim self._array_starts[lhs][0] = start_var self._array_counts[lhs][0] = count_var self._array_sizes[lhs][0] = total_length return out - if fdef == ('threaded_return', 'hpat.distributed_api'): assign.value = rhs.args[0] return [assign] @@ -773,7 +780,6 @@ def f(arr, bag, start, count): # pragma: no cover if fdef == ('rebalance_array', 'hpat.distributed_api'): return self._run_call_rebalance_array(lhs, assign, rhs.args) - # output of mnb.predict is 1D with same size as 1st dimension of input # TODO: remove ml module and use new DAAL API if func_name == 'predict': @@ -827,8 +833,8 @@ def _run_call_np(self, lhs, func_name, assign, args): # output array has same properties (starts etc.) 
as input array if (func_name in ['cumsum', 'cumprod', 'empty_like', - 'zeros_like', 'ones_like', 'full_like', - 'copy', 'ravel', 'ascontiguousarray'] + 'zeros_like', 'ones_like', 'full_like', + 'copy', 'ravel', 'ascontiguousarray'] and self._is_1D_arr(args[0].name)): if func_name == 'ravel': assert self.typemap[args[0].name].ndim == 1, "only 1D ravel supported" @@ -909,19 +915,17 @@ def _run_call_array(self, lhs, arr, func_name, assign, args): if func_name == 'reshape' and self._is_1D_arr(arr.name): return self._run_reshape(assign, arr, args) - if func_name == 'transpose' and self._is_1D_arr(lhs): # Currently only 1D arrays are supported assert self._is_1D_arr(arr.name) ndim = self.typemap[arr.name].ndim - self._array_starts[lhs] = [-1]*ndim - self._array_counts[lhs] = [-1]*ndim - self._array_sizes[lhs] = [-1]*ndim + self._array_starts[lhs] = [-1] * ndim + self._array_counts[lhs] = [-1] * ndim + self._array_sizes[lhs] = [-1] * ndim self._array_starts[lhs][0] = self._array_starts[arr.name][0] self._array_counts[lhs][0] = self._array_counts[arr.name][0] self._array_sizes[lhs][0] = self._array_sizes[arr.name][0] - # TODO: refactor # TODO: add unittest if func_name == 'tofile': @@ -993,14 +997,14 @@ def _run_call_df(self, lhs, df, func_name, assign, args): # fix to_csv() type to have None as 1st arg call_type = self.calltypes.pop(rhs) - arg_typs = list((types.none,)+call_type.args[1:]) + arg_typs = list((types.none,) + call_type.args[1:]) arg_typs[5] = types.bool_ arg_typs = tuple(arg_typs) # self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type( # self.typingctx, arg_typs, {}) self.calltypes[rhs] = numba.typing.Signature( - string_type, arg_typs, new_df_typ, - call_type.pysig) + string_type, arg_typs, new_df_typ, + call_type.pysig) # None as 1st arg none_var = ir.Var(assign.target.scope, mk_unique_var('none'), rhs.loc) @@ -1027,7 +1031,6 @@ def _run_call_df(self, lhs, df, func_name, assign, args): # TODO: fix string data reference count dummy_use = numba.njit(lambda a: None) - def f(fname, str_out): # pragma: no cover count = len(str_out) start = hpat.distributed_api.dist_exscan(count) @@ -1044,10 +1047,10 @@ def _gen_is_root_and_cond(self, cond_var): def f(cond): return cond & (hpat.distributed_api.get_rank() == 0) f_block = compile_to_numba_ir(f, {'hpat': hpat}, - self.typingctx, - (self.typemap[cond_var.name],), - self.typemap, - self.calltypes).blocks.popitem()[1] + self.typingctx, + (self.typemap[cond_var.name],), + self.typemap, + self.calltypes).blocks.popitem()[1] replace_arg_nodes(f_block, [cond_var]) nodes = f_block.body[:-2] return nodes @@ -1056,15 +1059,15 @@ def _fix_parallel_df_index(self, df): def f(df): # pragma: no cover l = len(df) start = hpat.distributed_api.dist_exscan(l) - ind = np.arange(start, start+l) + ind = np.arange(start, start + l) df2 = hpat.hiframes.pd_dataframe_ext.set_df_index(df, ind) return df2 f_block = compile_to_numba_ir(f, {'hpat': hpat, 'np': np}, - self.typingctx, - (self.typemap[df.name],), - self.typemap, - self.calltypes).blocks.popitem()[1] + self.typingctx, + (self.typemap[df.name],), + self.typemap, + self.calltypes).blocks.popitem()[1] replace_arg_nodes(f_block, [df]) nodes = f_block.body[:-2] return nodes @@ -1148,10 +1151,10 @@ def f(lhs, in_arr, new_0dim_global_len, old_0dim_global_len, dtype_size): # pra lhs, in_arr, new_0dim_global_len, old_0dim_global_len, dtype_size) f_block = compile_to_numba_ir(f, {'hpat': hpat}, - self.typingctx, - (self.typemap[lhs.name], self.typemap[in_arr.name], - types.intp, types.intp, types.intp), - 
self.typemap, self.calltypes).blocks.popitem()[1] + self.typingctx, + (self.typemap[lhs.name], self.typemap[in_arr.name], + types.intp, types.intp, types.intp), + self.typemap, self.calltypes).blocks.popitem()[1] dtype_ir = self.dtype_size_assign_ir(dtype, scope, loc) out.append(dtype_ir) replace_arg_nodes(f_block, [lhs, in_arr, self._array_sizes[lhs.name][0], @@ -1165,7 +1168,6 @@ def f(lhs, in_arr, new_0dim_global_len, old_0dim_global_len, dtype_size): # pra # args[0] = self._tuple_table[new_size_var.name][0] # out.append(assign) - def _run_call_rebalance_array(self, lhs, assign, args): out = [assign] if not self._is_1D_Var_arr(args[0].name): @@ -1175,8 +1177,7 @@ def _run_call_rebalance_array(self, lhs, assign, args): self._array_counts[lhs] = self._array_counts[in_1d_arr] self._array_sizes[lhs] = self._array_sizes[in_1d_arr] else: - warnings.warn("array {} is not 1D_Block_Var".format( - args[0].name)) + warnings.warn("array {} is not 1D_Block_Var".format(args[0].name)) return out arr = args[0] @@ -1189,9 +1190,9 @@ def _run_call_rebalance_array(self, lhs, assign, args): out += div_nodes # XXX: get sizes in lower dimensions - self._array_starts[lhs] = [-1]*ndim - self._array_counts[lhs] = [-1]*ndim - self._array_sizes[lhs] = [-1]*ndim + self._array_starts[lhs] = [-1] * ndim + self._array_counts[lhs] = [-1] * ndim + self._array_sizes[lhs] = [-1] * ndim self._array_starts[lhs][0] = start_var self._array_counts[lhs][0] = count_var self._array_sizes[lhs][0] = total_length @@ -1382,7 +1383,7 @@ def f(oneD_var_arr): # pragma: no cover self.func_ir._definitions[tuple_var.name] = [tuple_call] return out, tuple_var - #new_body += self._run_1D_array_shape( + # new_body += self._run_1D_array_shape( # inst.target, rhs.value) # def _run_1D_array_shape(self, lhs, arr): # """return shape tuple with global size of 1D/1D_Var arrays @@ -1559,8 +1560,7 @@ def f(A, val, start, stop, chunk_start, chunk_count): # pragma: no cover # strided whole slice # e.g. 
A = X[::2,5] - elif guard(is_whole_slice, self.typemap, self.func_ir, index_var, - accept_stride=True): + elif guard(is_whole_slice, self.typemap, self.func_ir, index_var, accept_stride=True): # FIXME: we use rebalance array to handle the output array # TODO: convert to neighbor exchange # on each processor, the slice has to start from an offset: @@ -1677,8 +1677,8 @@ def _run_parfor_1D_Var(self, parfor, namevar_table): def f(A): # pragma: no cover arr_len = len(A) f_block = compile_to_numba_ir(f, {'hpat': hpat}, self.typingctx, - (self.typemap[arr_var.name],), - self.typemap, self.calltypes).blocks.popitem()[1] + (self.typemap[arr_var.name],), + self.typemap, self.calltypes).blocks.popitem()[1] replace_arg_nodes(f_block, [arr_var]) nodes = f_block.body[:-3] # remove none return l.stop = nodes[-1].target @@ -1715,8 +1715,8 @@ def _fix_ind_bounds(start, stop): return start + prefix, stop + prefix f_block = compile_to_numba_ir(_fix_ind_bounds, {'hpat': hpat}, - self.typingctx, (types.intp,types.intp), self.typemap, - self.calltypes).blocks.popitem()[1] + self.typingctx, (types.intp, types.intp), self.typemap, + self.calltypes).blocks.popitem()[1] replace_arg_nodes(f_block, [l_nest.start, l_nest.stop]) nodes = f_block.body[:-2] ret_var = nodes[-1].target @@ -1764,9 +1764,9 @@ def _run_arg(self, assign): out += div_nodes # XXX: get sizes in lower dimensions - self._array_starts[arr.name] = [-1]*ndim - self._array_counts[arr.name] = [-1]*ndim - self._array_sizes[arr.name] = [-1]*ndim + self._array_starts[arr.name] = [-1] * ndim + self._array_counts[arr.name] = [-1] * ndim + self._array_sizes[arr.name] = [-1] * ndim self._array_starts[arr.name][0] = start_var self._array_counts[arr.name][0] = count_var self._array_sizes[arr.name][0] = total_length @@ -1820,7 +1820,6 @@ def _gen_parfor_reductions(self, parfor, namevar_table): # return lhs // rhs # return None - def _gen_1D_div(self, size_var, scope, loc, prefix, end_call_name, end_call): div_nodes = [] if isinstance(size_var, int): @@ -2111,8 +2110,7 @@ def _get_tuple_varlist(self, tup_var, out): out += nodes return vals_list - def _get_arg(self, f_name, args, kws, arg_no, arg_name, default=None, - err_msg=None): + def _get_arg(self, f_name, args, kws, arg_no, arg_name, default=None, err_msg=None): arg = None if len(args) > arg_no: arg = args[arg_no] diff --git a/hpat/hiframes/pd_dataframe_ext.py b/hpat/hiframes/pd_dataframe_ext.py index 1a6ccc792..c4fdfdeef 100644 --- a/hpat/hiframes/pd_dataframe_ext.py +++ b/hpat/hiframes/pd_dataframe_ext.py @@ -640,6 +640,7 @@ def merge_overload(left, right, how='inner', on=None, left_on=None, # check if on's inferred type is NoneType and store the result, # use it later to branch based on the value available at compile time onHasNoneType = isinstance(numba.typeof(on), types.NoneType) + def _impl(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None): @@ -660,6 +661,7 @@ def merge_asof_overload(left, right, on=None, left_on=None, right_on=None, # check if on's inferred type is NoneType and store the result, # use it later to branch based on the value available at compile time onHasNoneType = isinstance(numba.typeof(on), types.NoneType) + def _impl(left, right, on=None, left_on=None, right_on=None, left_index=False, right_index=False, by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, diff --git a/hpat/io/parquet_pio.py b/hpat/io/parquet_pio.py index 
bb81e9026..725ad40a3 100644 --- a/hpat/io/parquet_pio.py +++ b/hpat/io/parquet_pio.py @@ -1,3 +1,9 @@ +from hpat.config import _has_pyarrow +import llvmlite.binding as ll +from llvmlite import ir as lir +from numba.targets.arrayobj import make_array +from numba.targets.imputils import lower_builtin +from numba import cgutils import numba from numba import ir, config, ir_utils, types from numba.ir_utils import (mk_unique_var, replace_vars_inner, find_topo_order, @@ -26,7 +32,6 @@ repr(types.NPDatetime('ns')): 3, 'int8': 6} - def read_parquet(): return 0 @@ -104,13 +109,14 @@ def gen_parquet_read(self, file_name, lhs): out_nodes = [] # get arrow readers once + def init_arrow_readers(fname): arrow_readers = get_arrow_readers(unicode_to_char_ptr(fname)) f_block = compile_to_numba_ir(init_arrow_readers, - {'get_arrow_readers': _get_arrow_readers, - 'unicode_to_char_ptr': unicode_to_char_ptr, - }).blocks.popitem()[1] + {'get_arrow_readers': _get_arrow_readers, + 'unicode_to_char_ptr': unicode_to_char_ptr, + }).blocks.popitem()[1] replace_arg_nodes(f_block, [file_name]) out_nodes += f_block.body[:-3] @@ -136,8 +142,8 @@ def cleanup_arrow_readers(readers): s = del_arrow_readers(readers) f_block = compile_to_numba_ir(cleanup_arrow_readers, - {'del_arrow_readers': _del_arrow_readers, - }).blocks.popitem()[1] + {'del_arrow_readers': _del_arrow_readers, + }).blocks.popitem()[1] replace_arg_nodes(f_block, [arrow_readers_var]) out_nodes += f_block.body[:-3] return col_names, col_arrs, out_nodes @@ -194,6 +200,7 @@ def get_element_type(dtype): out = 'bool_' return out + def _get_numba_typ_from_pa_typ(pa_typ): import pyarrow as pa _typ_map = { @@ -227,6 +234,7 @@ def _get_numba_typ_from_pa_typ(pa_typ): raise ValueError("Arrow data type {} not supported yet".format(pa_typ)) return _typ_map[pa_typ] + def parquet_file_schema(file_name): import pyarrow.parquet as pq col_names = [] @@ -241,6 +249,7 @@ def parquet_file_schema(file_name): # TODO: close file? return col_names, col_types + def _rm_pd_index(col_names, col_types): """remove pandas index if found in columns """ @@ -300,13 +309,6 @@ def generic(self, args, kws): return signature(types.int32, *unliteral_all(args)) -from numba import cgutils -from numba.targets.imputils import lower_builtin -from numba.targets.arrayobj import make_array -from llvmlite import ir as lir -import llvmlite.binding as ll - -from hpat.config import _has_pyarrow if _has_pyarrow: from .. 
import parquet_cpp ll.add_symbol('get_arrow_readers', parquet_cpp.get_arrow_readers) diff --git a/hpat/io/pio.py b/hpat/io/pio.py index 30ba4fa61..33f883e67 100644 --- a/hpat/io/pio.py +++ b/hpat/io/pio.py @@ -18,7 +18,6 @@ from hpat.utils import find_str_const, debug_prints - def remove_h5(rhs, lives, call_list): # the call is dead if the read array is dead if call_list == ['h5read', 'io', pio_api] and rhs.args[6].name not in lives: @@ -49,8 +48,7 @@ def handle_possible_h5_read(self, assign, lhs, rhs): loc_vars = {} exec(func_text, {}, loc_vars) _h5_read_impl = loc_vars['_h5_read_impl'] - f_block = compile_to_numba_ir( - _h5_read_impl, {'hpat': hpat}).blocks.popitem()[1] + f_block = compile_to_numba_ir(_h5_read_impl, {'hpat': hpat}).blocks.popitem()[1] index_var = rhs.index if rhs.op == 'getitem' else rhs.index_var replace_arg_nodes(f_block, [rhs.value, index_var]) nodes = f_block.body[:-3] # remove none return diff --git a/hpat/io/pio_api.py b/hpat/io/pio_api.py index ee47db5cb..439be47ea 100644 --- a/hpat/io/pio_api.py +++ b/hpat/io/pio_api.py @@ -24,6 +24,7 @@ class H5FileType(types.Opaque): def __init__(self): super(H5FileType, self).__init__(name='H5FileType') + h5file_type = H5FileType() @@ -31,6 +32,7 @@ class H5DatasetType(types.Opaque): def __init__(self): super(H5DatasetType, self).__init__(name='H5DatasetType') + h5dataset_type = H5DatasetType() @@ -38,6 +40,7 @@ class H5GroupType(types.Opaque): def __init__(self): super(H5GroupType, self).__init__(name='H5GroupType') + h5group_type = H5GroupType() @@ -45,6 +48,7 @@ class H5DatasetOrGroupType(types.Opaque): def __init__(self): super(H5DatasetOrGroupType, self).__init__(name='H5DatasetOrGroupType') + h5dataset_or_group_type = H5DatasetOrGroupType() h5file_data_type = types.int64 @@ -77,11 +81,13 @@ def _create_dataset_typer(args, kws): name = args[0] if len(args) > 0 else types.unliteral(kwargs['name']) shape = args[1] if len(args) > 1 else types.unliteral(kwargs['shape']) dtype = args[2] if len(args) > 2 else types.unliteral(kwargs['dtype']) + def create_dset_stub(name, shape, dtype): pass pysig = numba.utils.pysignature(create_dset_stub) return signature(h5dataset_type, name, shape, dtype).replace(pysig=pysig) + @infer_getattr class FileAttribute(AttributeTemplate): key = h5file_type @@ -104,6 +110,7 @@ def resolve_create_dataset(self, f_id, args, kws): def resolve_create_group(self, f_id, args, kws): return signature(h5group_type, *unliteral_all(args)) + @infer_getattr class GroupOrDatasetAttribute(AttributeTemplate): key = h5dataset_or_group_type @@ -123,6 +130,7 @@ class GroupAttribute(AttributeTemplate): def resolve_create_dataset(self, f_id, args, kws): return _create_dataset_typer(unliteral_all(args), kws) + @infer_global(operator.getitem) class GetItemH5File(AbstractTemplate): key = operator.getitem @@ -136,6 +144,7 @@ def generic(self, args, kws): if in_f == h5dataset_or_group_type and in_idx == string_type: return signature(h5dataset_or_group_type, in_f, in_idx) + @infer_global(operator.setitem) class SetItemH5Dset(AbstractTemplate): def generic(self, args, kws): @@ -143,6 +152,7 @@ def generic(self, args, kws): if args[0] == h5dataset_type: return signature(types.none, *args) + def h5g_get_num_objs(): return @@ -180,9 +190,11 @@ def h5write(): """dummy function for C h5_write""" return + def h5_read_dummy(): return + @infer_global(h5_read_dummy) class H5ReadType(AbstractTemplate): def generic(self, args, kws): @@ -264,8 +276,10 @@ def generic(self, args, kws): assert len(args) == 2 return signature(string_type, 
*args) + sum_op = hpat.distributed_api.Reduce_Type.Sum.value + @numba.njit def get_filter_read_indices(bool_arr): indices = bool_arr.nonzero()[0] @@ -297,6 +311,7 @@ def get_filter_read_indices(bool_arr): end = hpat.distributed_api.get_end(n, n_pes, rank) return all_indices[start:end] + @intrinsic def tuple_to_ptr(typingctx, tuple_tp=None): def codegen(context, builder, sig, args): @@ -305,9 +320,20 @@ def codegen(context, builder, sig, args): return builder.bitcast(ptr, lir.IntType(8).as_pointer()) return signature(types.voidptr, tuple_tp), codegen -_h5read_filter = types.ExternalFunction("hpat_h5_read_filter", - types.int32(h5dataset_or_group_type, types.int32, types.voidptr, - types.voidptr, types.intp, types.voidptr, types.int32, types.voidptr, types.int32)) + +_h5read_filter = types.ExternalFunction( + "hpat_h5_read_filter", + types.int32( + h5dataset_or_group_type, + types.int32, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.int32, + types.voidptr, + types.int32)) + @numba.njit def h5read_filter(dset_id, ndim, starts, counts, is_parallel, out_arr, read_indices): @@ -315,4 +341,4 @@ def h5read_filter(dset_id, ndim, starts, counts, is_parallel, out_arr, read_indi counts_ptr = tuple_to_ptr(counts) type_enum = hpat.distributed_api.get_type_enum(out_arr) return _h5read_filter(dset_id, ndim, starts_ptr, counts_ptr, is_parallel, - out_arr.ctypes, type_enum, read_indices.ctypes, len(read_indices)) + out_arr.ctypes, type_enum, read_indices.ctypes, len(read_indices)) diff --git a/hpat/io/pio_lower.py b/hpat/io/pio_lower.py index 363db6837..42bd9deab 100644 --- a/hpat/io/pio_lower.py +++ b/hpat/io/pio_lower.py @@ -5,8 +5,7 @@ import hpat.io from hpat.io import pio_api from hpat.utils import _numba_to_c_type_map -from hpat.io.pio_api import (h5file_type, h5dataset_or_group_type, h5dataset_type, - h5group_type) +from hpat.io.pio_api import (h5file_type, h5dataset_or_group_type, h5dataset_type, h5group_type) from hpat.str_ext import string_type, gen_get_unicode_chars, gen_std_str_to_unicode from llvmlite import ir as lir @@ -39,6 +38,7 @@ h5g_close = types.ExternalFunction("h5g_close", types.none(h5group_type)) + @lower_builtin(operator.getitem, h5file_type, string_type) @lower_builtin(operator.getitem, h5dataset_or_group_type, string_type) def h5_open_dset_lower(context, builder, sig, args): @@ -113,12 +113,9 @@ def h5_close(context, builder, sig, args): builder.call(fn, args) return context.get_dummy_value() -@lower_builtin("h5group.create_dataset", h5group_type, string_type, - types.UniTuple, string_type) -@lower_builtin("h5file.create_dataset", h5file_type, string_type, - types.UniTuple, string_type) -@lower_builtin(pio_api.h5create_dset, h5file_type, string_type, - types.UniTuple, string_type) +@lower_builtin("h5group.create_dataset", h5group_type, string_type, types.UniTuple, string_type) +@lower_builtin("h5file.create_dataset", h5file_type, string_type, types.UniTuple, string_type) +@lower_builtin(pio_api.h5create_dset, h5file_type, string_type, types.UniTuple, string_type) def h5_create_dset(context, builder, sig, args): fg_id, dset_name, counts, dtype_str = args @@ -152,6 +149,7 @@ def h5_create_dset(context, builder, sig, args): return builder.call(fn, call_args) + @lower_builtin("h5group.create_group", h5group_type, string_type) @lower_builtin("h5file.create_group", h5file_type, string_type) @lower_builtin(pio_api.h5create_group, h5file_type, string_type) diff --git a/hpat/io/xenon_ext.py b/hpat/io/xenon_ext.py index 7ff9a7340..9b80f3aeb 100644 --- 
a/hpat/io/xenon_ext.py +++ b/hpat/io/xenon_ext.py @@ -24,6 +24,7 @@ from hpat.str_arr_ext import StringArray, StringArrayPayloadType, construct_string_array from hpat.str_arr_ext import string_array_type + def remove_xenon(rhs, lives, call_list): # the call is dead if the read array is dead if call_list == [read_xenon_col] and rhs.args[3].name not in lives: @@ -41,6 +42,7 @@ def remove_xenon(rhs, lives, call_list): def read_xenon(): return + def _handle_read(assign, lhs, rhs, func_ir): if not hpat.config._has_xenon: raise ValueError("Xenon support not available") @@ -78,7 +80,7 @@ def _handle_read(assign, lhs, rhs, func_ir): loc_vars = {} exec(func_text, {}, loc_vars) schm_func = loc_vars['f'] - f_block = compile_to_numba_ir(schm_func, {'np': np,}).blocks.popitem()[1] + f_block = compile_to_numba_ir(schm_func, {'np': np, }).blocks.popitem()[1] out_nodes += f_block.body[:-3] schema_arr_var = out_nodes[-1].target @@ -99,10 +101,11 @@ def _handle_read(assign, lhs, rhs, func_ir): # we need to close in the URI case since we opened the connection/dataset if len(rhs.args) == 1: - out_nodes += gen_close_xenon(xe_connect_var, xe_dset_var); + out_nodes += gen_close_xenon(xe_connect_var, xe_dset_var) return col_items, out_nodes + _xe_type_to_numba = {'BOOL': types.Array(types.boolean, 1, 'C'), 'I8': types.Array(types.char, 1, 'C'), 'I16': types.Array(types.int16, 1, 'C'), @@ -116,7 +119,8 @@ def _handle_read(assign, lhs, rhs, func_ir): _type_to_xe_dtype_number = {'int8': 0, 'int16': 1, 'int32': 2, 'int64': 3, 'float32': 4, 'float64': 5, 'DECIMAL': 6, - 'bool_': 7, 'string': 8, 'BLOB': 9} + 'bool_': 7, 'string': 8, 'BLOB': 9} + def get_xe_typ_enum(c_type): if c_type == string_array_type: @@ -124,6 +128,7 @@ def get_xe_typ_enum(c_type): assert isinstance(c_type, types.Array) return _type_to_xe_dtype_number[get_element_type(c_type.dtype)] + def gen_xe_init_from_uri(func_ir, dset_name_var): dset_name = get_constant(func_ir, dset_name_var) if dset_name is NOT_CONSTANT: @@ -139,6 +144,7 @@ def gen_xe_init_from_uri(func_ir, dset_name_var): out_nodes, xe_connect_var, xe_dset_var = gen_init_xenon(address, dset_name) return out_nodes, col_names, col_types, xe_connect_var, xe_dset_var + def parse_xe_schema(schema): # print("schema", schema) # example: {first:CHAR,last:CHAR,age:I32,street:CHAR,state:CHAR,zip:I32} @@ -195,6 +201,7 @@ def get_column_read_nodes(c_type, cvar, xe_connect_var, xe_dset_var, i, schema_a out_nodes.append(assign) return out_nodes + def gen_init_xenon(address, dset_name): # TODO: support non-constant address/dset_name func_text = ('def f():\n connect_t = xe_connect(unicode_to_char_ptr("{}"))\n'.format(address)) @@ -204,9 +211,9 @@ def gen_init_xenon(address, dset_name): exec(func_text, {}, loc_vars) init_func = loc_vars['f'] f_block = compile_to_numba_ir(init_func, - {'xe_connect': xe_connect, - 'unicode_to_char_ptr': unicode_to_char_ptr, - 'xe_open': xe_open}).blocks.popitem()[1] + {'xe_connect': xe_connect, + 'unicode_to_char_ptr': unicode_to_char_ptr, + 'xe_open': xe_open}).blocks.popitem()[1] connect_var = None dset_t_var = None @@ -214,20 +221,21 @@ def gen_init_xenon(address, dset_name): out_nodes = f_block.body[:-3] for stmt in reversed(out_nodes): if stmt.target.name.startswith("connect_t"): - connect_var = stmt.target + connect_var = stmt.target if stmt.target.name.startswith("dset_t"): - dset_t_var = stmt.target + dset_t_var = stmt.target assert connect_var is not None and dset_t_var is not None return out_nodes, connect_var, dset_t_var + def gen_close_xenon(connect_var, 
dset_t_var): # def close_func(connect_var, dset_t_var): s = xe_close(connect_var, dset_t_var) f_block = compile_to_numba_ir(close_func, - {'xe_close': xe_close}).blocks.popitem()[1] + {'xe_close': xe_close}).blocks.popitem()[1] replace_arg_nodes(f_block, [connect_var, dset_t_var]) out_nodes = f_block.body[:-3] @@ -240,23 +248,29 @@ def get_element_type(dtype): out = 'bool_' return out + class XeConnectType(types.Opaque): def __init__(self): super(XeConnectType, self).__init__(name='XeConnectType') + xe_connect_type = XeConnectType() register_model(XeConnectType)(models.OpaqueModel) + class XeDSetType(types.Opaque): def __init__(self): super(XeDSetType, self).__init__(name='XeDSetType') + xe_dset_type = XeDSetType() register_model(XeDSetType)(models.OpaqueModel) -get_column_size_xenon = types.ExternalFunction("get_column_size_xenon", types.int64(xe_connect_type, xe_dset_type, types.intp)) +get_column_size_xenon = types.ExternalFunction( + "get_column_size_xenon", types.int64( + xe_connect_type, xe_dset_type, types.intp)) # read_xenon_col = types.ExternalFunction("c_read_xenon", types.void(string_type, types.intp, types.voidptr, types.CPointer(types.int64))) xe_connect = types.ExternalFunction("c_xe_connect", xe_connect_type(types.voidptr)) xe_open = types.ExternalFunction("c_xe_open", xe_dset_type(xe_connect_type, types.voidptr)) @@ -282,6 +296,7 @@ def codegen(context, builder, sig, args): return context.get_dummy_value() return signature(types.none, connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp), codegen + @intrinsic def read_xenon_col_parallel(typingctx, connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start_tp, count_tp): def codegen(context, builder, sig, args): @@ -303,6 +318,7 @@ def codegen(context, builder, sig, args): return context.get_dummy_value() return signature(types.none, connect_tp, dset_tp, col_id_tp, column_tp, schema_arr_tp, start_tp, count_tp), codegen + @intrinsic def read_xenon_str(typingctx, connect_tp, dset_tp, col_id_tp, size_tp, schema_arr_tp): def codegen(context, builder, sig, args): @@ -339,6 +355,7 @@ def codegen(context, builder, sig, args): return impl_ret_new_ref(context, builder, typ, ret) return signature(string_array_type, connect_tp, dset_tp, col_id_tp, size_tp, schema_arr_tp), codegen + @intrinsic def read_xenon_str_parallel(typingctx, connect_tp, dset_tp, col_id_tp, schema_arr_tp, start_tp, count_tp): def codegen(context, builder, sig, args): @@ -358,7 +375,7 @@ def codegen(context, builder, sig, args): lir.IntType(8).as_pointer().as_pointer(), lir.IntType(64).as_pointer(), lir.IntType(64), - lir.IntType(64),]) + lir.IntType(64), ]) fn = builder.module.get_or_insert_function(fnty, name="c_read_xenon_col_str_parallel") res = builder.call(fn, [args[0], args[1], args[2], diff --git a/hpat/runtests.py b/hpat/runtests.py index 6311c0488..07d178f49 100644 --- a/hpat/runtests.py +++ b/hpat/runtests.py @@ -1,10 +1,12 @@ import unittest import hpat.tests + def load_tests(loader, tests, pattern): suite = unittest.TestSuite() suite.addTests(loader.loadTestsFromModule(hpat.tests)) return suite + if __name__ == '__main__': unittest.main() diff --git a/hpat/tests/gen_test_data.py b/hpat/tests/gen_test_data.py index 32052714f..6f707cd22 100644 --- a/hpat/tests/gen_test_data.py +++ b/hpat/tests/gen_test_data.py @@ -50,9 +50,9 @@ def generate_spark_data(): import tarfile if os.path.exists('sdf_dt.pq'): - shutil.rmtree('sdf_dt.pq') + shutil.rmtree('sdf_dt.pq') - sdf_dt_archive = 
os.path.join(os.path.dirname(os.path.abspath(__file__)),'sdf_dt.pq.bz2') + sdf_dt_archive = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sdf_dt.pq.bz2') tar = tarfile.open(sdf_dt_archive, "r:bz2") tar.extractall('.') tar.close() @@ -81,17 +81,17 @@ def generate_other_data(): dset1[:] = arr f.close() - df = pd.DataFrame({'A': ['bc']+["a"]*3+ ["bc"]*3+['a'], 'B': [-8,1,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': ['bc'] + ["a"] * 3 + ["bc"] * 3 + ['a'], 'B': [-8, 1, 2, 3, 1, 5, 6, 7]}) df.to_parquet("groupby3.pq") df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C": ["small", "large", "large", "small", - "small", "large", "small", "small", - "large"], - "D": [1, 2, 2, 6, 3, 4, 5, 6, 9]}) + "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", + "one", "one", "two", "two"], + "C": ["small", "large", "large", "small", + "small", "large", "small", "small", + "large"], + "D": [1, 2, 2, 6, 3, 4, 5, 6, 9]}) df.to_parquet("pivot2.pq") # CSV reader test @@ -104,7 +104,7 @@ def generate_other_data(): f.write(data) with open("csv_data_infer1.csv", "w") as f: - f.write('A,B,C,D\n'+data) + f.write('A,B,C,D\n' + data) data = ("0,2.3,2015-01-03,47736\n" "1,2.3,1966-11-13,47736\n" @@ -120,7 +120,7 @@ def generate_other_data(): 'B': [4, 5, 9, 6]}) df2 = pd.DataFrame({'time': pd.DatetimeIndex( ['2017-01-01', '2017-01-14', '2017-01-16', '2017-02-23', '2017-02-23', - '2017-02-25']), 'A': [2,3,7,8,9,10]}) + '2017-02-25']), 'A': [2, 3, 7, 8, 9, 10]}) df1.to_parquet("asof1.pq") df2.to_parquet("asof2.pq") diff --git a/hpat/tests/test_d4p.py b/hpat/tests/test_d4p.py index 017e0c850..5d786f2f8 100644 --- a/hpat/tests/test_d4p.py +++ b/hpat/tests/test_d4p.py @@ -14,7 +14,6 @@ count_parfor_OneD_Vars, count_array_OneD_Vars, dist_IR_contains) - class TestD4P(unittest.TestCase): def test_logistic_regression(self): ''' @@ -23,28 +22,28 @@ def test_logistic_regression(self): * optional and required arguments passing ''' def train_impl(n, d): - X = np.ones((n,d), dtype=np.double)+.5 - Y = np.ones((n,1), dtype=np.double) + X = np.ones((n, d), dtype=np.double) + .5 + Y = np.ones((n, 1), dtype=np.double) algo = d4p.logistic_regression_training(2, penaltyL1=0.1, penaltyL2=0.1, interceptFlag=True) return algo.compute(X, Y) + def prdct_impl(n, d, model): - w = np.ones((n,d), dtype=np.double)-22.5 - algo = d4p.logistic_regression_prediction(2, - resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities") + w = np.ones((n, d), dtype=np.double) - 22.5 + algo = d4p.logistic_regression_prediction( + 2, resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities") return algo.compute(w, model) - + train_hpat = hpat.jit(train_impl) prdct_hpat = hpat.jit(prdct_impl) n = 11 d = 4 pred_impl = prdct_impl(n, d, train_impl(n, d).model).prediction pred_hpat = prdct_hpat(n, d, train_hpat(n, d).model).prediction - - np.testing.assert_allclose(pred_impl, pred_hpat) + np.testing.assert_allclose(pred_impl, pred_hpat) if __name__ == "__main__": unittest.main() diff --git a/hpat/tests/test_dataframe.py b/hpat/tests/test_dataframe.py index 70d9f85e7..2aa1d50df 100644 --- a/hpat/tests/test_dataframe.py +++ b/hpat/tests/test_dataframe.py @@ -6,8 +6,7 @@ import numba import hpat -from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs, - count_parfor_OneDs, count_array_OneDs, dist_IR_contains, get_start_end) +from 
hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs, count_parfor_OneDs, count_array_OneDs, dist_IR_contains, get_start_end) from hpat.tests.gen_test_data import ParquetGenerator @@ -18,8 +17,10 @@ def inner_get_column(df): # df2['D'] = np.ones(3) return df.A + COL_IND = 0 + class TestDataFrame(unittest.TestCase): @unittest.skip('Error - fix needed\n') @@ -89,7 +90,7 @@ def test_impl(n): def test_box2(self): def test_impl(): - df = pd.DataFrame({'A': [1,2,3], 'B': ['a', 'bb', 'ccc']}) + df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'bb', 'ccc']}) return df hpat_func = hpat.jit(test_impl) @@ -183,7 +184,7 @@ def test_impl(df): 'NUMA_PES=3 build') def test_filter1(self): def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+n, 'B': np.arange(n)**2}) + df = pd.DataFrame({'A': np.arange(n) + n, 'B': np.arange(n)**2}) df1 = df[df.A > .5] return df1.B.sum() @@ -197,7 +198,7 @@ def test_impl(n): 'NUMA_PES=3 build') def test_filter2(self): def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+n, 'B': np.arange(n)**2}) + df = pd.DataFrame({'A': np.arange(n) + n, 'B': np.arange(n)**2}) df1 = df.loc[df.A > .5] return np.sum(df1.B) @@ -211,7 +212,7 @@ def test_impl(n): 'NUMA_PES=3 build') def test_filter3(self): def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+n, 'B': np.arange(n)**2}) + df = pd.DataFrame({'A': np.arange(n) + n, 'B': np.arange(n)**2}) df1 = df.iloc[(df.A > .5).values] return np.sum(df1.B) @@ -232,7 +233,7 @@ def test_impl(df, n): def test_iloc2(self): def test_impl(df, n): - return df.iloc[np.array([1,4,9])].B.values + return df.iloc[np.array([1, 4, 9])].B.values hpat_func = hpat.jit(test_impl) n = 11 @@ -241,7 +242,7 @@ def test_impl(df, n): def test_iloc3(self): def test_impl(df): - return df.iloc[:,1].values + return df.iloc[:, 1].values hpat_func = hpat.jit(test_impl) n = 11 @@ -251,7 +252,7 @@ def test_impl(df): @unittest.skip("TODO: support A[[1,2,3]] in Numba") def test_iloc4(self): def test_impl(df, n): - return df.iloc[[1,4,9]].B.values + return df.iloc[[1, 4, 9]].B.values hpat_func = hpat.jit(test_impl) n = 11 @@ -261,7 +262,7 @@ def test_impl(df, n): def test_iloc5(self): # test iloc with global value def test_impl(df): - return df.iloc[:,COL_IND].values + return df.iloc[:, COL_IND].values hpat_func = hpat.jit(test_impl) n = 11 @@ -270,7 +271,7 @@ def test_impl(df): def test_loc1(self): def test_impl(df): - return df.loc[:,'B'].values + return df.loc[:, 'B'].values hpat_func = hpat.jit(test_impl) n = 11 @@ -281,7 +282,7 @@ def test_impl(df): 'NUMA_PES=3 build') def test_iat1(self): def test_impl(n): - df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n)+n}) + df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n) + n}) return df.iat[3, 1] hpat_func = hpat.jit(test_impl) n = 11 @@ -292,34 +293,34 @@ def test_impl(df): return df.iat[3, 1] hpat_func = hpat.jit(test_impl) n = 11 - df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n)+n}) + df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n) + n}) self.assertEqual(hpat_func(df), test_impl(df)) def test_iat3(self): def test_impl(df, n): - return df.iat[n-1, 1] + return df.iat[n - 1, 1] hpat_func = hpat.jit(test_impl) n = 11 - df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n)+n}) + df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n) + n}) self.assertEqual(hpat_func(df, n), test_impl(df, n)) def test_iat_set1(self): def test_impl(df, n): - df.iat[n-1, 1] = n**2 + df.iat[n - 1, 1] = n**2 return df.A # return the column to check column aliasing hpat_func = hpat.jit(test_impl) n = 11 - df = 
pd.DataFrame({'B': np.ones(n), 'A': np.arange(n)+n}) + df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n) + n}) df2 = df.copy() pd.testing.assert_series_equal(hpat_func(df, n), test_impl(df2, n)) def test_iat_set2(self): def test_impl(df, n): - df.iat[n-1, 1] = n**2 + df.iat[n - 1, 1] = n**2 return df # check df aliasing/boxing hpat_func = hpat.jit(test_impl) n = 11 - df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n)+n}) + df = pd.DataFrame({'B': np.ones(n), 'A': np.arange(n) + n}) df2 = df.copy() pd.testing.assert_frame_equal(hpat_func(df, n), test_impl(df2, n)) @@ -330,7 +331,7 @@ def test_impl(df, n): def test_set_column1(self): # set existing column def test_impl(n): - df = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n)+3.0}) + df = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n) + 3.0}) df['A'] = np.arange(n) return df @@ -349,7 +350,7 @@ def test_impl(df, n): hpat_func = hpat.jit(test_impl) n = 11 - df1 = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n)+3.0}) + df1 = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n) + 3.0}) df2 = df1.copy() hpat_func(df1, n) test_impl(df2, n) @@ -362,7 +363,7 @@ def test_impl(df, n): def test_set_column_new_type1(self): # set existing column with a new type def test_impl(n): - df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)+3.0}) + df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n) + 3.0}) df['A'] = np.arange(n) return df @@ -377,7 +378,7 @@ def test_impl(n): def test_set_column2(self): # create new column def test_impl(n): - df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)+1.0}) + df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n) + 1.0}) df['C'] = np.arange(n) return df @@ -396,7 +397,7 @@ def test_impl(df, n): hpat_func = hpat.jit(test_impl) n = 11 - df1 = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n)+3.0}) + df1 = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n) + 3.0}) df2 = df1.copy() hpat_func(df1, n) test_impl(df2, n) @@ -409,7 +410,7 @@ def test_impl(df): df['C'] = df['A'][df['B']] hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [1,2,3], 'B': [True, False, True]}) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [True, False, True]}) df2 = df.copy() test_impl(df2) hpat_func(df) @@ -697,7 +698,7 @@ def test_impl(n): 'NUMA_PES=3 build') def test_pct_change1(self): def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.pct_change(3) hpat_func = hpat.jit(test_impl) @@ -709,7 +710,7 @@ def test_impl(n): def test_mean1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.mean() hpat_func = hpat.jit(test_impl) @@ -721,7 +722,7 @@ def test_impl(n): def test_std1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.std() hpat_func = hpat.jit(test_impl) @@ -733,7 +734,7 @@ def test_impl(n): def test_var1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.var() hpat_func = hpat.jit(test_impl) @@ -745,7 +746,7 @@ def test_impl(n): 
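The hunks in these test modules are mechanical spacing fixes rather than behavior changes: one space around binary operators and after commas, matching pycodestyle's E225/E226 checks. Presumably the cleanup was produced by an automated formatter such as autopep8 (note that E226, missing whitespace around arithmetic operators, sits in pycodestyle's default ignore list, so a tool would need it explicitly enabled), e.g.:

df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1})      # before
df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1})  # after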
def test_max1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.max() hpat_func = hpat.jit(test_impl) @@ -757,7 +758,7 @@ def test_impl(n): def test_min1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.min() hpat_func = hpat.jit(test_impl) @@ -769,7 +770,7 @@ def test_impl(n): def test_sum1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.sum() hpat_func = hpat.jit(test_impl) @@ -781,7 +782,7 @@ def test_impl(n): def test_prod1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.prod() hpat_func = hpat.jit(test_impl) @@ -792,7 +793,7 @@ def test_impl(n): def test_count1(self): # TODO: non-numeric columns should be ignored automatically def test_impl(n): - df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.arange(n)+1}) + df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1}) return df.count() hpat_func = hpat.jit(test_impl) @@ -901,7 +902,7 @@ def test_impl(df): df2.drop(columns=['D'], inplace=True) return df2 - df = pd.DataFrame({'A': [1,2,3], 'B': [2,3,4]}) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) hpat_func = hpat.jit(test_impl) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) @@ -923,13 +924,13 @@ def test_impl(df, df2): n = 11 df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) df2 = pd.DataFrame({'A': np.arange(n), 'C': np.arange(n)**2}) - df2.A[n//2:] = n + df2.A[n // 2:] = n pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) @unittest.skip("needs dict typing in Numba") def test_isin_dict1(self): def test_impl(df): - vals = {'A': [2,3,4], 'C': [4,5,6]} + vals = {'A': [2, 3, 4], 'C': [4, 5, 6]} return df.isin(vals) hpat_func = hpat.jit(test_impl) @@ -939,7 +940,7 @@ def test_impl(df): def test_isin_list1(self): def test_impl(df): - vals = [2,3,4] + vals = [2, 3, 4] return df.isin(vals) hpat_func = hpat.jit(test_impl) @@ -956,7 +957,7 @@ def test_impl(df, df2): n = 11 df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) df2 = pd.DataFrame({'A': np.arange(n), 'C': np.arange(n)**2}) - df2.A[n//2:] = n + df2.A[n // 2:] = n pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) def test_append2(self): @@ -967,7 +968,7 @@ def test_impl(df, df2, df3): n = 11 df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) df2 = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) - df2.A[n//2:] = n + df2.A[n // 2:] = n df3 = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) pd.testing.assert_frame_equal( hpat_func(df, df2, df3), test_impl(df, df2, df3)) @@ -984,7 +985,7 @@ def test_impl(S1, S2): # TODO: support int as column name pd.testing.assert_frame_equal( hpat_func(S1, S2), - test_impl(S1, S2).rename(columns={0:'0', 1:'1'})) + test_impl(S1, S2).rename(columns={0: '0', 1: '1'})) @unittest.skip('Error - fix needed\n' 'NUMA_PES=3 build') @@ -993,7 +994,7 @@ def test_var_rename(self): # 
can cause extra assignments and definition handling errors # TODO: inline freevar def test_impl(): - df = pd.DataFrame({'A': [1,2,3], 'B': [2,3,4]}) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) # TODO: df['C'] = [5,6,7] df['C'] = np.ones(3) return inner_get_column(df) diff --git a/hpat/tests/test_date.py b/hpat/tests/test_date.py index 3405cf85c..dbf64e25d 100644 --- a/hpat/tests/test_date.py +++ b/hpat/tests/test_date.py @@ -5,9 +5,9 @@ import numba import hpat from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs, - count_parfor_OneDs, count_array_OneDs, - count_parfor_OneD_Vars, count_array_OneD_Vars, - dist_IR_contains) + count_parfor_OneDs, count_array_OneDs, + count_parfor_OneD_Vars, count_array_OneD_Vars, + dist_IR_contains) from datetime import datetime import random @@ -246,7 +246,7 @@ def test_impl(df): hpat_func = hpat.jit(test_impl) df = self._gen_str_date_df() pd.testing.assert_index_equal(hpat_func(df), test_impl(df), - check_names=False) + check_names=False) def test_datetime_index_year(self): def test_impl(df): @@ -340,8 +340,9 @@ def _gen_str_date_df(self): rows = 10 data = [] for row in range(rows): - data.append(datetime(2017, random.randint(1,12), random.randint(1,28)).isoformat()) - return pd.DataFrame({'str_date' : data}) + data.append(datetime(2017, random.randint(1, 12), random.randint(1, 28)).isoformat()) + return pd.DataFrame({'str_date': data}) + if __name__ == "__main__": unittest.main() diff --git a/hpat/tests/test_groupby.py b/hpat/tests/test_groupby.py index eba49b4e3..658d18e5d 100644 --- a/hpat/tests/test_groupby.py +++ b/hpat/tests/test_groupby.py @@ -5,28 +5,28 @@ import numba import hpat from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs, - count_parfor_OneDs, count_array_OneDs, dist_IR_contains, - get_start_end) + count_parfor_OneDs, count_array_OneDs, dist_IR_contains, + get_start_end) _pivot_df1 = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C": ["small", "large", "large", "small", - "small", "large", "small", "small", - "large"], - "D": [1, 2, 2, 6, 3, 4, 5, 6, 9]}) + "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", + "one", "one", "two", "two"], + "C": ["small", "large", "large", "small", + "small", "large", "small", "small", + "large"], + "D": [1, 2, 2, 6, 3, 4, 5, 6, 9]}) class TestGroupBy(unittest.TestCase): def test_agg_seq(self): def test_impl(df): - A = df.groupby('A')['B'].agg(lambda x: x.max()-x.min()) + A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min()) return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) # np.testing.assert_array_equal(hpat_func(df), test_impl(df)) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) @@ -36,7 +36,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_count(self): @@ -45,7 +45,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), 
set(test_impl(df))) def test_agg_seq_mean(self): @@ -54,7 +54,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_min(self): @@ -63,7 +63,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) @unittest.skip("pending numba #3881") @@ -73,7 +73,7 @@ def test_impl(df): return df2 hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': pd.date_range('2019-1-3', '2019-1-9')}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': pd.date_range('2019-1-3', '2019-1-9')}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_max(self): @@ -82,7 +82,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_all_col(self): @@ -91,7 +91,7 @@ def test_impl(df): return df2.B.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_as_index(self): @@ -100,7 +100,7 @@ def test_impl(df): return df2.A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_prod(self): @@ -109,7 +109,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_var(self): @@ -118,7 +118,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_std(self): @@ -127,7 +127,7 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_seq_multiselect(self): @@ -136,8 +136,8 @@ def test_impl(df): return df2.C.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7], - 'C': [3,5,6,5,4,4,3]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7], + 'C': [3, 5, 6, 5, 4, 4, 3]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_multikey_seq(self): @@ -146,8 +146,8 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7], - 'C': [3,5,6,5,4,4,3]}) + df = pd.DataFrame({'A': 
[2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7], + 'C': [3, 5, 6, 5, 4, 4, 3]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) def test_agg_multikey_parallel(self): @@ -157,10 +157,10 @@ def test_impl(in_A, in_B, in_C): return A.sum() hpat_func = hpat.jit(locals={'in_A:input': 'distributed', - 'in_B:input': 'distributed', - 'in_C:input': 'distributed'})(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7], - 'C': [3,5,6,5,4,4,3]}) + 'in_B:input': 'distributed', + 'in_C:input': 'distributed'})(test_impl) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7], + 'C': [3, 5, 6, 5, 4, 4, 3]}) start, end = get_start_end(len(df)) h_A = df.A.values[start:end] h_B = df.B.values[start:end] @@ -175,7 +175,7 @@ def test_impl(in_A, in_B, in_C): def test_agg_parallel(self): def test_impl(n): df = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.arange(n)}) - A = df.groupby('A')['B'].agg(lambda x: x.max()-x.min()) + A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min()) return A.sum() hpat_func = hpat.jit(test_impl) @@ -273,7 +273,7 @@ def test_impl(n): def test_agg_parallel_str(self): def test_impl(): df = pq.read_table("groupby3.pq").to_pandas() - A = df.groupby('A')['B'].agg(lambda x: x.max()-x.min()) + A = df.groupby('A')['B'].agg(lambda x: x.max() - x.min()) return A.sum() hpat_func = hpat.jit(test_impl) @@ -314,7 +314,7 @@ def test_impl(df, cond): return df2.C, c hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [2,1,1,1,2,2,1], 'B': [-8,2,3,1,5,6,7], 'C': [2,3,-1,1,2,3,-1]}) + df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7], 'C': [2, 3, -1, 1, 2, 3, -1]}) cond = df.A > 1 res = test_impl(df, cond) h_res = hpat_func(df, cond) @@ -323,12 +323,12 @@ def test_impl(df, cond): def test_agg_seq_str(self): def test_impl(df): - A = df.groupby('A')['B'].agg(lambda x: (x=='aa').sum()) + A = df.groupby('A')['B'].agg(lambda x: (x == 'aa').sum()) return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': ['aa','b','b','b','aa','aa','b'], - 'B': ['ccc','a','bb','aa','dd','ggg','rr']}) + df = pd.DataFrame({'A': ['aa', 'b', 'b', 'b', 'aa', 'aa', 'b'], + 'B': ['ccc', 'a', 'bb', 'aa', 'dd', 'ggg', 'rr']}) # np.testing.assert_array_equal(hpat_func(df), test_impl(df)) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) @@ -338,8 +338,8 @@ def test_impl(df): return A.values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': ['aa','b','b','b','aa','aa','b'], - 'B': ['ccc','a','bb','aa','dd','ggg','rr']}) + df = pd.DataFrame({'A': ['aa', 'b', 'b', 'b', 'aa', 'aa', 'b'], + 'B': ['ccc', 'a', 'bb', 'aa', 'dd', 'ggg', 'rr']}) # np.testing.assert_array_equal(hpat_func(df), test_impl(df)) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) diff --git a/hpat/tests/test_hiframes.py b/hpat/tests/test_hiframes.py index dacc65b18..8ac7ca62d 100644 --- a/hpat/tests/test_hiframes.py +++ b/hpat/tests/test_hiframes.py @@ -10,8 +10,8 @@ from hpat import hiframes from hpat.str_arr_ext import StringArray from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs, - count_parfor_OneDs, count_array_OneDs, dist_IR_contains, - get_start_end) + count_parfor_OneDs, count_array_OneDs, dist_IR_contains, + get_start_end) class TestHiFrames(unittest.TestCase): @@ -52,7 +52,7 @@ def test_impl(df): return df['A'][df['B']].values hpat_func = hpat.jit(test_impl) - df = pd.DataFrame({'A': [1,2,3], 'B': [True, False, True]}) + df = pd.DataFrame({'A': [1, 2, 3], 'B': [True, False, True]}) 
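Several of the tests touched here exercise HPAT's distributed-argument protocol: inputs are declared 'distributed' through jit locals, each rank passes only its get_start_end slice, and the rank-reduced result is checked against the sequential implementation. A condensed sketch of that pattern (illustrative data and aggregation; assumes execution under mpiexec as in the NUMA_PES builds referenced above):

import numpy as np
import pandas as pd
import hpat
from hpat.tests.test_utils import get_start_end

def test_impl(in_A, in_B):
    df = pd.DataFrame({'A': in_A, 'B': in_B})
    return df.groupby('A')['B'].sum().sum()

# mark inputs as distributed: each rank receives only its local chunk
hpat_func = hpat.jit(locals={'in_A:input': 'distributed',
                             'in_B:input': 'distributed'})(test_impl)

A = np.array([2, 1, 1, 1, 2, 2, 1])
B = np.array([-8, 2, 3, 1, 5, 6, 7])
start, end = get_start_end(len(A))  # this rank's [start, end) slice
# the distributed run reduces across ranks, so it must match the sequential result
assert hpat_func(A[start:end], B[start:end]) == test_impl(A, B)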
         np.testing.assert_array_equal(test_impl(df), hpat_func(df))
 
     @unittest.skip('Error - fix needed\n'
@@ -122,7 +122,7 @@ def test_impl():
     def test_column_map(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.arange(n)})
-            df['B'] = df.A.map(lambda a: 2*a)
+            df['B'] = df.A.map(lambda a: 2 * a)
             return df.B.sum()
 
         n = 121
@@ -133,7 +133,7 @@ def test_impl(n):
                    'NUMA_PES=3 build')
     def test_column_map_arg(self):
         def test_impl(df):
-            df['B'] = df.A.map(lambda a: 2*a)
+            df['B'] = df.A.map(lambda a: 2 * a)
             return
 
         n = 121
@@ -435,7 +435,7 @@ def test_str_split_filter(self):
         def test_impl(df):
             B = df.A.str.split(',')
             df2 = pd.DataFrame({'B': B})
-            return df2[df2.B.str.len()>1]
+            return df2[df2.B.str.len() > 1]
 
         df = pd.DataFrame({'A': ['AB,CC', 'C,ABB,D', 'G', '', 'g,f']})
         hpat_func = hpat.jit(test_impl)
@@ -568,7 +568,7 @@ def test_impl(df):
                    'NUMA_PES=3 build')
     def test_1D_Var_len(self):
         def test_impl(n):
-            df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)+1.0})
+            df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n) + 1.0})
             df1 = df[df.A > 5]
             return len(df1.B)
@@ -593,8 +593,9 @@ def test_impl(n):
         self.assertEqual(count_array_REPs(), 0)
         self.assertEqual(count_parfor_REPs(), 0)
         # size 7 with unroll
+
         def test_impl_2(n):
-            df = pd.DataFrame({'A': np.arange(n)+1.0, 'B': np.random.ranf(n)})
+            df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.random.ranf(n)})
             Ac = df.A.rolling(7).sum()
             return Ac.sum()
@@ -623,7 +624,7 @@ def test_impl(n):
     def test_rolling3(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.random.ranf(n)})
-            Ac = df.A.rolling(3, center=True).apply(lambda a: a[0]+2*a[1]+a[2])
+            Ac = df.A.rolling(3, center=True).apply(lambda a: a[0] + 2 * a[1] + a[2])
             return Ac.sum()
 
         hpat_func = hpat.jit(test_impl)
@@ -677,7 +678,7 @@ def test_impl(df):
             return C.sum()
 
         n = 11
-        df = pd.DataFrame({'A': np.random.ranf(3*n), 'B': ['one', 'two', 'three']*n})
+        df = pd.DataFrame({'A': np.random.ranf(3 * n), 'B': ['one', 'two', 'three'] * n})
         hpat_func = hpat.jit(test_impl)
         np.testing.assert_almost_equal(hpat_func(df), test_impl(df))
@@ -688,8 +689,8 @@ def test_impl(df):
             return df.B.sum()
 
         n = 121
-        A = [3,4,5,6,1]
-        B = [5,6,2,1,3]
+        A = [3, 4, 5, 6, 1]
+        B = [5, 6, 2, 1, 3]
         n = 5
         start, end = get_start_end(n)
         df = pd.DataFrame({'A': A, 'B': B})
@@ -703,8 +704,8 @@ def test_impl(df):
                    'NUMA_PES=3 build')
     def test_concat(self):
         def test_impl(n):
-            df1 = pd.DataFrame({'key1': np.arange(n), 'A': np.arange(n)+1.0})
-            df2 = pd.DataFrame({'key2': n-np.arange(n), 'A': n+np.arange(n)+1.0})
+            df1 = pd.DataFrame({'key1': np.arange(n), 'A': np.arange(n) + 1.0})
+            df2 = pd.DataFrame({'key2': n - np.arange(n), 'A': n + np.arange(n) + 1.0})
             df3 = pd.concat([df1, df2])
             return df3.A.sum() + df3.key2.sum()
@@ -723,7 +724,7 @@ def test_impl():
             df1 = pq.read_table('example.parquet').to_pandas()
             df2 = pq.read_table('example.parquet').to_pandas()
             A3 = pd.concat([df1, df2])
-            return (A3.two=='foo').sum()
+            return (A3.two == 'foo').sum()
 
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
@@ -734,8 +735,8 @@ def test_impl():
                    'NUMA_PES=3 build')
     def test_concat_series(self):
         def test_impl(n):
-            df1 = pd.DataFrame({'key1': np.arange(n), 'A': np.arange(n)+1.0})
-            df2 = pd.DataFrame({'key2': n-np.arange(n), 'A': n+np.arange(n)+1.0})
+            df1 = pd.DataFrame({'key1': np.arange(n), 'A': np.arange(n) + 1.0})
+            df2 = pd.DataFrame({'key2': n - np.arange(n), 'A': n + np.arange(n) + 1.0})
             A3 = pd.concat([df1.A, df2.A])
             return A3.sum()
@@ -754,7 +755,7 @@ def test_impl():
             df1 = pq.read_table('example.parquet').to_pandas()
             df2 = pq.read_table('example.parquet').to_pandas()
             A3 = pd.concat([df1.two, df2.two])
-            return (A3=='foo').sum()
+            return (A3 == 'foo').sum()
 
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
@@ -771,15 +772,14 @@ def test_impl(nsyms):
             s_open = 20 * np.ones(max_num_days)
             s_low = 28 * np.ones(max_num_days)
             s_close = 19 * np.ones(max_num_days)
-            df = pd.DataFrame({'Open': s_open, 'Low': s_low,
-                               'Close': s_close})
+            df = pd.DataFrame({'Open': s_open, 'Low': s_low, 'Close': s_close})
             df['Stdev'] = df['Close'].rolling(window=90).std()
             df['Moving Average'] = df['Close'].rolling(window=20).mean()
             df['Criteria1'] = (df['Open'] - df['Low'].shift(1)) < -df['Stdev']
             df['Criteria2'] = df['Open'] > df['Moving Average']
             df['BUY'] = df['Criteria1'] & df['Criteria2']
             df['Pct Change'] = (df['Close'] - df['Open']) / df['Open']
-            df['Rets'] = df['Pct Change'][df['BUY'] == True]
+            df['Rets'] = df['Pct Change'][df['BUY']]
             all_res += df['Rets'].mean()
         return all_res
@@ -800,15 +800,16 @@ def test_impl(A, B):
             df2['C'] = np.full_like(df2.B.values, 3, np.int8)
             return df2
 
-        A = np.array([1,1,2,3])
-        B = np.array([3,4,5,6])
+        A = np.array([1, 1, 2, 3])
+        B = np.array([3, 4, 5, 6])
         hpat_func = hpat.jit(locals={'A:input': 'distributed',
-            'B:input': 'distributed', 'df2:return': 'distributed'})(test_impl)
+                                     'B:input': 'distributed', 'df2:return': 'distributed'})(test_impl)
         start, end = get_start_end(len(A))
         df2 = hpat_func(A[start:end], B[start:end])
         # TODO:
         # pd.testing.assert_frame_equal(
         #     hpat_func(A[start:end], B[start:end]), test_impl(A, B))
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/hpat/tests/test_io.py b/hpat/tests/test_io.py
index 5560a182d..2a040df0a 100644
--- a/hpat/tests/test_io.py
+++ b/hpat/tests/test_io.py
@@ -6,8 +6,8 @@ import pyarrow.parquet as pq
 import hpat
 from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs,
-        count_parfor_OneDs, count_array_OneDs, dist_IR_contains, get_rank,
-        get_start_end)
+                                   count_parfor_OneDs, count_array_OneDs, dist_IR_contains, get_rank,
+                                   get_start_end)
 
 kde_file = 'kde.parquet'
@@ -92,10 +92,10 @@ def test_impl():
     @unittest.skip("fix collective create dataset")
     def test_h5_write_parallel(self):
         def test_impl(N, D):
-            points = np.ones((N,D))
-            responses = np.arange(N)+1.0
+            points = np.ones((N, D))
+            responses = np.arange(N) + 1.0
             f = h5py.File("lr_w.hdf5", "w")
-            dset1 = f.create_dataset("points", (N,D), dtype='f8')
+            dset1 = f.create_dataset("points", (N, D), dtype='f8')
             dset1[:] = points
             dset2 = f.create_dataset("responses", (N,), dtype='f8')
             dset2[:] = responses
@@ -109,8 +109,8 @@ def test_impl(N, D):
         X = f['points'][:]
         Y = f['responses'][:]
         f.close()
-        np.testing.assert_almost_equal(X, np.ones((N,D)))
-        np.testing.assert_almost_equal(Y, np.arange(N)+1.0)
+        np.testing.assert_almost_equal(X, np.ones((N, D)))
+        np.testing.assert_almost_equal(Y, np.arange(N) + 1.0)
 
     @unittest.skip("fix collective create dataset and group")
     def test_h5_write_group(self):
@@ -187,7 +187,7 @@ def test_h5_filter(self):
         def test_impl():
             f = h5py.File("h5_test_filter.h5", "r")
             b = np.arange(11) % 3 == 0
-            X = f['test'][b,:,:,:]
+            X = f['test'][b, :, :, :]
             f.close()
             return X
@@ -230,6 +230,7 @@ def test_impl():
                    'NUMA_PES=3 build')
     def test_pq_read_freevar_str1(self):
         kde_file2 = 'kde.parquet'
+
         def test_impl():
             df = pd.read_parquet(kde_file2)
             X = df['points']
@@ -258,7 +259,7 @@ def test_impl():
     def test_pq_str(self):
         def test_impl():
             df = pq.read_table('example.parquet').to_pandas()
-            A = df.two.values=='foo'
+            A = df.two.values == 'foo'
             return A.sum()
 
         hpat_func = hpat.jit(test_impl)
@@ -271,7 +272,7 @@ def test_impl():
     def test_pq_str_with_nan_seq(self):
         def test_impl():
             df = pq.read_table('example.parquet').to_pandas()
-            A = df.five.values=='foo'
+            A = df.five.values == 'foo'
             return A
 
         hpat_func = hpat.jit(test_impl)
@@ -282,7 +283,7 @@ def test_impl():
     def test_pq_str_with_nan_par(self):
         def test_impl():
             df = pq.read_table('example.parquet').to_pandas()
-            A = df.five.values=='foo'
+            A = df.five.values == 'foo'
             return A.sum()
 
         hpat_func = hpat.jit(test_impl)
@@ -298,7 +299,7 @@ def test_impl():
     def test_pq_str_with_nan_par_multigroup(self):
         def test_impl():
             df = pq.read_table('example2.parquet').to_pandas()
-            A = df.five.values=='foo'
+            A = df.five.values == 'foo'
             return A.sum()
 
         hpat_func = hpat.jit(test_impl)
@@ -361,9 +362,9 @@ def test_impl():
     def test_csv1(self):
         def test_impl():
             return pd.read_csv("csv_data1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':np.float, 'D':np.int},
-            )
+                               names=['A', 'B', 'C', 'D'],
+                               dtype={'A': np.int, 'B': np.float, 'C': np.float, 'D': np.int},
+                               )
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -371,11 +372,11 @@ def test_impl():
                    'NUMA_PES=3 build')
     def test_csv_keys1(self):
         def test_impl():
-            dtype = {'A':np.int, 'B':np.float, 'C':np.float, 'D':np.int}
+            dtype = {'A': np.int, 'B': np.float, 'C': np.float, 'D': np.int}
             return pd.read_csv("csv_data1.csv",
-                names=dtype.keys(),
-                dtype=dtype,
-            )
+                               names=dtype.keys(),
+                               dtype=dtype,
+                               )
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -383,9 +384,9 @@ def test_csv_const_dtype1(self):
         def test_impl():
             dtype = {'A': 'int', 'B': 'float64', 'C': 'float', 'D': 'int64'}
             return pd.read_csv("csv_data1.csv",
-                names=dtype.keys(),
-                dtype=dtype,
-            )
+                               names=dtype.keys(),
+                               dtype=dtype,
+                               )
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -409,10 +410,10 @@ def test_impl():
     def test_csv_skip1(self):
         def test_impl():
             return pd.read_csv("csv_data1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':np.float, 'D':np.int},
-                skiprows=2,
-            )
+                               names=['A', 'B', 'C', 'D'],
+                               dtype={'A': np.int, 'B': np.float, 'C': np.float, 'D': np.int},
+                               skiprows=2,
+                               )
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -430,7 +431,7 @@ def test_impl():
     def test_csv_infer_skip_parallel1(self):
         def test_impl():
             df = pd.read_csv("csv_data_infer1.csv", skiprows=2,
-                names=['A', 'B', 'C', 'D'])
+                             names=['A', 'B', 'C', 'D'])
             return df.A.sum(), df.B.sum(), df.C.sum(), df.D.sum()
 
         hpat_func = hpat.jit(test_impl)
@@ -441,8 +442,8 @@ def test_impl():
     def test_csv_rm_dead1(self):
         def test_impl():
             df = pd.read_csv("csv_data1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':np.float, 'D':np.int},)
+                             names=['A', 'B', 'C', 'D'],
+                             dtype={'A': np.int, 'B': np.float, 'C': np.float, 'D': np.int},)
             return df.B.values
         hpat_func = hpat.jit(test_impl)
         np.testing.assert_array_equal(hpat_func(), test_impl())
@@ -450,17 +451,17 @@ def test_impl():
     def test_csv_date1(self):
         def test_impl():
             return pd.read_csv("csv_data_date1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':str, 'D':np.int},
-                parse_dates=[2])
+                               names=['A', 'B', 'C', 'D'],
+                               dtype={'A': np.int, 'B': np.float, 'C': str, 'D': np.int},
+                               parse_dates=[2])
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
 
     def test_csv_str1(self):
         def test_impl():
             return pd.read_csv("csv_data_date1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':str, 'D':np.int})
+                               names=['A', 'B', 'C', 'D'],
+                               dtype={'A': np.int, 'B': np.float, 'C': str, 'D': np.int})
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -469,8 +470,8 @@ def test_impl():
     def test_csv_parallel1(self):
         def test_impl():
             df = pd.read_csv("csv_data1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':np.float, 'D':np.int})
+                             names=['A', 'B', 'C', 'D'],
+                             dtype={'A': np.int, 'B': np.float, 'C': np.float, 'D': np.int})
             return (df.A.sum(), df.B.sum(), df.C.sum(), df.D.sum())
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
@@ -480,8 +481,8 @@ def test_impl():
     def test_csv_str_parallel1(self):
         def test_impl():
             df = pd.read_csv("csv_data_date1.csv",
-                names=['A', 'B', 'C', 'D'],
-                dtype={'A':np.int, 'B':np.float, 'C':str, 'D':np.int})
+                             names=['A', 'B', 'C', 'D'],
+                             dtype={'A': np.int, 'B': np.float, 'C': str, 'D': np.int})
             return (df.A.sum(), df.B.sum(), (df.C == '1966-11-13').sum(), df.D.sum())
 
         hpat_func = hpat.jit(locals={'df:return': 'distributed'})(test_impl)
@@ -492,21 +493,21 @@ def test_impl():
     def test_csv_usecols1(self):
         def test_impl():
             return pd.read_csv("csv_data1.csv",
-                names=['C'],
-                dtype={'C':np.float},
-                usecols=[2],
-            )
+                               names=['C'],
+                               dtype={'C': np.float},
+                               usecols=[2],
+                               )
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
 
     def test_csv_cat1(self):
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
-            dtypes = {'C1':np.int, 'C2': ct_dtype, 'C3':str}
+            dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
             df = pd.read_csv("csv_data_cat1.csv",
-                names=['C1', 'C2', 'C3'],
-                dtype=dtypes,
-            )
+                             names=['C1', 'C2', 'C3'],
+                             dtype=dtypes,
+                             )
             return df.C2
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_series_equal(
@@ -516,9 +517,9 @@ def test_csv_cat2(self):
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C', 'D'])
             df = pd.read_csv("csv_data_cat1.csv",
-                names=['C1', 'C2', 'C3'],
-                dtype={'C1':np.int, 'C2': ct_dtype, 'C3':str},
-            )
+                             names=['C1', 'C2', 'C3'],
+                             dtype={'C1': np.int, 'C2': ct_dtype, 'C3': str},
+                             )
             return df
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
@@ -526,9 +527,9 @@ def test_impl():
     def test_csv_single_dtype1(self):
         def test_impl():
             df = pd.read_csv("csv_data_dtype1.csv",
-                names=['C1', 'C2'],
-                dtype=np.float64,
-            )
+                             names=['C1', 'C2'],
+                             dtype=np.float64,
+                             )
             return df
 
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
diff --git a/hpat/tests/test_join.py b/hpat/tests/test_join.py
index 5eeace639..da4713d52 100644
--- a/hpat/tests/test_join.py
+++ b/hpat/tests/test_join.py
@@ -9,18 +9,18 @@ import hpat
 from hpat.str_arr_ext import StringArray
 from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs,
-        count_parfor_OneDs, count_array_OneDs, dist_IR_contains,
-        get_start_end)
+                                   count_parfor_OneDs, count_array_OneDs, dist_IR_contains,
+                                   get_start_end)
 
 
 class TestJoin(unittest.TestCase):
     @unittest.skip('Error - fix needed\n'
-            'NUMA_PES=3 build')
+                   'NUMA_PES=3 build')
     def test_join1(self):
         def test_impl(n):
-            df1 = pd.DataFrame({'key1': np.arange(n)+3, 'A': np.arange(n)+1.0})
-            df2 = pd.DataFrame({'key2': 2*np.arange(n)+1, 'B': n+np.arange(n)+1.0})
+            df1 = pd.DataFrame({'key1': np.arange(n) + 3, 'A': np.arange(n) + 1.0})
+            df2 = pd.DataFrame({'key2': 2 * np.arange(n) + 1, 'B': n + np.arange(n) + 1.0})
             df3 = pd.merge(df1, df2, left_on='key1', right_on='key2')
             return df3.B.sum()
@@ -41,12 +41,12 @@ def test_impl(df1, df2):
         hpat_func = hpat.jit(test_impl)
 
         n = 11
-        df1 = pd.DataFrame({'key1': np.arange(n)+3, 'A': np.arange(n)+1.0})
-        df2 = pd.DataFrame({'key2': 2*np.arange(n)+1, 'B': n+np.arange(n)+1.0})
+        df1 = pd.DataFrame({'key1': np.arange(n) + 3, 'A': np.arange(n) + 1.0})
+        df2 = pd.DataFrame({'key2': 2 * np.arange(n) + 1, 'B': n + np.arange(n) + 1.0})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
         n = 11111
-        df1 = pd.DataFrame({'key1': np.arange(n)+3, 'A': np.arange(n)+1.0})
-        df2 = pd.DataFrame({'key2': 2*np.arange(n)+1, 'B': n+np.arange(n)+1.0})
+        df1 = pd.DataFrame({'key1': np.arange(n) + 3, 'A': np.arange(n) + 1.0})
+        df2 = pd.DataFrame({'key2': 2 * np.arange(n) + 1, 'B': n + np.arange(n) + 1.0})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
     def test_join1_seq_str(self):
@@ -77,13 +77,13 @@ def test_impl(df1, df2):
             return df1.merge(df2, on=['A', 'B'])
 
         hpat_func = hpat.jit(test_impl)
-        df1 = pd.DataFrame({'A': [3,1,1,3,4],
-                            'B': [1,2,3,2,3],
-                            'C': [7,8,9,4,5]})
+        df1 = pd.DataFrame({'A': [3, 1, 1, 3, 4],
+                            'B': [1, 2, 3, 2, 3],
+                            'C': [7, 8, 9, 4, 5]})
 
-        df2 = pd.DataFrame({'A': [2,1,4,4,3],
-                            'B': [1,3,2,3,2],
-                            'D': [1,2,3,4,8]})
+        df2 = pd.DataFrame({'A': [2, 1, 4, 4, 3],
+                            'B': [1, 3, 2, 3, 2],
+                            'D': [1, 2, 3, 4, 8]})
 
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
@@ -102,14 +102,14 @@ def test_impl(A1, B1, C1, A2, B2, D2):
             'C1:input': 'distributed',
             'A2:input': 'distributed',
             'B2:input': 'distributed',
-            'D2:input': 'distributed',})(test_impl)
-        df1 = pd.DataFrame({'A': [3,1,1,3,4],
-                            'B': [1,2,3,2,3],
-                            'C': [7,8,9,4,5]})
+            'D2:input': 'distributed', })(test_impl)
+        df1 = pd.DataFrame({'A': [3, 1, 1, 3, 4],
+                            'B': [1, 2, 3, 2, 3],
+                            'C': [7, 8, 9, 4, 5]})
 
-        df2 = pd.DataFrame({'A': [2,1,4,4,3],
-                            'B': [1,3,2,3,2],
-                            'D': [1,2,3,4,8]})
+        df2 = pd.DataFrame({'A': [2, 1, 4, 4, 3],
+                            'B': [1, 3, 2, 3, 2],
+                            'D': [1, 2, 3, 4, 8]})
 
         start, end = get_start_end(len(df1))
         h_A1 = df1.A.values[start:end]
@@ -142,14 +142,14 @@ def test_impl(A1, B1, C1, A2, B2, D2):
         hpat_func = hpat.jit(locals={
             'A1:input': 'distributed',
             'B1:input': 'distributed',
-            'C1:input': 'distributed',})(test_impl)
-        df1 = pd.DataFrame({'A': [3,1,1,3,4],
-                            'B': [1,2,3,2,3],
-                            'C': [7,8,9,4,5]})
+            'C1:input': 'distributed', })(test_impl)
+        df1 = pd.DataFrame({'A': [3, 1, 1, 3, 4],
+                            'B': [1, 2, 3, 2, 3],
+                            'C': [7, 8, 9, 4, 5]})
 
-        df2 = pd.DataFrame({'A': [2,1,4,4,3],
-                            'B': [1,3,2,3,2],
-                            'D': [1,2,3,4,8]})
+        df2 = pd.DataFrame({'A': [2, 1, 4, 4, 3],
+                            'B': [1, 3, 2, 3, 2],
+                            'D': [1, 2, 3, 4, 8]})
 
         start, end = get_start_end(len(df1))
         h_A1 = df1.A.values[start:end]
@@ -217,7 +217,7 @@ def test_impl(df1, df2):
         df2 = pd.DataFrame(
             {'time': pd.DatetimeIndex(
                 ['2017-01-01', '2017-01-02', '2017-01-04', '2017-02-23',
-                 '2017-02-25']), 'A': [2,3,7,8,9]})
+                 '2017-02-25']), 'A': [2, 3, 7, 8, 9]})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
     @unittest.skip('AssertionError - fix needed\n'
@@ -240,9 +240,9 @@ def test_impl(df1, df2):
         hpat_func = hpat.jit(test_impl)
 
         df1 = pd.DataFrame(
-            {'key': [2,3,5,1,2,8], 'A': np.array([4,6,3,9,9,-1], np.float)})
+            {'key': [2, 3, 5, 1, 2, 8], 'A': np.array([4, 6, 3, 9, 9, -1], np.float)})
         df2 = pd.DataFrame(
-            {'key': [1,2,9,3,2], 'B': np.array([1,7,2,6,5], np.float)})
+            {'key': [1, 2, 9, 3, 2], 'B': np.array([1, 7, 2, 6, 5], np.float)})
         h_res = hpat_func(df1, df2)
         res = test_impl(df1, df2)
         np.testing.assert_array_equal(h_res.key.values, res.key.values)
@@ -258,9 +258,9 @@ def test_impl(df1, df2):
         hpat_func = hpat.jit(test_impl)
         # test left run where a key is repeated on left but not right side
         df1 = pd.DataFrame(
-            {'key': [2,3,5,3,2,8], 'A': np.array([4,6,3,9,9,-1], np.float)})
+            {'key': [2, 3, 5, 3, 2, 8], 'A': np.array([4, 6, 3, 9, 9, -1], np.float)})
         df2 = pd.DataFrame(
-            {'key': [1,2,9,3,10], 'B': np.array([1,7,2,6,5], np.float)})
+            {'key': [1, 2, 9, 3, 10], 'B': np.array([1, 7, 2, 6, 5], np.float)})
         h_res = hpat_func(df1, df2)
         res = test_impl(df1, df2)
         np.testing.assert_array_equal(h_res.key.values, res.key.values)
@@ -275,9 +275,9 @@ def test_impl(df1, df2):
         hpat_func = hpat.jit(test_impl)
 
         df1 = pd.DataFrame(
-            {'key': [2,3,5,1,2,8], 'A': np.array([4,6,3,9,9,-1], np.float)})
+            {'key': [2, 3, 5, 1, 2, 8], 'A': np.array([4, 6, 3, 9, 9, -1], np.float)})
         df2 = pd.DataFrame(
-            {'key': [1,2,9,3,2], 'B': np.array([1,7,2,6,5], np.float)})
+            {'key': [1, 2, 9, 3, 2], 'B': np.array([1, 7, 2, 6, 5], np.float)})
         h_res = hpat_func(df1, df2)
         res = test_impl(df1, df2)
         self.assertEqual(set(h_res.key.values), set(res.key.values))
@@ -292,9 +292,9 @@ def test_impl(df1, df2):
         hpat_func = hpat.jit(test_impl)
 
         df1 = pd.DataFrame(
-            {'key': [2,3,5,1,2,8], 'A': np.array([4,6,3,9,9,-1], np.float)})
+            {'key': [2, 3, 5, 1, 2, 8], 'A': np.array([4, 6, 3, 9, 9, -1], np.float)})
         df2 = pd.DataFrame(
-            {'key': [1,2,9,3,2], 'B': np.array([1,7,2,6,5], np.float)})
+            {'key': [1, 2, 9, 3, 2], 'B': np.array([1, 7, 2, 6, 5], np.float)})
         h_res = hpat_func(df1, df2)
         res = test_impl(df1, df2)
         self.assertEqual(set(h_res.key.values), set(res.key.values))
@@ -313,22 +313,22 @@ def test_impl(df1, df2, df3, df4):
         hpat_func = hpat.jit(test_impl)
 
         n = 11
-        df1 = pd.DataFrame({'A': np.arange(n)+3, 'AA': np.arange(n)+1.0})
-        df2 = pd.DataFrame({'A': 2*np.arange(n)+1, 'AAA': n+np.arange(n)+1.0})
-        df3 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BB': n+np.arange(n)+1.0})
-        df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
+        df1 = pd.DataFrame({'A': np.arange(n) + 3, 'AA': np.arange(n) + 1.0})
+        df2 = pd.DataFrame({'A': 2 * np.arange(n) + 1, 'AAA': n + np.arange(n) + 1.0})
+        df3 = pd.DataFrame({'B': 2 * np.arange(n) + 1, 'BB': n + np.arange(n) + 1.0})
+        df4 = pd.DataFrame({'B': 2 * np.arange(n) + 1, 'BBB': n + np.arange(n) + 1.0})
         pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1],
                                       test_impl(df1, df2, df3, df4)[1])
 
     def test_join_cat1(self):
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
-            dtypes = {'C1':np.int, 'C2': ct_dtype, 'C3':str}
+            dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
             df1 = pd.read_csv("csv_data_cat1.csv",
-                names=['C1', 'C2', 'C3'],
-                dtype=dtypes,
-            )
+                              names=['C1', 'C2', 'C3'],
+                              dtype=dtypes,
+                              )
             n = len(df1)
-            df2 = pd.DataFrame({'C1': 2*np.arange(n)+1, 'AAA': n+np.arange(n)+1.0})
+            df2 = pd.DataFrame({'C1': 2 * np.arange(n) + 1, 'AAA': n + np.arange(n) + 1.0})
             df3 = df1.merge(df2, on='C1')
             return df3
@@ -339,13 +339,13 @@ def test_join_cat2(self):
         # test setting NaN in categorical array
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
-            dtypes = {'C1':np.int, 'C2': ct_dtype, 'C3':str}
+            dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
             df1 = pd.read_csv("csv_data_cat1.csv",
-                names=['C1', 'C2', 'C3'],
-                dtype=dtypes,
-            )
+                              names=['C1', 'C2', 'C3'],
+                              dtype=dtypes,
+                              )
             n = len(df1)
-            df2 = pd.DataFrame({'C1': 2*np.arange(n)+1, 'AAA': n+np.arange(n)+1.0})
+            df2 = pd.DataFrame({'C1': 2 * np.arange(n) + 1, 'AAA': n + np.arange(n) + 1.0})
             df3 = df1.merge(df2, on='C1', how='right')
             return df3
@@ -358,13 +358,13 @@ def test_join_cat_parallel1(self):
         # TODO: cat as keys
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
-            dtypes = {'C1':np.int, 'C2': ct_dtype, 'C3':str}
+            dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
             df1 = pd.read_csv("csv_data_cat1.csv",
-                names=['C1', 'C2', 'C3'],
-                dtype=dtypes,
-            )
+                              names=['C1', 'C2', 'C3'],
+                              dtype=dtypes,
+                              )
             n = len(df1)
-            df2 = pd.DataFrame({'C1': 2*np.arange(n)+1, 'AAA': n+np.arange(n)+1.0})
+            df2 = pd.DataFrame({'C1': 2 * np.arange(n) + 1, 'AAA': n + np.arange(n) + 1.0})
             df3 = df1.merge(df2, on='C1')
             return df3
diff --git a/hpat/tests/test_ml.py b/hpat/tests/test_ml.py
index d546d6b3e..d6f513cda 100644
--- a/hpat/tests/test_ml.py
+++ b/hpat/tests/test_ml.py
@@ -5,9 +5,9 @@ import numba
 import hpat
 from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs,
-        count_parfor_OneDs, count_array_OneDs,
-        count_parfor_OneD_Vars, count_array_OneD_Vars,
-        dist_IR_contains)
+                                   count_parfor_OneDs, count_array_OneDs,
+                                   count_parfor_OneD_Vars, count_array_OneD_Vars,
+                                   dist_IR_contains)
 
 
 class TestML(unittest.TestCase):
@@ -17,12 +17,12 @@ class TestML(unittest.TestCase):
     def test_logistic_regression(self):
         def test_impl(n, d):
             iterations = 3
-            X = np.ones((n,d))+.5
+            X = np.ones((n, d)) + .5
             Y = np.ones(n)
             D = X.shape[1]
-            w = np.ones(D)-0.5
+            w = np.ones(D) - 0.5
             for i in range(iterations):
-                w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X,w))) - 1.0) * Y), X)
+                w -= np.dot(((1.0 / (1.0 + np.exp(-Y * np.dot(X, w))) - 1.0) * Y), X)
             return w
 
         hpat_func = hpat.jit(test_impl)
@@ -39,7 +39,7 @@ def test_impl(N, D):
             iterations = 3
             g = 2 * np.ones(D) - 1
             X = 2 * np.ones((N, D)) - 1
-            Y = ((np.dot(X, g) > 0.0) == (np.ones(N) > .90))+.0
+            Y = ((np.dot(X, g) > 0.0) == (np.ones(N) > .90)) + .0
             w = 2 * np.ones(D) - 1
 
             for i in range(iterations):
@@ -89,7 +89,7 @@ def test_impl(n):
                 p = X[i]
                 d = (-(p - points)**2) / (2 * b**2)
                 m = np.min(d)
-                exps += m - np.log(b * N)+np.log(np.sum(np.exp(d - m)))
+                exps += m - np.log(b * N) + np.log(np.sum(np.exp(d - m)))
             return exps
 
         hpat_func = hpat.jit(test_impl)
@@ -105,12 +105,12 @@ def test_impl(numCenter, numIter, N, D):
            centroids = np.zeros((numCenter, D))
 
            for l in range(numIter):
-               dist = np.array([[sqrt(np.sum((A[i,:]-centroids[j,:])**2))
-                                   for j in range(numCenter)] for i in range(N)])
-               labels = np.array([dist[i,:].argmin() for i in range(N)])
+               dist = np.array([[sqrt(np.sum((A[i, :] - centroids[j, :])**2))
+                                 for j in range(numCenter)] for i in range(N)])
+               labels = np.array([dist[i, :].argmin() for i in range(N)])
 
-               centroids = np.array([[np.sum(A[labels==i, j])/np.sum(labels==i)
-                                   for j in range(D)] for i in range(numCenter)])
+               centroids = np.array([[np.sum(A[labels == i, j]) / np.sum(labels == i)
+                                      for j in range(D)] for i in range(numCenter)])
 
            return centroids
@@ -122,5 +122,6 @@ def test_impl(numCenter, numIter, N, D):
         self.assertEqual(count_parfor_OneDs(), 5)
         self.assertEqual(count_parfor_OneD_Vars(), 1)
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/hpat/tests/test_rolling.py b/hpat/tests/test_rolling.py
index 856b538ae..728e8425f 100644
--- a/hpat/tests/test_rolling.py
+++ b/hpat/tests/test_rolling.py
@@ -6,7 +6,7 @@ import numba
 import hpat
 from hpat.tests.test_utils import (count_array_REPs, count_parfor_REPs,
-        count_parfor_OneDs, count_array_OneDs, dist_IR_contains)
+                                   count_parfor_OneDs, count_array_OneDs, dist_IR_contains)
 from hpat.hiframes.rolling import supported_rolling_funcs
 
 LONG_TEST = (int(os.environ['HPAT_LONG_ROLLING_TEST']) != 0
@@ -132,17 +132,17 @@ def test_impl(n, w, center):
     def test_variable1(self):
         # test sequentially with manually created dfs
         df1 = pd.DataFrame({'B': [0, 1, 2, np.nan, 4],
-                'time': [pd.Timestamp('20130101 09:00:00'),
-                 pd.Timestamp('20130101 09:00:02'),
-                 pd.Timestamp('20130101 09:00:03'),
-                 pd.Timestamp('20130101 09:00:05'),
-                 pd.Timestamp('20130101 09:00:06')]})
+                            'time': [pd.Timestamp('20130101 09:00:00'),
+                                     pd.Timestamp('20130101 09:00:02'),
+                                     pd.Timestamp('20130101 09:00:03'),
+                                     pd.Timestamp('20130101 09:00:05'),
+                                     pd.Timestamp('20130101 09:00:06')]})
         df2 = pd.DataFrame({'B': [0, 1, 2, -2, 4],
-                'time': [pd.Timestamp('20130101 09:00:01'),
-                 pd.Timestamp('20130101 09:00:02'),
-                 pd.Timestamp('20130101 09:00:03'),
-                 pd.Timestamp('20130101 09:00:04'),
-                 pd.Timestamp('20130101 09:00:09')]})
+                            'time': [pd.Timestamp('20130101 09:00:01'),
+                                     pd.Timestamp('20130101 09:00:02'),
+                                     pd.Timestamp('20130101 09:00:03'),
+                                     pd.Timestamp('20130101 09:00:04'),
+                                     pd.Timestamp('20130101 09:00:09')]})
         wins = ('2s',)
         if LONG_TEST:
             wins = ('1s', '2s', '3s', '4s')
@@ -181,17 +181,17 @@ def test_variable2(self):
     def test_variable_apply1(self):
         # test sequentially with manually created dfs
         df1 = pd.DataFrame({'B': [0, 1, 2, np.nan, 4],
-                'time': [pd.Timestamp('20130101 09:00:00'),
-                 pd.Timestamp('20130101 09:00:02'),
-                 pd.Timestamp('20130101 09:00:03'),
-                 pd.Timestamp('20130101 09:00:05'),
-                 pd.Timestamp('20130101 09:00:06')]})
+                            'time': [pd.Timestamp('20130101 09:00:00'),
+                                     pd.Timestamp('20130101 09:00:02'),
+                                     pd.Timestamp('20130101 09:00:03'),
+                                     pd.Timestamp('20130101 09:00:05'),
+                                     pd.Timestamp('20130101 09:00:06')]})
         df2 = pd.DataFrame({'B': [0, 1, 2, -2, 4],
-                'time': [pd.Timestamp('20130101 09:00:01'),
-                 pd.Timestamp('20130101 09:00:02'),
-                 pd.Timestamp('20130101 09:00:03'),
-                 pd.Timestamp('20130101 09:00:04'),
-                 pd.Timestamp('20130101 09:00:09')]})
+                            'time': [pd.Timestamp('20130101 09:00:01'),
+                                     pd.Timestamp('20130101 09:00:02'),
+                                     pd.Timestamp('20130101 09:00:03'),
+                                     pd.Timestamp('20130101 09:00:04'),
+                                     pd.Timestamp('20130101 09:00:09')]})
         wins = ('2s',)
         if LONG_TEST:
             wins = ('1s', '2s', '3s', '4s')
@@ -294,6 +294,7 @@ def test_series_fixed1(self):
             pd.testing.assert_series_equal(hpat_func(S1, *args), test_impl(S1, *args))
             pd.testing.assert_series_equal(hpat_func(S2, *args), test_impl(S2, *args))
         # test apply
+
         def apply_test_impl(S, w, c):
             return S.rolling(w, center=c).apply(lambda a: a.sum())
         hpat_func = hpat.jit(apply_test_impl)
@@ -312,12 +313,14 @@ def test_series_cov1(self):
         if LONG_TEST:
             wins = (2, 3, 5)
         centers = (False, True)
+
         def test_impl(S, S2, w, c):
             return S.rolling(w, center=c).cov(S2)
         hpat_func = hpat.jit(test_impl)
         for args in itertools.product([S1, S2], [S1, S2], wins, centers):
             pd.testing.assert_series_equal(hpat_func(*args), test_impl(*args))
             pd.testing.assert_series_equal(hpat_func(*args), test_impl(*args))
+
         def test_impl2(S, S2, w, c):
             return S.rolling(w, center=c).corr(S2)
         hpat_func = hpat.jit(test_impl2)
@@ -336,12 +339,14 @@ def test_df_cov1(self):
         if LONG_TEST:
             wins = (2, 3, 5)
         centers = (False, True)
+
         def test_impl(df, df2, w, c):
             return df.rolling(w, center=c).cov(df2)
         hpat_func = hpat.jit(test_impl)
         for args in itertools.product([df1, df2], [df1, df2], wins, centers):
             pd.testing.assert_frame_equal(hpat_func(*args), test_impl(*args))
             pd.testing.assert_frame_equal(hpat_func(*args), test_impl(*args))
+
         def test_impl2(df, df2, w, c):
             return df.rolling(w, center=c).corr(df2)
         hpat_func = hpat.jit(test_impl2)
@@ -349,5 +354,6 @@ def test_impl2(df, df2, w, c):
             pd.testing.assert_frame_equal(hpat_func(*args), test_impl2(*args))
             pd.testing.assert_frame_equal(hpat_func(*args), test_impl2(*args))
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/hpat/tests/test_strings.py b/hpat/tests/test_strings.py
index 37c4be50a..50229646c 100644
--- a/hpat/tests/test_strings.py
+++ b/hpat/tests/test_strings.py
@@ -65,7 +65,7 @@ def test_impl(_str):
     def test_concat(self):
         def test_impl(_str):
-            return (_str+'test_str')
+            return (_str + 'test_str')
 
         hpat_func = hpat.jit(test_impl)
         arg = 'a_'
@@ -210,7 +210,7 @@ def test_impl(ds):
         self.assertTrue(isinstance(ds, pd.Series) and isinstance(rs, pd.Series))
         self.assertTrue(ds[0] == 'one' and ds[2] == 'three' and
-                        rs[0] == True and rs[2] == False)
+                        rs[0] and not rs[2])
 
     def test_string_array_bool_getitem(self):
         def test_impl():