Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Add ops for strict decoding of windows-1251/1252
We have new versions of `encode`, `decode`, and `encoderep` that accept
a new config flag. These ops differ from the previous encode/decode
ops in that by default they decode using strict rather than permissive
methods: they throw on encountering a codepoint that is not mapped in
the standard, and can optionally be configured to decode permissively.
The names of these new ops all end with `conf`.

While the other ops are `conf` variants of existing ops, `decoderep`
never existed before, so `decoderepconf` has been added to allow
decoding with replacements.

By default with windows-1252 and windows-1251 these new ops:
* Throw when they encounter a character which does not map to the
  other encoding.
* When used in replacement mode (`*repconf`), replacements are also
  made for codepoints that would fit into the target encoding but are
  invalid in it (e.g. 129 in windows-1252).

This adds new ops:
* `decoderepconf`: strict; replaces decoded
   characters that don't have official mappings with a supplied
   replacement string. Currently it is limited to substituting the first
   grapheme of the supplied replacement string (should be useful in most
   cases).
* `encodeconf`: like `encode`, but strict by default and takes the new config flag
* `decodeconf`: like `decode`, but strict by default and takes the new config flag
* `encoderepconf`: like `encoderep`, but strict by default and takes the new config flag
  • Loading branch information
samcv committed Feb 20, 2018
1 parent b39fa0e commit 314c523
Show file tree
Hide file tree
Showing 10 changed files with 462 additions and 239 deletions.
278 changes: 157 additions & 121 deletions lib/MAST/Ops.nqp
Expand Up @@ -792,74 +792,78 @@ BEGIN {
1970,
1972,
1975,
1978,
1981,
1984,
1987,
1986,
1990,
1994,
1996,
1995,
1998,
2000,
2002,
2001,
2004,
2006,
2008,
2007,
2010,
2012,
2014,
2017,
2016,
2018,
2020,
2023,
2022,
2024,
2026,
2027,
2029,
2033,
2036,
2039,
2042,
2045,
2048,
2051,
2054,
2057,
2060,
2063,
2066,
2069,
2072,
2075,
2078,
2081,
2085,
2028,
2030,
2032,
2034,
2037,
2040,
2043,
2046,
2047,
2049,
2053,
2056,
2059,
2062,
2065,
2068,
2071,
2074,
2077,
2080,
2083,
2086,
2089,
2092,
2095,
2098,
2101,
2104,
2107,
2110,
2113,
2116,
2119,
2122,
2126,
2105,
2109,
2112,
2115,
2118,
2121,
2124,
2127,
2130,
2131,
2133,
2135,
2137,
2141,
2143,
2145,
2145,
2145,
2136,
2139,
2142,
2146,
2147,
2147,
2148,
2150);
2150,
2151,
2153,
2155,
2157,
2161,
2163,
2165,
2165,
2165,
2166,
2167,
2167,
2168,
2170);
MAST::Ops.WHO<@counts> := nqp::list_i(0,
2,
2,
Expand Down Expand Up @@ -1648,6 +1652,10 @@ BEGIN {
2,
2,
3,
6,
5,
4,
5,
3,
3,
3,
Expand Down Expand Up @@ -3692,6 +3700,26 @@ BEGIN {
66,
65,
65,
66,
57,
57,
57,
65,
33,
66,
57,
57,
65,
33,
58,
65,
57,
33,
58,
65,
57,
57,
33,
65,
128,
152,
Expand Down Expand Up @@ -4659,75 +4687,79 @@ BEGIN {
'getarg_s', 785,
'getarg_o', 786,
'coerce_II', 787,
'sp_guard', 788,
'sp_guardconc', 789,
'sp_guardtype', 790,
'sp_guardsf', 791,
'sp_guardsfouter', 792,
'sp_rebless', 793,
'sp_resolvecode', 794,
'sp_decont', 795,
'sp_getlex_o', 796,
'sp_getlex_ins', 797,
'sp_getlex_no', 798,
'sp_getarg_o', 799,
'sp_getarg_i', 800,
'sp_getarg_n', 801,
'sp_getarg_s', 802,
'sp_fastinvoke_v', 803,
'sp_fastinvoke_i', 804,
'sp_fastinvoke_n', 805,
'sp_fastinvoke_s', 806,
'sp_fastinvoke_o', 807,
'sp_paramnamesused', 808,
'sp_getspeshslot', 809,
'sp_findmeth', 810,
'sp_fastcreate', 811,
'sp_get_o', 812,
'sp_get_i64', 813,
'sp_get_i32', 814,
'sp_get_i16', 815,
'sp_get_i8', 816,
'sp_get_n', 817,
'sp_get_s', 818,
'sp_bind_o', 819,
'sp_bind_i64', 820,
'sp_bind_i32', 821,
'sp_bind_i16', 822,
'sp_bind_i8', 823,
'sp_bind_n', 824,
'sp_bind_s', 825,
'sp_p6oget_o', 826,
'sp_p6ogetvt_o', 827,
'sp_p6ogetvc_o', 828,
'sp_p6oget_i', 829,
'sp_p6oget_n', 830,
'sp_p6oget_s', 831,
'sp_p6obind_o', 832,
'sp_p6obind_i', 833,
'sp_p6obind_n', 834,
'sp_p6obind_s', 835,
'sp_deref_get_i64', 836,
'sp_deref_get_n', 837,
'sp_deref_bind_i64', 838,
'sp_deref_bind_n', 839,
'sp_getlexvia_o', 840,
'sp_getlexvia_ins', 841,
'sp_jit_enter', 842,
'sp_boolify_iter', 843,
'sp_boolify_iter_arr', 844,
'sp_boolify_iter_hash', 845,
'sp_cas_o', 846,
'sp_atomicload_o', 847,
'sp_atomicstore_o', 848,
'prof_enter', 849,
'prof_enterspesh', 850,
'prof_enterinline', 851,
'prof_enternative', 852,
'prof_exit', 853,
'prof_allocated', 854,
'ctw_check', 855,
'coverage_log', 856);
'encoderepconf', 788,
'encodeconf', 789,
'decodeconf', 790,
'decoderepconf', 791,
'sp_guard', 792,
'sp_guardconc', 793,
'sp_guardtype', 794,
'sp_guardsf', 795,
'sp_guardsfouter', 796,
'sp_rebless', 797,
'sp_resolvecode', 798,
'sp_decont', 799,
'sp_getlex_o', 800,
'sp_getlex_ins', 801,
'sp_getlex_no', 802,
'sp_getarg_o', 803,
'sp_getarg_i', 804,
'sp_getarg_n', 805,
'sp_getarg_s', 806,
'sp_fastinvoke_v', 807,
'sp_fastinvoke_i', 808,
'sp_fastinvoke_n', 809,
'sp_fastinvoke_s', 810,
'sp_fastinvoke_o', 811,
'sp_paramnamesused', 812,
'sp_getspeshslot', 813,
'sp_findmeth', 814,
'sp_fastcreate', 815,
'sp_get_o', 816,
'sp_get_i64', 817,
'sp_get_i32', 818,
'sp_get_i16', 819,
'sp_get_i8', 820,
'sp_get_n', 821,
'sp_get_s', 822,
'sp_bind_o', 823,
'sp_bind_i64', 824,
'sp_bind_i32', 825,
'sp_bind_i16', 826,
'sp_bind_i8', 827,
'sp_bind_n', 828,
'sp_bind_s', 829,
'sp_p6oget_o', 830,
'sp_p6ogetvt_o', 831,
'sp_p6ogetvc_o', 832,
'sp_p6oget_i', 833,
'sp_p6oget_n', 834,
'sp_p6oget_s', 835,
'sp_p6obind_o', 836,
'sp_p6obind_i', 837,
'sp_p6obind_n', 838,
'sp_p6obind_s', 839,
'sp_deref_get_i64', 840,
'sp_deref_get_n', 841,
'sp_deref_bind_i64', 842,
'sp_deref_bind_n', 843,
'sp_getlexvia_o', 844,
'sp_getlexvia_ins', 845,
'sp_jit_enter', 846,
'sp_boolify_iter', 847,
'sp_boolify_iter_arr', 848,
'sp_boolify_iter_hash', 849,
'sp_cas_o', 850,
'sp_atomicload_o', 851,
'sp_atomicstore_o', 852,
'prof_enter', 853,
'prof_enterspesh', 854,
'prof_enterinline', 855,
'prof_enternative', 856,
'prof_exit', 857,
'prof_allocated', 858,
'ctw_check', 859,
'coverage_log', 860);
MAST::Ops.WHO<@names> := nqp::list_s('no_op',
'const_i8',
'const_i16',
Expand Down Expand Up @@ -5516,6 +5548,10 @@ BEGIN {
'getarg_s',
'getarg_o',
'coerce_II',
'encoderepconf',
'encodeconf',
'decodeconf',
'decoderepconf',
'sp_guard',
'sp_guardconc',
'sp_guardtype',
Expand Down
20 changes: 20 additions & 0 deletions src/core/interp.c
Expand Up @@ -5288,6 +5288,26 @@ void MVM_interp_run(MVMThreadContext *tc, void (*initial_invoke)(MVMThreadContex
cur_op += 6;
goto NEXT;
}
OP(encoderepconf):
/* encoderepconf w(obj) r(str) r(str) r(str) r(obj) r(int64)
 *
 * Operand byte offsets: 0 = result (obj), 2 = str, 4 = str, 6 = str,
 * 8 = obj, 10 = int64 config. The result must be written to operand 0
 * (the only w() operand); operand 8 is a read-only input and must not
 * be used as the destination register.
 *
 * Note the argument order differs from the operand order: the obj at
 * offset 8 is passed before the str at offset 6, which occupies the
 * slot where encodeconf passes NULL.
 * NOTE(review): presumably the arguments are (string to encode,
 * encoding name, target buffer, replacement string, config flags) --
 * confirm against the declaration of MVM_string_encode_to_buf_config. */
GET_REG(cur_op, 0).o = MVM_string_encode_to_buf_config(tc, GET_REG(cur_op, 2).s,
GET_REG(cur_op, 4).s, GET_REG(cur_op, 8).o, GET_REG(cur_op, 6).s, GET_REG(cur_op, 10).i64);
/* Six operands at offsets 0..10: step 12 bytes to the next instruction. */
cur_op += 12;
goto NEXT;
OP(encodeconf):
/* encodeconf w(obj) r(str) r(str) r(obj) r(int64)
 *
 * Operand byte offsets: 0 = result (obj), 2 = str, 4 = str, 6 = obj,
 * 8 = int64 config. Calls the same helper as encoderepconf but passes
 * NULL in the slot where that op passes its third str operand.
 * NOTE(review): presumably the arguments are (string to encode,
 * encoding name, target buffer, no replacement, config flags) --
 * confirm against the declaration of MVM_string_encode_to_buf_config. */
GET_REG(cur_op, 0).o = MVM_string_encode_to_buf_config(tc, GET_REG(cur_op, 2).s,
GET_REG(cur_op, 4).s, GET_REG(cur_op, 6).o, NULL, GET_REG(cur_op, 8).i64);
/* Five operands at offsets 0..8: step 10 bytes to the next instruction. */
cur_op += 10;
goto NEXT;
OP(decodeconf):
/* decodeconf w(str) r(obj) r(str) r(int64)
 *
 * Operand byte offsets: 0 = result (str), 2 = obj, 4 = str,
 * 6 = int64 config. Passes NULL in the slot where decoderepconf passes
 * its second str operand.
 * NOTE(review): presumably the arguments are (buffer to decode,
 * encoding name, no replacement, config flags) -- confirm against the
 * declaration of MVM_string_decode_from_buf_config. */
GET_REG(cur_op, 0).s = MVM_string_decode_from_buf_config(tc,
GET_REG(cur_op, 2).o, GET_REG(cur_op, 4).s, NULL, GET_REG(cur_op, 6).i64);
/* Four operands at offsets 0..6: step 8 bytes to the next instruction. */
cur_op += 8;
goto NEXT;
OP(decoderepconf):
/* decoderepconf w(str) r(obj) r(str) r(str) r(int64)
 *
 * Operand byte offsets: 0 = result (str), 2 = obj, 4 = str, 6 = str,
 * 8 = int64 config. Same helper as decodeconf, but the str at offset 6
 * is supplied where decodeconf passes NULL.
 * NOTE(review): presumably the arguments are (buffer to decode,
 * encoding name, replacement string, config flags) -- confirm against
 * the declaration of MVM_string_decode_from_buf_config. */
GET_REG(cur_op, 0).s = MVM_string_decode_from_buf_config(tc,
GET_REG(cur_op, 2).o, GET_REG(cur_op, 4).s, GET_REG(cur_op, 6).s, GET_REG(cur_op, 8).i64);
/* Five operands at offsets 0..8: step 10 bytes to the next instruction. */
cur_op += 10;
goto NEXT;
OP(sp_guard): {
MVMObject *check = GET_REG(cur_op, 0).o;
MVMSTable *want = (MVMSTable *)tc->cur_frame
Expand Down
8 changes: 4 additions & 4 deletions src/core/oplabels.h
Expand Up @@ -789,6 +789,10 @@ static const void * const LABELS[] = {
&&OP_getarg_s,
&&OP_getarg_o,
&&OP_coerce_II,
&&OP_encoderepconf,
&&OP_encodeconf,
&&OP_decodeconf,
&&OP_decoderepconf,
&&OP_sp_guard,
&&OP_sp_guardconc,
&&OP_sp_guardtype,
Expand Down Expand Up @@ -1021,10 +1025,6 @@ static const void * const LABELS[] = {
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
&&OP_CALL_EXTOP,
&&OP_CALL_EXTOP,
&&OP_CALL_EXTOP,
Expand Down
4 changes: 4 additions & 0 deletions src/core/oplist
Expand Up @@ -841,6 +841,10 @@ getarg_n w(num64) r(int16)
getarg_s w(str) r(int16)
getarg_o w(obj) r(int16)
coerce_II w(obj) r(obj) r(obj) :pure
# Encode/decode ops that take a trailing r(int64) config operand. The
# interpreter passes that operand as the last argument to
# MVM_string_encode_to_buf_config / MVM_string_decode_from_buf_config.
# In the *rep* variants the last r(str) operand is passed in the argument
# slot where the non-rep variants pass NULL.
encoderepconf w(obj) r(str) r(str) r(str) r(obj) r(int64)
encodeconf w(obj) r(str) r(str) r(obj) r(int64)
decodeconf w(str) r(obj) r(str) r(int64)
decoderepconf w(str) r(obj) r(str) r(str) r(int64)

# Spesh ops. Naming convention: start with sp_. Must all be marked .s, which
# is how the validator knows to exclude them.
Expand Down

0 comments on commit 314c523

Please sign in to comment.