Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

590 lines (522 sloc) 17.194 kB
/*
* MacRuby implementation of Ruby 1.9 String.
*
* This file is covered by the Ruby license. See COPYING for more details.
*
* Copyright (C) 2007-2010, Apple Inc. All rights reserved.
* Copyright (C) 1993-2007 Yukihiro Matsumoto
* Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
* Copyright (C) 2000 Information-technology Promotion Agency, Japan
*/
#include <string.h>
#include "ruby/macruby.h"
#include "ruby/encoding.h"
#include "encoding.h"
#include "symbol.h"
// The Ruby-level Encoding class; add_encoding() stores it as the class of
// every encoding object it creates.
VALUE rb_cEncoding;
// Value reported by Encoding.default_internal; Init_PreEncoding() sets it to
// the UTF-8 encoding.
rb_encoding_t *default_internal = NULL;
// Value reported by Encoding.default_external; Init_PreEncoding() sets it to
// the UTF-8 encoding.
static rb_encoding_t *default_external = NULL;
// Table of all encodings, indexed by the encoding index given to
// add_encoding().
rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
// Placeholder string methods. add_encoding() installs them in the method
// table of every new encoding; each one simply aborts the process when it is
// called.
static void
str_undefined_update_flags(rb_str_t *self)
{
    abort();
}

static void
str_undefined_make_data_binary(rb_str_t *self)
{
    abort();
}

static bool
str_undefined_try_making_data_uchars(rb_str_t *self)
{
    abort();
}

static long
str_undefined_length(rb_str_t *self, bool ucs2_mode)
{
    abort();
}

static long
str_undefined_bytesize(rb_str_t *self)
{
    abort();
}

static character_boundaries_t
str_undefined_get_character_boundaries(rb_str_t *self, long index,
        bool ucs2_mode)
{
    abort();
}

static long
str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
        bool ucs2_mode)
{
    abort();
}

static void
str_undefined_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self,
        long *pos, UChar **utf16, long *utf16_length)
{
    abort();
}

static void
str_undefined_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16,
        long utf16_length, long *pos, char **bytes, long *bytes_length)
{
    abort();
}
// Encoding.list: builds an array containing every entry of rb_encodings,
// in table order.
static VALUE
mr_enc_s_list(VALUE klass, SEL sel)
{
    VALUE encodings = rb_ary_new2(ENCODINGS_COUNT);
    unsigned int idx = 0;
    while (idx < ENCODINGS_COUNT) {
        rb_ary_push(encodings, (VALUE)rb_encodings[idx]);
        ++idx;
    }
    return encodings;
}
// Encoding.name_list: returns an array of US-ASCII strings holding, for each
// encoding in table order, its public name followed by all of its aliases.
static VALUE
mr_enc_s_name_list(VALUE klass, SEL sel)
{
    VALUE names = rb_ary_new();
    unsigned int enc_idx = 0;
    while (enc_idx < ENCODINGS_COUNT) {
        rb_encoding_t *enc = RENC(rb_encodings[enc_idx]);
        // public name first, then the aliases
        rb_ary_push(names, rb_usascii_str_new2(enc->public_name));
        unsigned int alias_idx = 0;
        while (alias_idx < enc->aliases_count) {
            rb_ary_push(names, rb_usascii_str_new2(enc->aliases[alias_idx]));
            ++alias_idx;
        }
        ++enc_idx;
    }
    return names;
}
// Encoding.aliases: returns a hash mapping every alias (US-ASCII string) to
// the public name of the encoding it belongs to.
static VALUE
mr_enc_s_aliases(VALUE klass, SEL sel)
{
    VALUE alias_map = rb_hash_new();
    unsigned int enc_idx = 0;
    while (enc_idx < ENCODINGS_COUNT) {
        rb_encoding_t *enc = RENC(rb_encodings[enc_idx]);
        unsigned int alias_idx = 0;
        while (alias_idx < enc->aliases_count) {
            rb_hash_aset(alias_map,
                    rb_usascii_str_new2(enc->aliases[alias_idx]),
                    rb_usascii_str_new2(enc->public_name));
            ++alias_idx;
        }
        ++enc_idx;
    }
    return alias_map;
}
// Encoding.find(name): converts `name` to a string, looks it up with
// rb_enc_find() and returns the matching encoding object. Raises
// ArgumentError when no encoding has that name or alias.
static VALUE
mr_enc_s_find(VALUE klass, SEL sel, VALUE name)
{
    StringValue(name);
    rb_encoding_t *found = rb_enc_find(RSTRING_PTR(name));
    if (found != NULL) {
        return (VALUE)found;
    }
    rb_raise(rb_eArgError, "unknown encoding name - %s",
            RSTRING_PTR(name));
    return (VALUE)found;
}
// Encoding.default_internal: returns the current default internal encoding.
static VALUE
mr_enc_s_default_internal(VALUE klass, SEL sel)
{
    VALUE current = (VALUE)default_internal;
    return current;
}
// Encoding.default_internal=: resolves `enc` through rb_to_encoding(), stores
// the result as the default internal encoding and returns it.
static VALUE
mr_enc_set_default_internal(VALUE klass, SEL sel, VALUE enc)
{
    rb_encoding_t *requested = rb_to_encoding(enc);
    default_internal = requested;
    return (VALUE)requested;
}
// Encoding.default_external: returns the current default external encoding.
static VALUE
mr_enc_s_default_external(VALUE klass, SEL sel)
{
    VALUE current = (VALUE)default_external;
    return current;
}
// Encoding.default_external=: resolves `enc` through rb_to_encoding(), stores
// the result as the default external encoding and returns it.
static VALUE
mr_enc_set_default_external(VALUE klass, SEL sel, VALUE enc)
{
    rb_encoding_t *requested = rb_to_encoding(enc);
    default_external = requested;
    return (VALUE)requested;
}
// Encoding#name / Encoding#to_s: returns the public name of the encoding as
// a new US-ASCII string.
static VALUE
mr_enc_name(VALUE self, SEL sel)
{
    rb_encoding_t *enc = RENC(self);
    return rb_usascii_str_new2(enc->public_name);
}
// Encoding#inspect: returns "#<ClassName:public-name>".
static VALUE
mr_enc_inspect(VALUE self, SEL sel)
{
    return rb_sprintf(
            "#<%s:%s>",
            rb_obj_classname(self),
            RENC(self)->public_name);
}
// Encoding#names: returns an array with the public name of the encoding
// followed by each of its aliases, all as US-ASCII strings.
static VALUE
mr_enc_names(VALUE self, SEL sel)
{
    rb_encoding_t *enc = RENC(self);
    // one slot for the public name plus one per alias
    VALUE names = rb_ary_new2(enc->aliases_count + 1);
    rb_ary_push(names, rb_usascii_str_new2(enc->public_name));
    unsigned int alias_idx = 0;
    while (alias_idx < enc->aliases_count) {
        rb_ary_push(names, rb_usascii_str_new2(enc->aliases[alias_idx]));
        ++alias_idx;
    }
    return names;
}
// Encoding#ascii_compatible?: Qtrue when the encoding's ascii_compatible flag
// is set, Qfalse otherwise.
static VALUE
mr_enc_ascii_compatible_p(VALUE self, SEL sel)
{
    if (RENC(self)->ascii_compatible) {
        return Qtrue;
    }
    return Qfalse;
}
// Encoding#dummy?: always answers Qfalse, whatever the receiver is.
static VALUE
mr_enc_dummy_p(VALUE self, SEL sel)
{
    const VALUE never_dummy = Qfalse;
    return never_dummy;
}
// Builds the replacement string for `destination`: the bytes of U+FFFD for
// the UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE) encodings, and a US-ASCII '?'
// passed through str_simple_transcode() for every other encoding.
rb_str_t *
replacement_string_for_encoding(rb_encoding_t* destination)
{
    const char *fffd_bytes = NULL;
    long fffd_length = 0;

    if (destination == rb_encodings[ENCODING_UTF16BE]) {
        fffd_bytes = "\xFF\xFD";
        fffd_length = 2;
    }
    else if (destination == rb_encodings[ENCODING_UTF32BE]) {
        fffd_bytes = "\0\0\xFF\xFD";
        fffd_length = 4;
    }
    else if (destination == rb_encodings[ENCODING_UTF16LE]) {
        fffd_bytes = "\xFD\xFF";
        fffd_length = 2;
    }
    else if (destination == rb_encodings[ENCODING_UTF32LE]) {
        fffd_bytes = "\xFD\xFF\0\0";
        fffd_length = 4;
    }
    else if (destination == rb_encodings[ENCODING_UTF8]) {
        fffd_bytes = "\xEF\xBF\xBD";
        fffd_length = 3;
    }

    if (fffd_bytes != NULL) {
        return RSTR(rb_enc_str_new(fffd_bytes, fffd_length, destination));
    }

    // not a Unicode encoding: start from an ASCII '?' and transcode it
    rb_str_t *fallback =
        RSTR(rb_enc_str_new("?", 1, rb_encodings[ENCODING_ASCII]));
    fallback = str_simple_transcode(fallback, destination);
    return fallback;
}
// Defines the Encoding::XXX constant(s) for one encoding name or alias.
//
// name     - public name or alias of the encoding (NUL-terminated).
// encoding - the encoding object the constant(s) will refer to.
//
// No constant is defined when the name starts with a digit or is exactly
// "locale". Otherwise a leading lower-case ASCII letter is capitalised and
// every '.' and '-' becomes '_'. If the transformed name still contains
// lower-case letters, a second, fully upper-cased constant is defined too.
// Aborts if the temporary copy of the name cannot be allocated.
static void
define_encoding_constant(const char *name, rb_encoding_t *encoding)
{
    char c = name[0];
    if ((c >= '0') && (c <= '9')) {
	// constants can't start with a number
	return;
    }

    if (strcmp(name, "locale") == 0) {
	// there is no constant for locale
	return;
    }

    // work on a private copy: `name` belongs to the encoding table
    char *name_copy = strdup(name);
    if (name_copy == NULL) {
	// out of memory: fail explicitly instead of dereferencing NULL below
	abort();
    }
    if ((c >= 'a') && (c <= 'z')) {
	// the first character must be upper case
	name_copy[0] = c - ('a' - 'A');
    }

    bool has_lower_case = false;
    // '.' and '-' must be transformed into '_'
    for (int i = 0; name_copy[i]; ++i) {
	if ((name_copy[i] == '.') || (name_copy[i] == '-')) {
	    name_copy[i] = '_';
	}
	else if ((name_copy[i] >= 'a') && (name_copy[i] <= 'z')) {
	    has_lower_case = true;
	}
    }
    rb_define_const(rb_cEncoding, name_copy, (VALUE)encoding);

    // if the encoding name has lower case characters,
    // also define it in upper case
    if (has_lower_case) {
	for (int i = 0; name_copy[i]; ++i) {
	    if ((name_copy[i] >= 'a') && (name_copy[i] <= 'z')) {
		name_copy[i] = name_copy[i] - 'a' + 'A';
	    }
	}
	rb_define_const(rb_cEncoding, name_copy, (VALUE)encoding);
    }

    free(name_copy);
}
// Defined outside this file; add_encoding() calls it for encodings of type
// ENCODING_TYPE_UCNV after the default (aborting) methods have been installed.
extern void enc_init_ucnv_encoding(rb_encoding_t *encoding);
// Values accepted for the rb_encoding_type parameter of add_encoding().
enum {
// no extra initialisation is done by add_encoding() for this type
ENCODING_TYPE_SPECIAL = 0,
// add_encoding() passes the encoding to enc_init_ucnv_encoding()
ENCODING_TYPE_UCNV
};
// Creates one encoding object and registers it in rb_encodings.
//
// The variadic tail is the list of aliases (const char *), terminated by
// NULL. The alias pointers and public_name are stored as-is (not copied).
// Every string method is first set to an aborting placeholder; encodings of
// type ENCODING_TYPE_UCNV are then handed to enc_init_ucnv_encoding().
// Aborts on an unknown rb_encoding_type or when the alias array cannot be
// allocated.
static void
add_encoding(
	unsigned int encoding_index, // index of the encoding in the encodings
				     // array
	unsigned int rb_encoding_type,
	const char *public_name, // public name for the encoding
	unsigned char min_char_size,
	bool single_byte_encoding, // in the encoding a character takes only
				   // one byte
	bool ascii_compatible, // is the encoding ASCII compatible or not
	... // aliases for the encoding (should not include the public name)
	    // - must end with a NULL
	)
{
    assert(encoding_index < ENCODINGS_COUNT);

    // first pass over the variadic arguments: count the aliases
    unsigned int aliases_count = 0;
    va_list va_aliases;
    va_start(va_aliases, ascii_compatible);
    while (va_arg(va_aliases, const char *) != NULL) {
	++aliases_count;
    }
    va_end(va_aliases);

    // create an array for the aliases
    const char **aliases = (const char **)
	malloc(sizeof(const char *) * aliases_count);
    if (aliases == NULL && aliases_count > 0) {
	// malloc(0) may legitimately return NULL, a failed non-empty
	// allocation may not: stop here rather than write through NULL
	abort();
    }

    // second pass: copy the alias pointers
    va_start(va_aliases, ascii_compatible);
    for (unsigned int i = 0; i < aliases_count; ++i) {
	aliases[i] = va_arg(va_aliases, const char *);
    }
    va_end(va_aliases);

    // create the MacRuby object
    NEWOBJ(encoding, rb_encoding_t);
    encoding->basic.flags = 0;
    encoding->basic.klass = rb_cEncoding;
    rb_encodings[encoding_index] = encoding;
    GC_RETAIN(encoding); // it should never be deallocated

    // fill the fields
    encoding->index = encoding_index;
    encoding->public_name = public_name;
    encoding->min_char_size = min_char_size;
    encoding->single_byte_encoding = single_byte_encoding;
    encoding->ascii_compatible = ascii_compatible;
    encoding->aliases_count = aliases_count;
    encoding->aliases = aliases;

    // fill the default implementations with aborts
    encoding->methods.update_flags = str_undefined_update_flags;
    encoding->methods.make_data_binary = str_undefined_make_data_binary;
    encoding->methods.try_making_data_uchars =
	str_undefined_try_making_data_uchars;
    encoding->methods.length = str_undefined_length;
    encoding->methods.bytesize = str_undefined_bytesize;
    encoding->methods.get_character_boundaries =
	str_undefined_get_character_boundaries;
    encoding->methods.offset_in_bytes_to_index =
	str_undefined_offset_in_bytes_to_index;
    encoding->methods.transcode_to_utf16 =
	str_undefined_transcode_to_utf16;
    encoding->methods.transcode_from_utf16 =
	str_undefined_transcode_from_utf16;

    // type-specific initialisation
    switch (rb_encoding_type) {
	case ENCODING_TYPE_SPECIAL:
	    break;
	case ENCODING_TYPE_UCNV:
	    enc_init_ucnv_encoding(encoding);
	    break;
	default:
	    abort();
    }
}
// Registers every built-in encoding and makes UTF-8 the default internal and
// external encoding.
//
// This runs very early during start-up: do not call runtime methods from
// here, because the runtime may not be initialized properly yet.
void
Init_PreEncoding(void)
{
    // arguments: index, type, public name, min char size,
    //            single byte?, ASCII compatible?, aliases..., NULL
    add_encoding(ENCODING_BINARY, ENCODING_TYPE_SPECIAL, "ASCII-8BIT",
            1, true, true,
            "BINARY", NULL);
    add_encoding(ENCODING_ASCII, ENCODING_TYPE_UCNV, "US-ASCII",
            1, true, true,
            "ASCII", "ANSI_X3.4-1968", "646", NULL);
    add_encoding(ENCODING_UTF8, ENCODING_TYPE_UCNV, "UTF-8",
            1, false, true,
            "CP65001", "locale", NULL);
    add_encoding(ENCODING_UTF16BE, ENCODING_TYPE_UCNV, "UTF-16BE",
            2, false, false,
            NULL);
    add_encoding(ENCODING_UTF16LE, ENCODING_TYPE_UCNV, "UTF-16LE",
            2, false, false,
            NULL);
    add_encoding(ENCODING_UTF32BE, ENCODING_TYPE_UCNV, "UTF-32BE",
            4, false, false,
            "UCS-4BE", NULL);
    add_encoding(ENCODING_UTF32LE, ENCODING_TYPE_UCNV, "UTF-32LE",
            4, false, false,
            "UCS-4LE", NULL);
    add_encoding(ENCODING_ISO8859_1, ENCODING_TYPE_UCNV, "ISO-8859-1",
            1, true, true,
            "ISO8859-1", NULL);
    add_encoding(ENCODING_MACROMAN, ENCODING_TYPE_UCNV, "macRoman",
            1, true, true,
            NULL);
    add_encoding(ENCODING_MACCYRILLIC, ENCODING_TYPE_UCNV, "macCyrillic",
            1, true, true,
            NULL);
    add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5",
            1, false, true,
            "CP950", NULL);

    // FIXME: the ICU conversion tables do not seem to match Ruby's Japanese
    // conversion tables
    add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP",
            1, false, true,
            "eucJP", NULL);
    add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS",
            1, false, true,
            "SJIS", NULL);
    //add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL);
    //add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL);
    //add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);

    // both defaults start out as UTF-8
    default_internal = rb_encodings[ENCODING_UTF8];
    default_external = rb_encodings[ENCODING_UTF8];
}
void
Init_Encoding(void)
{
// rb_cEncoding is defined earlier in Init_PreVM().
rb_set_class_path(rb_cEncoding, rb_cObject, "Encoding");
rb_const_set(rb_cObject, rb_intern("Encoding"), rb_cEncoding);
rb_undef_alloc_func(rb_cEncoding);
rb_objc_define_method(rb_cEncoding, "to_s", mr_enc_name, 0);
rb_objc_define_method(rb_cEncoding, "inspect", mr_enc_inspect, 0);
rb_objc_define_method(rb_cEncoding, "name", mr_enc_name, 0);
rb_objc_define_method(rb_cEncoding, "names", mr_enc_names, 0);
rb_objc_define_method(rb_cEncoding, "dummy?", mr_enc_dummy_p, 0);
rb_objc_define_method(rb_cEncoding, "ascii_compatible?",
mr_enc_ascii_compatible_p, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "list", mr_enc_s_list, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "name_list",
mr_enc_s_name_list, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "aliases",
mr_enc_s_aliases, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "find", mr_enc_s_find, 1);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "compatible?",
mr_enc_s_is_compatible, 2); // in string.c
//rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
//rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external",
mr_enc_s_default_external, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external=",
mr_enc_set_default_external, 1);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_internal",
mr_enc_s_default_internal, 0);
rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_internal=",
mr_enc_set_default_internal, 1);
//rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
// Create constants.
for (unsigned int i = 0; i < ENCODINGS_COUNT; i++) {
rb_encoding_t *enc = rb_encodings[i];
define_encoding_constant(enc->public_name, enc);
for (unsigned int j = 0; j < enc->aliases_count; j++) {
define_encoding_constant(enc->aliases[j], enc);
}
}
}
// MRI C-API compatibility.
// Looks an encoding up by name. The comparison is case-insensitive and is
// done against the public name and then each alias, in table order.
// Returns the first matching encoding, or NULL when nothing matches.
rb_encoding_t *
rb_enc_find(const char *name)
{
    unsigned int enc_idx = 0;
    while (enc_idx < ENCODINGS_COUNT) {
        rb_encoding_t *candidate = rb_encodings[enc_idx];
        bool matches = strcasecmp(candidate->public_name, name) == 0;
        for (unsigned int k = 0; !matches && k < candidate->aliases_count;
                k++) {
            matches = strcasecmp(candidate->aliases[k], name) == 0;
        }
        if (matches) {
            return candidate;
        }
        enc_idx++;
    }
    return NULL;
}
// Returns the encoding pointer reinterpreted as a VALUE; the encoding
// structure itself is the Ruby object.
VALUE
rb_enc_from_encoding(rb_encoding_t *enc)
{
    VALUE as_object = (VALUE)enc;
    return as_object;
}
// Returns the encoding attached to `obj`:
//  - T_STRING: the string's own encoding when IS_RSTR(obj) holds, otherwise
//    the UTF-8 encoding;
//  - T_SYMBOL: the encoding of the string returned by rb_sym_str();
//  - anything else: NULL.
rb_encoding_t *
rb_enc_get(VALUE obj)
{
    const int type = TYPE(obj);
    if (type == T_SYMBOL) {
        return rb_enc_get(rb_sym_str(obj));
    }
    if (type != T_STRING) {
        return NULL;
    }
    if (IS_RSTR(obj)) {
        return RSTR(obj)->encoding;
    }
    return rb_encodings[ENCODING_UTF8];
}
// Converts `obj` to an encoding. An object whose class is Encoding is
// returned directly; anything else is converted to a string and looked up by
// name. Raises ArgumentError when the name is unknown.
rb_encoding_t *
rb_to_encoding(VALUE obj)
{
    if (CLASS_OF(obj) == rb_cEncoding) {
        return RENC(obj);
    }

    StringValue(obj);
    rb_encoding_t *found = rb_enc_find(RSTRING_PTR(obj));
    if (found == NULL) {
        rb_raise(rb_eArgError, "unknown encoding name - %s",
                RSTRING_PTR(obj));
    }
    return found;
}
// Returns the public name of `enc` as a C string (the pointer stored in the
// encoding, not a copy).
const char *
rb_enc_name(rb_encoding_t *enc)
{
    const char *public_name = RENC(enc)->public_name;
    return public_name;
}
// Returns the public name of `enc` as a new US-ASCII Ruby string.
VALUE
rb_enc_name2(rb_encoding_t *enc)
{
    const char *public_name = rb_enc_name(enc);
    return rb_usascii_str_new2(public_name);
}
// Returns the min_char_size recorded for `enc` when it was registered.
long
rb_enc_mbminlen(rb_encoding_t *enc)
{
    long min_len = enc->min_char_size;
    return min_len;
}
// Returns 1 for single-byte encodings and 10 for all others.
long
rb_enc_mbmaxlen(rb_encoding_t *enc)
{
    if (enc->single_byte_encoding) {
        return 1;
    }
    return 10; // XXX 10?
}
rb_encoding *
rb_ascii8bit_encoding(void)
{
return rb_encodings[ENCODING_BINARY];
}
rb_encoding *
rb_utf8_encoding(void)
{
return rb_encodings[ENCODING_UTF8];
}
rb_encoding *
rb_usascii_encoding(void)
{
return rb_encodings[ENCODING_ASCII];
}
// Returns the locale encoding. XXX: currently this is always the UTF-8
// table entry; the actual locale is not consulted.
rb_encoding_t *
rb_locale_encoding(void)
{
    rb_encoding_t *utf8 = rb_encodings[ENCODING_UTF8];
    return utf8;
}
// Sets the default external encoding. `encoding` must be an object whose
// class is Encoding; this is only checked with an assertion.
void
rb_enc_set_default_external(VALUE encoding)
{
    assert(rb_cEncoding == CLASS_OF(encoding));
    rb_encoding_t *new_default = RENC(encoding);
    default_external = new_default;
}
// Returns the current default internal encoding.
rb_encoding *
rb_default_internal_encoding(void)
{
    rb_encoding *current = (rb_encoding *)default_internal;
    return current;
}
// Returns the position of `enc` in rb_encodings, or -1 when `enc` is NULL or
// is not one of the table entries.
static int
index_of_encoding(rb_encoding_t *enc)
{
    if (enc == NULL) {
        return -1;
    }
    int pos = 0;
    while (pos < ENCODINGS_COUNT) {
        if (enc == rb_encodings[pos]) {
            return pos;
        }
        pos++;
    }
    return -1;
}
// Returns the table index of the encoding attached to `obj`, or -1 when
// rb_enc_get() yields no table encoding for it.
int
rb_enc_get_index(VALUE obj)
{
    rb_encoding_t *enc = rb_enc_get(obj);
    return index_of_encoding(enc);
}
// Forces the encoding of `obj` to the table entry at `encindex`.
//
// obj      - the object passed on to rb_str_force_encoding().
// encindex - index into rb_encodings.
//
// An index outside [0, ENCODINGS_COUNT) is ignored and `obj` is left
// untouched. The previous check was inverted: it returned early for every
// valid index and indexed rb_encodings out of bounds for invalid ones.
void
rb_enc_set_index(VALUE obj, int encindex)
{
    if (encindex < 0 || encindex >= ENCODINGS_COUNT) {
	return;
    }
    rb_str_force_encoding(obj, rb_encodings[encindex]);
}
// Converts `enc` (an Encoding object or a string naming one) to a table
// index. Returns -1 when:
//  - `enc` is neither an Encoding nor a T_STRING,
//  - it is not a table entry and cannot be converted to a string,
//  - the string's own encoding is not ASCII compatible, or
//  - no encoding has that name.
int
rb_to_encoding_index(VALUE enc)
{
    if (!(CLASS_OF(enc) == rb_cEncoding || TYPE(enc) == T_STRING)) {
        return -1;
    }

    // first try to treat the value as an encoding object
    const int table_idx = index_of_encoding((rb_encoding_t *)enc);
    if (table_idx >= 0) {
        return table_idx;
    }

    // otherwise it has to be an encoding name
    enc = rb_check_string_type(enc);
    if (NIL_P(enc)) {
        return -1;
    }
    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
        return -1;
    }
    return rb_enc_find_index(StringValueCStr(enc));
}
int
rb_enc_find_index(const char *name)
{
return index_of_encoding(rb_enc_find(name));
}
int
rb_ascii8bit_encindex(void)
{
return index_of_encoding(rb_encodings[ENCODING_BINARY]);
}
int
rb_utf8_encindex(void)
{
return index_of_encoding(rb_encodings[ENCODING_UTF8]);
}
int
rb_usascii_encindex(void)
{
return index_of_encoding(rb_encodings[ENCODING_ASCII]);
}
rb_encoding *
rb_enc_from_index(int idx)
{
assert(idx >= 0 && idx < ENCODINGS_COUNT);
return rb_encodings[idx];
}
// Forces the encoding of the string `obj` to the table entry at `idx` and
// returns `obj`. Raises ArgumentError when `obj` is not a T_STRING. For
// strings, `idx` must lie in [0, ENCODINGS_COUNT) (assertion only).
VALUE
rb_enc_associate_index(VALUE obj, int idx)
{
    if (TYPE(obj) != T_STRING) {
        rb_raise(rb_eArgError, "cannot set encoding on non-string object");
    }
    assert(idx >= 0 && idx < ENCODINGS_COUNT);
    rb_str_force_encoding(obj, rb_encodings[idx]);
    return obj;
}
Jump to Line
Something went wrong with that request. Please try again.