#
# kana2rom.rb
# A Ruby module for converting between hiragana, katakana and romaji.
#
# ---------------------------------------------------------------------------------
# K.Kodama 2002.06
# This script is distributed freely in the sense of GNU General Public License.
# http://www.gnu.org/licenses/gpl.html
#
# ---------------------------------------------------------------------------------
# Paul Chapman (paul [a../t] longweekendmobile 2010-04-01)
# Repaired script to work with modern Ruby versions (1.8.6+), added comments,
# made it support gaijin friendly transliterations!
# ---------------------------------------------------------------------------------
# USAGE
#
# Include kana2rom
#
# kana2rom(str) かな --> ロ-マ字 変換 / hira/katakana ->> romaji conv
# rom2kata(str) ロ-マ字 --> 片仮名 変換 / romaji --> katakana conv
# rom2hira(str) ロ-マ字 --> 平仮名 変換 / romaji --> hiragana conv
# hira2kata(str) 平仮名 --> 片仮名 変換 / hiragana --> katakana conv
# kata2hira(str) 片仮名 --> 平仮名 変換 / katakana ->> hiragana conv
# kana2kana(str) converts in both directions (hiragana <-> katakana) and returns an array of the unique resulting strings
#
# ---------------------------------------------------------------------------------
module Kana2rom
# Per-character lookup table consulted by kana2rom: each key is one input
# character and each value is the romaji fragment emitted for it. Small kana
# are written with an "x" prefix (xa, xya, xtsu, ...) so that kana2rom's regex
# pass can fold them into the preceding syllable afterwards.
#
# NOTE(review): as written here almost every key is the empty string literal.
# Duplicate keys in a hash literal overwrite one another, so the hash that is
# actually built holds only a couple of entries. The kana keys appear to have
# been lost (encoding/copy problem?) -- restore them from an intact copy.
Kana2romH={
""=>"a", ""=>"i", ""=>"u", ""=>"e",""=>"o",
""=>"a", ""=>"i", ""=>"u", ""=>"e",""=>"o",
""=>"ka", ""=>"ki", ""=>"ku", ""=>"ke", ""=>"ko",
""=>"ka", ""=>"ki", ""=>"ku", ""=>"ke", ""=>"ko",
""=>"ga", ""=>"gi", ""=>"gu", ""=>"ge", ""=>"go",
""=>"ga", ""=>"gi", ""=>"gu", ""=>"ge", ""=>"go",
""=>"sa", ""=>"si", ""=>"su", ""=>"se", ""=>"so",
""=>"sa", ""=>"shi",""=>"su", ""=>"se", ""=>"so",
""=>"za", ""=>"ji", ""=>"zu", ""=>"ze", ""=>"zo",
""=>"za", ""=>"ji", ""=>"zu", ""=>"ze", ""=>"zo",
""=>"ta", ""=>"chi",""=>"tsu",""=>"te", ""=>"to",
""=>"ta", ""=>"chi",""=>"tsu",""=>"te", ""=>"to",
""=>"da", ""=>"dji",""=>"dzu",""=>"de", ""=>"do",
""=>"da", ""=>"dji",""=>"dzu",""=>"de", ""=>"do",
""=>"na", ""=>"ni", ""=>"nu", ""=>"ne", ""=>"no",
""=>"na", ""=>"ni", ""=>"nu", ""=>"ne", ""=>"no",
""=>"ha", ""=>"hi", ""=>"fu", ""=>"he", ""=>"ho",
""=>"ha", ""=>"hi", ""=>"fu", ""=>"he", ""=>"ho",
""=>"ba", ""=>"bi", ""=>"bu", ""=>"be", ""=>"bo",
""=>"ba", ""=>"bi", ""=>"bu", ""=>"be", ""=>"bo",
""=>"pa", ""=>"pi", ""=>"pu", ""=>"pe", ""=>"po",
""=>"pa", ""=>"pi", ""=>"pu", ""=>"pe", ""=>"po",
""=>"ma", ""=>"mi", ""=>"mu", ""=>"me", ""=>"mo",
""=>"ma", ""=>"mi", ""=>"mu", ""=>"me", ""=>"mo",
""=>"ya", ""=>"yu", ""=>"yo",
""=>"ya", ""=>"yu", ""=>"yo",
""=>"ra", ""=>"ri", ""=>"ru",""=>"re",""=>"ro",
""=>"ra", ""=>"ri", ""=>"ru",""=>"re",""=>"ro",
""=>"wa", ""=>"wi", ""=>"we", ""=>"wo", ""=>"nn",
""=>"wa", ""=>"wi", ""=>"we", ""=>"wo", ""=>"nn",
""=>"xa", ""=>"xi", ""=>"xu", ""=>"xe", ""=>"xo",
""=>"xa", ""=>"xi", ""=>"xu", ""=>"xe", ""=>"xo",
""=>"xtsu",""=>"xya", ""=>"xyu", ""=>"xyo",
""=>"xtsu",""=>"xya", ""=>"xyu", ""=>"xyo",
""=>"vu", ""=>"xka",""=>"ga",""=>"xwa",
""=>"xwa",
""=>"-", ""=>"-", ""=>'"', ""=>"'", ""=>",", ""=>".",
""=>":", " " => " ", "" => "@", "" => "(", "" => ")",
" " => " "
}
# Two-character kana sequences and the romaji they stand for.
# NOTE(review): no method visible in this file reads this table; kana2rom
# handles these sequences through its regex rules instead.
Kana2romH2={
"てぃ" => "ti", "でぃ" => "di"
}
# 1 character romaji patterns: the five vowels and "-" (long-vowel mark).
# rom2kata looks a buffered letter up here when it matches /[aiueo-]/.
# NOTE(review): every value is the empty string as written here; the katakana
# values appear to have been lost -- restore from an intact copy.
Rom2KataH1={
"a"=>"", "i"=>"", "u"=>"", "e"=>"", "o"=>"", "-"=>""
}
# 2 character romaji patterns, looked up by rom2kata once two letters are
# buffered. Values are the katakana to emit.
# NOTE(review): the "tsu" key is three letters long, so the two-letter lookup
# in rom2kata never matches it (the same key also exists in Rom2KataH3).
# NOTE(review): all single-character values are empty strings as written here;
# they appear to have been lost -- restore from an intact copy.
Rom2KataH2={
"xa"=>"", "xi"=>"", "xu"=>"", "xe"=>"", "xo"=>"",
"ka"=>"", "ki"=>"", "ku"=>"", "ke"=>"", "ko"=>"",
"ca"=>"", "cu"=>"", "co"=>"",
"ga"=>"", "gi"=>"", "gu"=>"", "ge"=>"", "go"=>"",
"sa"=>"", "si"=>"", "su"=>"", "se"=>"", "so"=>"",
"za"=>"", "zi"=>"", "zu"=>"", "ze"=>"", "zo"=>"",
"ja"=>"ジャ","ji"=>"", "ju"=>"ジュ","je"=>"ジェ","jo"=>"ジョ",
"ta"=>"", "ti"=>"", "tsu"=>"", "te"=>"", "to"=>"",
"da"=>"", "di"=>"", "du"=>"", "de"=>"", "do"=>"",
"na"=>"", "ni"=>"", "nu"=>"", "ne"=>"", "no"=>"",
"ha"=>"", "hi"=>"", "hu"=>"", "he"=>"", "ho"=>"",
"ba"=>"", "bi"=>"", "bu"=>"", "be"=>"", "bo"=>"",
"pa"=>"", "pi"=>"", "pu"=>"", "pe"=>"", "po"=>"",
"va"=>"ヴァ","vi"=>"ヴィ","vu"=>"", "ve"=>"ヴェ","vo"=>"ヴォ",
"fa"=>"ファ","fi"=>"フィ","fu"=>"", "fe"=>"フェ","fo"=>"フォ",
"ma"=>"", "mi"=>"", "mu"=>"", "me"=>"", "mo"=>"",
"ya"=>"", "yi"=>"", "yu"=>"", "ye"=>"イェ", "yo"=>"",
"ra"=>"", "ri"=>"", "ru"=>"", "re"=>"", "ro"=>"",
"la"=>"", "li"=>"", "lu"=>"", "le"=>"", "lo"=>"",
"wa"=>"", "wi"=>"", "wu"=>"", "we"=>"", "wo"=>"",
"nn"=>""
}
# 3 character romaji patterns (plus the four-letter key "xtsu"), looked up by
# rom2kata once three letters are buffered. Values are the katakana to emit.
# NOTE(review): all single-character values are empty strings as written here;
# they appear to have been lost -- restore from an intact copy.
Rom2KataH3={
"tsu"=>"",
"xka"=>"", "xke"=>"",
"xwa"=>"", "xtsu"=>"", "xya"=>"", "xyu"=>"", "xyo"=>"",
"kya"=>"キャ", "kyi"=>"キィ", "kyu"=>"キュ", "kye"=>"キェ", "kyo"=>"キョ",
"gya"=>"ギャ", "gyi"=>"ギィ", "gyu"=>"ギュ", "gye"=>"ギェ", "gyo"=>"ギョ",
"sya"=>"シャ", "syi"=>"シィ", "syu"=>"シュ", "sye"=>"シェ", "syo"=>"ショ",
"sha"=>"シャ", "shi"=>"", "shu"=>"シュ", "she"=>"シェ", "sho"=>"ショ",
"zya"=>"ジャ", "zyi"=>"ジィ", "zyu"=>"ジュ", "zye"=>"ジェ", "zyo"=>"ジョ",
"jya"=>"ジャ", "jyi"=>"ジィ", "jyu"=>"ジュ", "jye"=>"ジェ", "jyo"=>"ジョ",
"tya"=>"チャ", "tyi"=>"チィ", "tyu"=>"チュ", "tye"=>"チェ", "tyo"=>"チョ",
"cya"=>"チャ", "cyi"=>"チィ", "cyu"=>"チュ", "cye"=>"チェ", "cyo"=>"チョ",
"cha"=>"チャ", "chi"=>"", "chu"=>"チュ", "che"=>"チェ", "cho"=>"チョ",
"tha"=>"テャ", "thi"=>"ティ", "thu"=>"テュ", "the"=>"テェ", "tho"=>"テョ",
"dya"=>"ヂャ", "dyi"=>"ヂィ", "dyu"=>"ヂュ", "dye"=>"ヂェ", "dyo"=>"ヂョ",
"dha"=>"デャ", "dhi"=>"ディ", "dhu"=>"デュ", "dhe"=>"デェ", "dho"=>"デョ",
"nya"=>"ニャ", "nyi"=>"ニィ", "nyu"=>"ニュ", "nye"=>"ニェ", "nyo"=>"ニョ",
"hya"=>"ヒャ", "hyi"=>"ヒィ", "hyu"=>"ヒュ", "hye"=>"ヒェ", "hyo"=>"ヒョ",
"bya"=>"ビャ", "byi"=>"ビィ", "byu"=>"ビュ", "bye"=>"ビェ", "byo"=>"ビョ",
"pya"=>"ピャ", "pyi"=>"ピィ", "pyu"=>"ピュ", "pye"=>"ピェ", "pyo"=>"ピョ",
"mya"=>"ミャ", "myi"=>"ミィ", "myu"=>"ミュ", "mye"=>"ミェ", "myo"=>"ミョ",
"rya"=>"リャ", "ryi"=>"リィ", "ryu"=>"リュ", "rye"=>"リェ", "ryo"=>"リョ",
"lya"=>"リャ", "lyi"=>"リィ", "lyu"=>"リュ", "lye"=>"リェ", "lyo"=>"リョ"
}
# Per-character table read by kata2hira; going by its name it maps each
# katakana character to the matching hiragana. Hira2kataH is derived from it
# by swapping keys and values.
# NOTE(review): every key and nearly every value is the empty string as
# written here, so the hash that is actually built has a single entry. The
# kana appear to have been lost -- restore from an intact copy.
Kata2hiraH={
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"", ""=>"",
""=>"", ""=>"", ""=>"", ""=>"",
""=>"う゛", ""=>"", ""=>"", ""=>""
}
# Reverse table read by hira2kata: built by swapping each key/value pair of
# Kata2hiraH (when several keys share a value, the last pair wins), followed by
# two explicit overrides.
# NOTE(review): the overrides presumably pin the preferred katakana for
# hiragana reachable from more than one katakana, but both their keys and
# values are empty strings here -- confirm against an intact copy.
Hira2kataH={}; Kata2hiraH.each_pair{|k,v| Hira2kataH[v]=k}; Hira2kataH[""]=""; Hira2kataH[""]=""
# Transliterates the hiragana/katakana in +str+ into romaji.
#
# Step 1 replaces every character through Kana2romH (characters without an
# entry are copied unchanged). Step 2 rewrites that intermediate string with an
# ordered list of regex substitutions; the order matters because later rules
# work on the output of earlier ones.
#
# str - the String to convert.
#
# Returns a new String.
def kana2rom(str)
  s = "".dup
  str.each_char { |c| s << Kana2romH.fetch(c, c) }

  # Stand-alone (semi-)voiced sound marks: the table turns them into " and ',
  # which are merged into the preceding syllable here (ka" --> ga, ha' --> pa).
  s = s.gsub(/(k)([aiueo])(")/, 'g\2').gsub(/(s)([aiueo])(")/, 'z\2').gsub(/(t)([aiueo])(")/, 'd\2')
  s = s.gsub(/(h)([aiueo])(")/, 'b\2').gsub(/([fh])([aiueo])(')/, 'p\2').gsub(/u"/, 'vu')
  #---------------------------------------------------------
  # Remove the spaces around a hanging small tsu. "xtsu" must really be there:
  # when it was optional, any two adjacent whitespace characters were turned
  # into "xtsu" and then into a doubled consonant ("a  b" --> "abb").
  s = s.gsub(/\s(xtsu)\s/, 'xtsu')
  #---------------------------------------------------------
  # ッカ-->xtsuka-->kka. Repeated until nothing changes so that stacked small
  # tsu are resolved too (gsub! returns nil once there is no match left).
  loop do
    break unless s.gsub!(/(xtsu)([ckgszjtdhfbpmyrwnv])/, '\2\2')
  end
  #---------------------------------------------------------
  # Compound Phoneme Pattern Rollbacks
  # NB: Uses regex backrefs like "\1y\3" where \1 = 1st capture grp, y='y' and \3 = 3rd capture grp
  #---------------------------------------------------------
  s = s.gsub(/( +x)(.*)/, 'x\2')                           # Avoid hanging small kana due to leading spaces
  s = s.gsub(/(ch)(ixy)([aueo])/, '\1\3')                  # チョ-->chixyo-->cho
  s = s.gsub(/([kgszjtdnhfbpmr])(ixy)([auo])/, '\1y\3')    # キャ-->kixya-->kya
  s = s.gsub(/([kgszjtdnhfbpmr])(ix)([ie])/, '\1y\3')      # キィ-->kixi-->kyi
  #---------------------------------------------------------
  s = s.gsub(/(sh)(y)([aueo])/, '\1\3')                    # シュ-->shyu-->shu
  s = s.gsub(/(j)(y)([aueo])/, '\1\3')                     # ジュ-->jyu-->ju
  #---------------------------------------------------------
  s = s.gsub(/([td])(exy)([aueo])/, '\1h\3')               # テャ-->texya-->tha
  s = s.gsub(/([td])(ex)([ie])/, '\1\3')                   # ティ-->texi-->ti
  s = s.gsub(/([td])(oxu)/, '\1oo')                        # ドゥ-->doxu-->doo
  s = s.gsub(/(tsu)(x)([aiueo])/, 'ts\3')                  # ツァ-->tsuxa-->tsa
  s = s.gsub(/([d])(oxy)/, '\1o\'y')                       # ドュ-->doxyu-->do'yu
  #---------------------------------------------------------
  s = s.gsub(/(vux)([aieo])/, 'v\2')                       # ヴァ-->vuxa-->va (also vi, ve, vo)
  s = s.gsub(/(vuxy)([aueo])/, 'vy\2')                     # ヴュ-->vuxyu-->vyu
  s = s.gsub(/(ixe)/, 'iye')                               # イェ-->ixe-->iye
  s = s.gsub(/(hoxe)/, 'howe')                             # ホェ-->hoxe-->howe
  s = s.gsub(/(fux)([aieo])/, 'f\2')                       # ファ-->fuxa-->fa (also fi, fe, fo)
  s = s.gsub(/(fuxy)([aueo])/, 'fy\2')                     # フュ-->fuxyu-->fyu
  s = s.gsub(/(ux)([ieo])/, 'w\2')                         # ウィ-->uxi-->wi (also we, wo)
  #---------------------------------------------------------
  s = s.strip.gsub(/(xtsu)$/, 'h!')                        # Hanging small tsu at end of line
  s = s.gsub(/([aiueo]?)(\-)/, '\1\1')                     # Long-vowel mark: drop it and double the preceding vowel
  #---------------------------------------------------------
  # Cleanup specifically for source strings that contain spaces!
  # The class used to read [^a-z|A-z]: "A-z" also covers [ \ ] ^ _ and the
  # backtick, and "|" is a literal inside a class, so spaces before those
  # characters were wrongly kept.
  s = s.gsub(/( +)([^a-zA-Z])/, '\2')                      # Remove spaces before any non-alphabetical char
  s = s.gsub(/(nn)/, 'n')                                  # ン-->nn-->n
  # Joins a trailing, separate "n" to the previous word. Note that one
  # non-letter character following that "n" is removed together with the space.
  s = s.gsub(/( n)[^a-z|A-Z]?$/, 'n')
  s = s.gsub(/\s{2,}/, ' ')                                # Remove duplicate spaces!
  #---------------------------------------------------------
  s
end
def rom2kata(str)
## THIS LINE DOES NOT WORK IN RECENT RUBY VERSIONS!!! r=""; w=[]; chars=str.split(//e)
result=""; word_buffer=[]; chars=str.each_char.collect{|c| c}
loop{
case word_buffer.size
##### When 0 characters in the buffer
when 0 then
if chars.size>0 then word_buffer.push(chars.shift) else return result; end
##### Patterns with 1 roman character
when 1 then
if word_buffer[0]=~/[aiueo-]/ then result+=Rom2KataH1[word_buffer[0]]; word_buffer=[] # a-->ア
elsif word_buffer[0]=~/[xkcgszjtdnhbpvfmyrlw]/ then
if chars.size>0 then word_buffer.push(chars.shift)
else return result+(word_buffer[0].gsub(/n/,""));
end
else result+=word_buffer.shift;
end
##### Patterns with 2 roman characters
when 2 then
if Rom2KataH2.key?(word_buffer.join) then result+=Rom2KataH2[word_buffer.join]; word_buffer=[];
elsif word_buffer.join=~/([kgszjtcdnhbpmrl]y)|([stcd]h)|ts|(x[wytk])/ then # goto 3
if chars.size>0 then word_buffer.push(chars.shift) # Consume next letter from source array
else return result+(word_buffer.join.gsub(/n/,""));
end
elsif word_buffer[0]=="n" then result+=""; word_buffer.shift # nk-->ンk
elsif word_buffer[0]==word_buffer[1] then result+=""; word_buffer.shift # kk-->ッk
else result+=word_buffer.shift;
end
##### Patterns with 3 roman characters
when 3 then
if Rom2KataH3.key?(word_buffer.join) then result+=Rom2KataH3[word_buffer.join]; word_buffer=[];
elsif word_buffer[0]=="n" then result+=""; word_buffer.shift;
else result+=word_buffer.shift;
end
end
}
end
# Converts romaji in +str+ to hiragana by first producing katakana with
# rom2kata and then passing that result through kata2hira.
#
# str - the romaji String to convert.
#
# Returns the String returned by kata2hira.
def rom2hira(str)
  katakana = rom2kata(str)
  kata2hira(katakana)
end
# Replaces every character of +str+ that has an entry in Kata2hiraH with the
# mapped value; all other characters are copied unchanged.
#
# str - the String to convert.
#
# Returns a new String.
def kata2hira(str)
  str.each_char.inject("") do |converted, char|
    converted + Kata2hiraH.fetch(char, char)
  end
end
# Replaces every character of +str+ that has an entry in Hira2kataH with the
# mapped value; all other characters are copied unchanged.
#
# str - the String to convert.
#
# Returns a new String.
def hira2kata(str)
  pieces = str.each_char.map { |char| Hira2kataH.fetch(char, char) }
  pieces.inject("", :+)
end
# Added by Paul 2009-05-12 22:31
# Collects the distinct spellings of +str1+ obtained by converting it with
# Kana2rom::hira2kata and Kana2rom::kata2hira.
#
# str1 - the String to convert.
#
# Returns an Array that always starts with +str1+ itself, followed by the
# hira2kata result and then the kata2hira result; a conversion result is left
# out when it is empty or equal to an entry already collected.
def kana2kana(str1)
  variants = [str1]
  candidates = [Kana2rom::hira2kata(str1), Kana2rom::kata2hira(str1)]
  candidates.each do |candidate|
    next if candidate.empty? || variants.include?(candidate)
    variants << candidate
  end
  variants
end
module_function :kana2rom, :rom2kata, :kata2hira, :hira2kata, :rom2hira, :kana2kana
end
=begin
### Uncomment this section to test at command line
### NOTE: 'jcode' only exists in Ruby 1.8; drop the require below on Ruby 1.9 or later.
require 'jcode'
if $0 == __FILE__ then
# sample
str="ひらがな/カタカナ"
printf("ローマ字: %s\n", Kana2rom::kana2rom(str))
printf("平仮名 : %s\n", Kana2rom::kata2hira(str))
printf("片仮名 : %s\n", Kana2rom::hira2kata(str))
str="ro-maji"
printf("片仮名 : %s\n", Kana2rom::rom2kata(str))
printf("平仮名 : %s\n", Kana2rom::rom2hira(str))
end
=end