JuliaLang · archermarx · Nov 16, 2020 · Nov 16, 2020 · Nov 16, 2020 · Nov 16, 2020
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -2,7 +2,7 @@
 
 module Unicode
 
-export graphemes
+export graphemes, isemoji
 
 """
     Unicode.normalize(s::AbstractString; keywords...)
@@ -89,4 +89,86 @@ letter combined with an accent mark is a single grapheme.)
 """
 graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
 
+# NOTE(review): runs at module top level, so loading this module performs a download — TODO confirm; bundle the file?
+const emoji_data = download("https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt")
+"""
+    extract_emoji_column(emoji_data, column = 1; type_field = "")
+
+Return field number `column` of every data line in the Unicode emoji data file `emoji_data`
+(e.g. https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt), keeping only lines whose
+second `;`-separated field starts with `type_field`. Comment, blank and too-short lines are skipped.
+"""
+function extract_emoji_column(emoji_data, column = 1; type_field = "")
+    rows = [strip.(split(l, ";")) for l in strip.(readlines(emoji_data)) if !startswith(l, "#")]
+    # skip lines lacking the fields read below (e.g. whitespace-only) instead of hitting a BoundsError
+    return [r[column] for r in rows if length(r) >= max(column, 2) && startswith(r[2], type_field)]
+end
+
+# Turn a code point field, either "AAAA" or "AAAA..FFFF", into the range 0xAAAA:0xAAAA or
+# 0xAAAA:0xFFFF. Any field that does not have one of these two shapes yields `nothing`.
+function parse_unicode_range_str(range_str)
+    bounds = split(range_str, "..")
+    # a field is a lone code point or exactly two code points around ".."
+    1 <= length(bounds) <= 2 || return nothing
+
+    # the bounds are bare hexadecimal digits, hence the "0x" prefix for `tryparse`
+    lo = tryparse(UInt32, "0x" * bounds[1])
+    lo === nothing && return nothing
+    length(bounds) == 1 && return lo:lo
+
+    hi = tryparse(UInt32, "0x" * bounds[2])
+    hi === nothing && return nothing
+    return lo:hi
+end
+
+# Code point ranges of all entries whose property name starts with "Emoji"; an unparsable entry (`nothing`) fails here at load
+const EMOJI_RANGES = convert(Vector{UnitRange{UInt32}}, parse_unicode_range_str.(extract_emoji_column(emoji_data, 1, type_field = "Emoji")))
+const ZWJ = '\u200d'    # Zero-width joiner; isemoji(::AbstractString) accepts it only between emoji, never first or last
+const VAR_SELECTOR = '\uFE0F'   # Variation selector; isemoji(::AbstractString) accepts it only right after an emoji character
+# England, Scotland and Wales flags and the keycap sequences; isemoji accepts a string that equals one of these exactly
+const SPECIAL_CASES = ["🏴󠁧󠁢󠁥󠁮󠁧󠁿", "🏴󠁧󠁢󠁳󠁣󠁴󠁿", "🏴󠁧󠁢󠁷󠁬󠁳󠁿", "#️⃣", "*️⃣", "0️⃣", "1️⃣", "2️⃣", "3️⃣", "4️⃣", "5️⃣", "6️⃣", "7️⃣", "8️⃣", "9️⃣"]
+
+"""
+    isemoji(x::Union{AbstractChar,AbstractString}) -> Bool
+
+Test whether a character is an emoji, or whether every element of a string is an emoji.
+A character is an emoji when its code point lies in one of the ranges read from the Unicode
+emoji data file; malformed characters are never emoji. The empty string returns `true`, as
+it contains no characters which aren't emoji. Composite emoji joined by the zero-width joiner
+`'\\u200d'`, such as `"👨\\u200d❤\\uFE0F\\u200d👨"`, are supported, though this function cannot
+determine whether a given sequence of emoji and joiners results in a valid composite emoji.
+"""
+function isemoji(c::AbstractChar)
+    # `UInt32(c)` throws for a malformed `Char` (e.g. invalid UTF-8 data); such input is not an emoji
+    Base.ismalformed(c) && return false
+    u = UInt32(c)
+    return any(r -> u in r, EMOJI_RANGES)
+end
+
+# String method: `true` for "", for an exact entry of `SPECIAL_CASES`, or when every character
+# is an emoji character, a ZWJ or a variation selector placed as checked in the loop below.
+function isemoji(s::AbstractString)
+    (isempty(s) || s in SPECIAL_CASES) && return true
+    # a trailing joiner has nothing left to join
+    last(s) == ZWJ && return false
+    # whether a ZWJ / a variation selector is acceptable as the next character
+    zwj_ok = false
+    selector_ok = false
+    for c in s
+        if c == ZWJ
+            zwj_ok || return false
+            # directly after a joiner only an emoji character is accepted
+            zwj_ok = selector_ok = false
+        elseif c == VAR_SELECTOR
+            selector_ok || return false
+            selector_ok = false
+        elseif isemoji(c)
+            zwj_ok = selector_ok = true
+        else
+            return false
+        end
+    end
+    return true
+end
+
 end
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
@@ -404,3 +404,56 @@ end
     @test prod(["*" for i in 1:3]) == "***"
     @test prod(["*" for i in 1:0]) == ""
 end
+
+@testset "Emoji tests" begin
+    # Parse space-separated hex code points, e.g. "AAAA BBBB CCCC" -> [0xAAAA, 0xBBBB, 0xCCCC].
+    # Yields `nothing` when not a single token parses (also the case for an empty string).
+    function parse_sequence_str(seq_str)
+        codepoints = map(split(seq_str)) do token
+            tryparse(UInt32, "0x" * token)
+        end
+        # the parsed vector is handed back as soon as at least one token parsed
+        all(isnothing, codepoints) && return nothing
+        return codepoints
+    end
+
+    # Expand one code point field into the strings it denotes:
+    # a range "AAAA..FFFF" gives one string per code point, ["\uAAAA", ..., "\uFFFF"],
+    # while a sequence "AAAA BBBB" gives a single string, ["\uAAAA\uBBBB"].
+    function parse_col_entry(seq_str)
+        codepoints = parse_sequence_str(seq_str)
+        if codepoints !== nothing
+            # sequence: all code points belong to one string
+            return [String(Char.(codepoints))]
+        end
+        # range: every code point becomes its own one-character string
+        return [string(Char(cp)) for cp in Unicode.parse_unicode_range_str(seq_str)]
+    end
+
+    # Collect the strings denoted by the first column of the data file `emoji_data`
+    function extract_emoji_sequences(emoji_data)
+        entries = map(parse_col_entry, Unicode.extract_emoji_column(emoji_data))
+        return vcat(entries...)
+    end
+
+    # Every entry of the two emoji 13.1 data files downloaded below must be accepted by isemoji
+    sequences_file = download("https://www.unicode.org/Public/emoji/13.1/emoji-sequences.txt")
+    zwj_sequences_file = download("https://www.unicode.org/Public/emoji/13.1/emoji-zwj-sequences.txt")
+    all_emojis = vcat(extract_emoji_sequences(sequences_file), extract_emoji_sequences(zwj_sequences_file))
+    @test all(isemoji, all_emojis)
+
+    @test !isemoji('A')
+    @test !isemoji("🔹 some text bounded by emojis 🔹")
+    @test !isemoji("🚍 some text after an emoji")
+    @test !isemoji("some text before an emoji 🚘")
+    @test !isemoji("😮 😥 😨 😩 😪") # the separating spaces are expected to make the whole string fail
+    @test !isemoji("No emojis here")
+
+    # Emoji sequences: several emoji in a row, and sequences built with the zero-width joiner
+    @test isemoji("😈😘")
+    @test isemoji("🚴🏿")
+    @test !isemoji("👨‍👧" * Unicode.ZWJ) # a string may not end with a ZWJ
+    @test isemoji("🛌" * Unicode.ZWJ * '😎') # a single ZWJ between two emoji is accepted
+    @test !isemoji("🤦🏽" * Unicode.ZWJ * Unicode.ZWJ * '😎') # a ZWJ directly after a ZWJ is rejected
+    @test isemoji("") # the empty string returns true
+end