In [None]:
import re
from nltk.tokenize import TweetTokenizer, RegexpTokenizer


# Emoji ranges covered by both patterns below:
#   U+1F300-U+1F64F, U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF.

# Character-class form: usable when every astral code point is a single
# code unit (wide UCS-4 builds).
EMOJI_PATTERN_WIDE = (
    u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u26FF\u2700-\u27BF]+'
)

# Surrogate-pair form for narrow UCS-2 builds, where astral code points are
# stored as two code units.  The group is NON-capturing on purpose: with a
# capturing group, ``findall`` would return only the last repetition of each
# run instead of the whole run, unlike the wide pattern.
EMOJI_PATTERN_NARROW = (
    u'(?:'
    u'\ud83c[\udf00-\udfff]|'
    u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
    u'[\u2600-\u26FF\u2700-\u27BF])+'
)

# ``emoji_re`` matches one or more consecutive emoji as a single match.
try:
    # Wide UCS-4 build
    emoji_re = re.compile(EMOJI_PATTERN_WIDE, re.UNICODE)
except re.error:
    # Narrow UCS-2 build: the wide character class does not compile there.
    emoji_re = re.compile(EMOJI_PATTERN_NARROW, re.UNICODE)


# Plain word tokenizer built on the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

# Tweet-aware tokenizer: case is preserved, @handles are kept and
# ``reduce_len`` is switched on.  It is not referenced by ``get_words``
# in this section of the file.
tweet_tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
    strip_handles=False,
)
# A '#' followed by one or more word characters; group 1 is the whole tag.
hashtag_re = re.compile(r'(#[\w]+)')

# URL matcher.  A URL may start with http(s)://, with www./www1. etc., or
# with a bare domain followed by a slash ("example.com/").  The stray
# literal spaces that used to sit before that slash have been removed;
# with them the bare-domain form could never match.
url_re = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')

# Retweet marker: the literal "RT @" followed by the user name.
retweet_re = re.compile(r'(RT @(?:\b\w+)+)')

# Stand-alone runs of digits, '-' and '.' that are preceded by whitespace and
# followed by whitespace or ASCII punctuation.  The trailing delimiter is
# only looked at, not consumed: consuming it glued neighbouring words
# together on removal ("a 12 b" -> "ab") and made every second number in a
# run such as "1 2 3" survive, because its leading space was already eaten.
numbers_re = re.compile(r"[ \n\r][-\d.]+(?=[ \n\r!\\\"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])")

def get_words(text):
    """Split tweet text into emoji, hashtag, retweet and word tokens.

    URLs and the spans matched by ``numbers_re`` are discarded.  Emoji runs,
    hashtags and retweet markers are collected and then stripped from the
    text before the remainder is tokenized into words.

    The result is grouped by token type, not by position in ``text``:
    emoji runs first, then hashtags (including the '#'), then retweet
    markers with the space removed (e.g. "RT@user"), then the remaining
    words in lower case.
    """
    # Every removed span is replaced by a single space instead of the empty
    # string, so the words on either side of it are never glued together
    # (e.g. "good<emoji>morning" must not become "goodmorning").
    text = url_re.sub(' ', text)

    emojis = emoji_re.findall(text)
    text = emoji_re.sub(' ', text)

    hashtags = hashtag_re.findall(text)
    text = hashtag_re.sub(' ', text)

    retweets = [w.replace(' ', '') for w in retweet_re.findall(text)]
    text = retweet_re.sub(' ', text)

    # Numbers are dropped entirely; they are not part of the result.
    text = numbers_re.sub(' ', text)

    words = [word.lower() for word in tokenizer.tokenize(text)]

    return emojis + hashtags + retweets + words