# Task 2.3: Tweet pre-processing

In [1]:
from nltk.tokenize import word_tokenize

# Sample tweet; later cells reuse this variable.
tweet = 'RT @JordiTorresBCN: Just an Example! :D http://fib.upc.edu #masterMEI'

# Baseline: tokenize the tweet with NLTK's general-purpose word tokenizer.
nltk_tokens = word_tokenize(tweet)
print(nltk_tokens)

['RT', '@', 'JordiTorresBCN', ':', 'Just', 'an', 'Example', '!', ':', 'D', 'http', ':', '//fib.upc.edu', '#', 'masterMEI']


In [2]:
import re
 
# Emoticon pattern. It is always compiled with re.VERBOSE, so the whitespace
# and the "# ..." comments inside the pattern are ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Token alternatives. They are joined with "|" below, so they are tried in
# this order at each position: specific patterns first, generic fallbacks last.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    # The optional tail lets single-character tags such as "#a" match as a
    # hash-tag; tags may still contain ' and - but must end on a word character.
    r"(?:\#+[\w_]+(?:[\w\'_\-]*[\w_]+)?)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One capturing group around the whole alternation, so findall() returns a
# flat list of matched token strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: matches only when the whole string is a single emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    """Return the tokens of ``s``: every match of ``tokens_re``, in order of appearance."""
    # tokens_re wraps the whole pattern in a single capturing group, so
    # group 1 of each match is the token text.
    return [match.group(1) for match in tokens_re.finditer(s)]
 
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lowercase the resulting tokens.

    When ``lowercase`` is true, every token is lowercased except those that
    match ``emoticon_re``, so that emoticons such as ":D" keep their case.
    Returns the list of tokens.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalized = []
    for token in tokens:
        is_emoticon = emoticon_re.search(token)
        normalized.append(token if is_emoticon else token.lower())
    return normalized
 
# Tokenize the sample tweet, keeping the original casing.
tweetTokens = preprocess(tweet, lowercase=False)

In [3]:
# Show the tokens produced by the regex-based tokenizer.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(tweetTokens)

['RT', '@JordiTorresBCN', ':', 'Just', 'an', 'Example', '!', ':D', 'http://fib.upc.edu', '#masterMEI']


Once it has been processed, we can remove the punctuation and the stop words of the tweet:

In [4]:
from nltk.corpus import stopwords
import string

def cleanTweet(tweetTokens):
    """Return the tokens of a tweet without punctuation and English stop words.

    Parameters:
        tweetTokens: iterable of token strings (e.g. the output of preprocess).

    Returns:
        A new list with the remaining tokens, in their original order.

    Notes:
        - Both comparisons are case-sensitive, so a capitalised token such as
          'Just' is kept even though it differs from a stop word only by case.
        - ``string.punctuation`` is a str, so the punctuation check is a
          substring test; single-character punctuation tokens are dropped.
    """
    # Build the stop-word lookup once per call instead of once per token;
    # a set also gives constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    return [
        w for w in tweetTokens
        if w not in string.punctuation and w not in english_stopwords
    ]

# Remove punctuation and stop words from the tokenized tweet and show the result.
cleanedTweet = cleanTweet(tweetTokens)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(cleanedTweet)

['RT', '@JordiTorresBCN', 'Just', 'Example', ':D', 'http://fib.upc.edu', '#masterMEI']


We can try to find out the names of the people mentioned in the tweet:

In [5]:
from nltk import word_tokenize, pos_tag, ne_chunk

def discoverProfile(tweet):
    """POS-tag and NE-chunk a tokenized tweet, adding the parts of the first @-mention.

    The first token that starts with "@" is split at its uppercase letters
    (e.g. '@JordiTorresBCN' -> 'Jordi', 'Torres', 'B', 'C', 'N') and those
    pieces are appended after the tweet's own tokens before tagging. Only the
    first mention is expanded.

    Parameters:
        tweet: list of token strings. It is not modified.

    Returns:
        The result of ne_chunk(pos_tag(tokens)).
    """
    # Compare by value, not identity; startswith() also copes with empty tokens.
    mentions = [w for w in tweet if w.startswith("@")]
    # Work on a copy so that the caller's token list is left untouched.
    tokens = list(tweet)
    if mentions:
        tokens += re.findall('[A-Z][^A-Z]*', mentions[0])
    return ne_chunk(pos_tag(tokens))

# Tag the cleaned tweet and show the resulting tree.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(discoverProfile(cleanedTweet))

(S
  RT/NNP
  @JordiTorresBCN/NNP
  Just/NNP
  Example/NNP
  :D/NNP
  http://fib.upc.edu/NN
  #masterMEI/NNP
  (PERSON Jordi/NNP Torres/NNP)
  B/NNP
  C/NNP
  N/NNP)


We are going to analyse another tweet:

In [6]:
import nltk

# Second example tweet; this rebinds `tweet` and `tweetTokens` from the earlier cells.
tweet = 'Strong comes in three colors! Which #OnePlus3T Otterbox is your favorite? http://onepl.us/3OB'

tweetTokens = preprocess(tweet)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(tweetTokens)

['Strong', 'comes', 'in', 'three', 'colors', '!', 'Which', '#OnePlus3T', 'Otterbox', 'is', 'your', 'favorite', '?', 'http://onepl.us/3OB']


In [7]:
# Remove punctuation and stop words from the second tweet and show the result.
cleanedTweet = cleanTweet(tweetTokens)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(cleanedTweet)

['Strong', 'comes', 'three', 'colors', 'Which', '#OnePlus3T', 'Otterbox', 'favorite', 'http://onepl.us/3OB']


In [8]:
# Tag the second cleaned tweet and show the resulting tree.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(discoverProfile(cleanedTweet))

(S
  (GPE Strong/JJ)
  comes/VBZ
  three/CD
  colors/NNS
  (PERSON Which/NNP)
  #OnePlus3T/VBD
  (PERSON Otterbox/NNP)
  favorite/JJ
  http://onepl.us/3OB/NN)


We can see that this Python library's tagging task (*Named Entity Recognition*) does not work well with all kinds of tweets, as the previous one shows. It seems that it simply tags as a proper noun any word it doesn't know or recognize within the context of the sentence, or something similar.