In [2]:
import numpy as np
import pandas as pd

import getopt
import logging
import nltk
import os
import re
import sys
import tweepy

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Generate your own twitter api keys at https://apps.twitter.com/app
# secret.txt must hold four lines, in this order:
# consumer key, consumer secret, OAuth token, OAuth token secret.
with open('secret.txt') as secret:
    # splitlines() + strip() keeps a stray '\r' or trailing blank out of the credentials;
    # the `with` block closes the file handle once the keys are read.
    strs = [line.strip() for line in secret.read().splitlines()]

if len(strs) < 4:
    raise ValueError(
        "secret.txt must contain 4 lines (consumer key, consumer secret, "
        "OAuth token, OAuth token secret); found {}".format(len(strs)))

CONSUMER_KEY = strs[0]
CONSUMER_SECRET = strs[1]
OAUTH_TOKEN = strs[2]
OAUTH_TOKEN_SECRET = strs[3]

# connect to twitter
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

# batch size depends on Twitter limit, 100 at this time
batch_size = 100

# Some emojis have a character length of more than 1; an emoji counts as
# "at the end" of a tweet when it starts within this many characters of the end.
emoji_threshold = 3

In [3]:
# Path of the tab-separated training file; the commented-out line below is the
# alternative (non-"cleaned") input.
file = 'cleaned_img_train_plaintext.txt'
#file = 'img_train_plaintext.txt'
# header = 0: the first line of the file supplies the column names.
data = pd.read_csv(file, sep='\t', encoding = 'utf8', engine='c', header = 0)

# Disabled filter: would keep only the rows whose third column holds a single
# comma-separated value and rebuild the frame with columns id / imgid / annotations.
#data = pd.DataFrame([j for i,j in enumerate(data.values) if len(data.iloc[i, 2].split(',')) == 1], columns=['id','imgid','annotations'])

# Preview the first rows.
data.head()

Unnamed: 0,id,imgid,annotations
0,742995415264546816,http://pbs.twimg.com/media/Ck80Z5TUYAAb4sM.jpg,865
1,746964081370865664,http://pbs.twimg.com/media/Cl2_3D7WkAAcNYE.jpg,1103110811031108
2,741083806547857408,http://pbs.twimg.com/media/CH1pK09UYAAhgCh.jpg,"1102,1135,1138,1241,1102,1135,1241,1102,1135,1..."
3,746749796262645761,http://pbs.twimg.com/media/ClxnlSaUkAAUruc.jpg,186
4,744903166806786049,http://pbs.twimg.com/media/ClZteXpUgAEzdIW.jpg,82011051413


In [4]:
def locate_emoji(emoji_pattern, text: str):
    """
    Collect every emoji run in `text` and report where the combined string starts.

    Parameters
    ----------
    emoji_pattern : compiled regular expression whose matches are emoji runs
    text : str
        The tweet text to search.

    Returns
    -------
    (emoji, index) : (str, int)
        `emoji` is the concatenation of all matches of `emoji_pattern` in `text`.
        `index` is the position at which that concatenation occurs in `text`.
        It is `-emoji_threshold` (module-level constant) when `text` holds no
        emoji at all, or when the emoji runs are separated by other characters
        so that the concatenation does not occur in `text` as one piece.
    """
    emoji = ''.join(emoji_pattern.findall(text))

    if not emoji:
        # str.index('') always returns 0, which would be reported as
        # "emoji found at position 0"; treat "no emoji" as not found instead.
        return emoji, -emoji_threshold

    try:
        index = text.index(emoji)
    except ValueError:
        # only the "substring not found" case is expected here
        index = -emoji_threshold
    return emoji, index

In [14]:
def get_tweets(twapi, data, max_batches=5):
    '''
    Fetch tweet texts with bulk lookups and split each into (text, emoji label).

    The tweet IDs are taken from `data.id` and requested through
    `twapi.statuses_lookup` in chunks of the module-level `batch_size`.
    The bulk request does not warn about IDs that yield no tweet, so the
    result may be shorter than `data`.

    For every returned tweet the links and a leading "RT @user" marker are
    removed. A tweet is kept only if it contains emoji and the combined emoji
    string (see `locate_emoji`) starts within the last `emoji_threshold`
    characters of the text; the emoji are then stripped from the kept text.

    `twapi`: Initialized, authorized API object from Tweepy
    `data`: object with an `id` column/attribute holding the tweet IDs
    `max_batches`: maximum number of bulk requests to issue. The default of 5
        reproduces the cap that used to be hard-coded for debugging; pass
        None to process every ID.

    Returns `(features, labels)`: `features` is a list of one-element numpy
    arrays holding the cleaned text, `labels` the list of emoji strings,
    aligned by position.
    '''
    # NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very
    # broad (the latter covers every code point above the BMP), so non-emoji
    # characters inside them are treated as emoji as well.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        "]+",
        flags=re.UNICODE)

    tweet_ids = data.id.values.tolist()

    all_tweets = []
    labels = []

    # walk over the ids one batch at a time; slicing handles the short last batch
    for batch_no, start in enumerate(range(0, len(tweet_ids), batch_size)):
        if max_batches is not None and batch_no >= max_batches:
            break

        tweets = twapi.statuses_lookup(
            id_=tweet_ids[start:start + batch_size],
            include_entities=False,
            trim_user=True)

        for tweet in tweets:

            # removes the link of the tweet
            text = re.sub(r'http\S+', '', tweet.text).strip(' ')
            # removes a leading retweet marker ("RT @user")
            text = re.sub(r'\ART @\S+', '', text).strip(' ')

            # remove tweets where emoji is not at the end
            emoji, index = locate_emoji(emoji_pattern, text)

            # `emoji` must be non-empty: a tweet without emoji has no label
            if emoji and index >= len(text) - emoji_threshold:
                # removes the emojis from the text
                text = emoji_pattern.sub(r'', text).strip(' ')

                # then appends the tweet and emoji to our final dataset
                all_tweets.append(np.array([text]))
                labels.append(emoji)

    return all_tweets, labels

In [15]:
# Download the tweets listed in `data`: X receives the cleaned tweet texts,
# y the emoji strings extracted from them (see get_tweets).
X, y = get_tweets(api, data)

In [None]:
# create list of stopwords
# Build the English stop-word list from NLTK; the set() round-trip drops duplicates.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded first — confirm.
stop_words = list(set(stopwords.words('english')))

In [7]:
def tweets_cleaning(tweets, stopwords: list):
    """
    Tokenize and normalize tweets.

    Steps applied to every tweet, in order:
      1. "&amp..." sequences are removed, " & " becomes " and ", and runs of
         "!" or "?" are collapsed to a single character.
      2. Runs of . - _ : / newline and tab are replaced by a space, then the
         tweet is split on single spaces.
      3. A token containing an emoticon is kept verbatim. Any other token is
         lower-cased and reduced to alphanumeric characters plus the special
         characters < > $ # € £ ! ? @ =.
      4. Every digit is replaced by "0" unless the token contains "<3".
      5. The token is stemmed with the Lancaster stemmer.
      6. Empty tokens and stop words are dropped. A token counts as a stop
         word if either its unstemmed or its stemmed form is in `stopwords`,
         because stemming can change a stop word (NLTK's list is unstemmed).

    Parameters
    ----------
    tweets : iterable of str
    stopwords : list of str

    Returns
    -------
    list of list of str
        One token list per tweet. Tweets that end up with fewer than two
        tokens are omitted, so the result can be shorter than `tweets`.
    """

    # initialize Lancaster stemmer
    st = LancasterStemmer()

    stopword_set = set(stopwords)          # O(1) membership tests
    kept_specials = set('<>$#€£!?@=')      # non-alphanumeric characters worth keeping
    emoticon = re.compile(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+')

    cleaned_data = []

    for tweet in tweets:

        tweet = re.sub(r'&amp\S+', '', tweet)
        tweet = re.sub(r' & ', ' and ', tweet)
        tweet = re.sub(r'!+', '!', tweet)
        # '?' is a regex quantifier and has to be escaped
        tweet = re.sub(r'\?+', '?', tweet)
        # NOTE(review): ':' and '-' are replaced here, before emoticon detection,
        # so ':'-based emoticons such as ":)" are already split apart below.
        tweet = re.sub(r'[.\-_:/\n\t]+', ' ', tweet)

        cleaned_tweet = []

        for word in tweet.split(" "):

            # if emoticon is in word, keep the emoticon
            if emoticon.search(word):
                cleaned_word = word
            else:
                # keep special characters which might carry important information
                # perform lower-casing to normalize the text and reduce noise
                cleaned_word = ''.join(
                    char for char in word
                    if char in kept_specials or char.isalnum()).lower()

            # normalize digits, but leave the "<3" heart intact
            if "<3" not in cleaned_word:
                cleaned_word = re.sub('[0-9]', '0', cleaned_word)

            if not cleaned_word:
                continue

            # perform stemming
            stemmed_word = st.stem(cleaned_word)

            if (stemmed_word
                    and cleaned_word not in stopword_set
                    and stemmed_word not in stopword_set):
                cleaned_tweet.append(stemmed_word)

        if len(cleaned_tweet) > 1:
            cleaned_data.append(cleaned_tweet)

    return cleaned_data

In [None]:
def bag_of_words(tweets: list, ngram = (1, 1), vocab = None):
    """
    Create a count (!) based bag-of-words unigram or bigram representation of provided tweets.
    Ngram is set to unigram by default. If bigram bag-of-words should be created, pass tuple (2, 2).

    Vocabulary argument is set to None by default.
    You can pass a vocabulary to this function, which is then handed to CountVectorizer.
    If you do not pass a vocabulary to this function, CountVectorizer will create a vocabulary itself.

    Each tweet may be a string or a sequence of tokens; token sequences are
    joined with single spaces before vectorizing.

    Returns the document-term count matrix produced by
    CountVectorizer.fit_transform (one row per tweet).
    """

    # initialize vectorizer (word-ngram representation)
    vectorizer = CountVectorizer(encoding = 'utf-8', lowercase = True, ngram_range = ngram,
                                 analyzer = 'word', vocabulary = vocab)

    # CountVectorizer's word analyzer works on strings, not on token lists
    documents = [tweet if isinstance(tweet, str) else ' '.join(tweet) for tweet in tweets]

    return vectorizer.fit_transform(documents)

In [5]:
def get_embeddings(text_file):

    """
    Read a whitespace-separated embeddings text file (e.g. GloVe or word2vec
    text format) and load it into a dictionary.

    Every data line is expected to hold a token followed by the components of
    its vector. Blank lines are ignored. If the first line consists of exactly
    two integers it is treated as a word2vec-style "<count> <dims>" header and
    skipped instead of being stored as an embedding.

    Parameters
    ----------
    text_file : path of the UTF-8 encoded embeddings file

    Returns
    -------
    dict
        Maps each token (str) to its vector, stored as a list of numpy
        float32 values.
    """
    embeddings_dict = {}

    with open(text_file, encoding="utf8") as handle:

        for line_no, line in enumerate(handle):
            values = line.split()

            if not values:
                continue

            is_header = (line_no == 0 and len(values) == 2
                         and values[0].isdigit() and values[1].isdigit())
            if is_header:
                continue

            word = values[0]
            wordvec = np.array(values[1:], dtype = 'float32')
            embeddings_dict[word] = list(wordvec)

    return embeddings_dict

In [12]:
# Load the emoji embeddings from emoji2vec.txt into a {token: vector} dictionary.
emoji_embeddings = get_embeddings("emoji2vec.txt")

In [13]:
def get_emojivecs(emoji_embeddings: dict, corpus: list, dims: int):
    """
    Look up the embedding of every emoji label in `corpus`.

    Each label becomes a one-step sequence holding its `dims`-dimensional
    vector. A label missing from `emoji_embeddings` is reported on stdout and
    represented by a vector of `dims` zeros.

    Parameters
    ----------
    emoji_embeddings : dict mapping an emoji string to its vector
    corpus : list of emoji labels, one per sample
    dims : expected length of every embedding (checked with an assertion)

    Returns
    -------
    numpy.ndarray built from the list of one-step sequences
    (shape (len(corpus), 1, dims) for a non-empty corpus).
    """
    sequences = []

    for label in corpus:
        try:
            vector = emoji_embeddings[label]
        except KeyError:
            # unknown emoji: fall back to an all-zero vector and tell the user
            vector = [0] * dims
            print("This {} does not exist in the pre-trained emoji embeddings.".format(label))
        else:
            assert len(vector) == dims

        # wrap in a list so that every sample is a sequence with a single step
        sequences.append([vector])

    assert len(sequences) == len(corpus)
    return np.array(sequences)

In [None]:
def get_wordvecs(word_embeddings: dict, corpus: list, dims: int, zeros_padding = False):

    """
    Return a word vector representation of each tweet: one sequence of word
    embeddings per tweet, one embedding per word (each word is a time step).

    The length of every embedding found is checked against `dims`.
    For unknown words (i.e., if key does not exist), the representation is a
    zeros vector of length `dims`.

    Sequences can have variable length. If `zeros_padding` is True, every
    sequence shorter than the longest one in the corpus is padded at the end
    with zero vectors. It is False by default.

    Side effect: the module-level variable `max_length` is set to the length
    of the longest sequence in `corpus` (0 for an empty corpus).

    Returns
    -------
    numpy.ndarray
        If all sequences have the same length (always the case with padding),
        a regular array of shape (len(corpus), sequence length, dims).
        Otherwise a 1-D object array whose elements are the per-tweet lists of
        vectors; numpy cannot build a regular array from ragged sequences.
    """

    N = len(corpus)
    M = dims
    global max_length
    # default=0 keeps an empty corpus from raising inside max()
    max_length = max((len(sequence) for sequence in corpus), default=0)
    wordvecs_corpus = []

    # document = tweet; corpus = all tweets
    for document in corpus:
        wordvec_sequence = []
        for word in document:

            try:
                wordvec = word_embeddings[word]
            except KeyError:
                wordvec = [0] * M
            else:
                assert len(wordvec) == M
            wordvec_sequence.append(wordvec)

        if zeros_padding:
            # append zero vectors until the sequence reaches max_length
            missing_steps = max_length - len(wordvec_sequence)
            wordvec_sequence.extend([0] * M for _ in range(missing_steps))
            assert len(wordvec_sequence) == max_length

        wordvecs_corpus.append(wordvec_sequence)

    assert len(wordvecs_corpus) == N

    if len({len(sequence) for sequence in wordvecs_corpus}) > 1:
        # ragged input: fill an object array element by element, because
        # np.array() on ragged nested lists raises in recent numpy versions
        ragged = np.empty(N, dtype=object)
        for position, sequence in enumerate(wordvecs_corpus):
            ragged[position] = sequence
        return ragged

    return np.array(wordvecs_corpus)

In [31]:
from gensim.models.keyedvectors import KeyedVectors

# Load the binary GoogleNews word2vec vectors and re-export them as plain text,
# the format that get_embeddings() reads line by line.
# NOTE(review): the word2vec text format presumably starts with a "<count> <dims>"
# header line — verify that the loader copes with it.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('word2vec.txt', binary=False)

In [None]:
# Load the text-format word2vec vectors from word2vec.txt into a {word: vector} dictionary.
word_embeddings = get_embeddings("word2vec.txt")