In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.python.keras import models, layers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score
import re
import pandas
from keras.utils import to_categorical

In [2]:
def predict_emoji(tweet):
    x = re.compile(r'[^a-z\s]').sub(r'', re.compile(r'[\W]').sub(r' ', tweet.lower()))
    x = tokenizer.texts_to_sequences(np.array([x]))
    x = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=max_length)
    result = model.predict(x)
    print("'"+tweet+"' got the emoji: ",  np.argmax(result, axis=1)[0])


In [3]:
train_tweets = pandas.read_pickle("emoji_train.pkl")
train_emojis = pandas.read_pickle("emoji_train.pkl")["emoji_class"]
test_tweets = pandas.read_pickle("emoji_test.pkl")
test_emojis = pandas.read_pickle("emoji_test.pkl")["emoji_class"]
print(train_tweets.shape)
print(test_tweets.shape)

print(pandas.read_pickle("emoji_train.pkl").head())

(42627, 4)
(10657, 4)
                                               tweet  emoji_class emoji  \
0  Brought to you courtesy of the red white and b...            5    ðŸ‡ºðŸ‡¸   
1                 @user #taotuesday @ TAO Downtown\n            3     ðŸ”¥   
2  Ready to celebrate America with @user Happy #i...            5    ðŸ‡ºðŸ‡¸   
3  10min project w old footage #houstonphotograph...            2     ðŸ“¸   
4  Usually I don't put song on my insta but this ...            3     ðŸ”¥   

  predicted_class  
0            None  
1            None  
2            None  
3            None  
4            None  


In [4]:
def cleanUpTweet(txt):
    # Remove mentions (target)
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove hashtags
    txt = re.sub(r'#', '', txt)
    # Remove retweets:
    txt = re.sub(r'RT : ', '', txt)
    # Remove urls
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', txt)
    return txt
def removePunSymNum(txt):
    #remove alle characters except a-zA-Z
    txt = re.sub(r'([^a-zA-Z ])', '', txt) #regex expression validated with https://regexr.com/
    # Removing additional whitespaces
    txt = re.sub(r" +", ' ', txt)
    return txt
def normalize(text):
    return text.lower()

def tokenize(text):
    text = word_tokenize(text,language='english')
    return text

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, WordNetLemmatizer

stopword_set = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [word for word in text if not word in stopword_set]  
    return text

from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text)
    for word in text:
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return corrected_text

def pos_tagging(text):
    return nltk.pos_tag(text)

lemmatizer = WordNetLemmatizer()
tag_dict = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(pos_tagged_text):
    return [lemmatizer.lemmatize(word, tag_dict.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text]

en_words_set = set(nltk.corpus.words.words())

def remove_none_en_word(text):
    return [word for word in text if word in en_words_set]

def clean_single_char(text):
    return [re.sub(r'(^| ).(( ).)*( |$)','', word) for word in text] 

def back_to_sentence(text):
    return " ".join(text) 

train_tweets_copy = train_tweets.copy()
test_tweets_copy = test_tweets.copy()

train_tweets_copy['tweet'] = train_tweets['tweet'] \
                        .apply(cleanUpTweet) \
                        .apply(removePunSymNum) \
                        .apply(normalize) \
                        .apply(tokenize) \
                        .apply(remove_stopwords) \
                        .apply(pos_tagging) \
                        .apply(lemmatize_words) \
                        .apply(remove_none_en_word) \
                        .apply(clean_single_char) \
                        .apply(back_to_sentence)

test_tweets_copy['tweet'] = test_tweets['tweet'] \
                        .apply(cleanUpTweet) \
                        .apply(removePunSymNum) \
                        .apply(normalize) \
                        .apply(tokenize) \
                        .apply(remove_stopwords) \
                        .apply(pos_tagging) \
                        .apply(lemmatize_words) \
                        .apply(remove_none_en_word) \
                        .apply(clean_single_char) \
                        .apply(back_to_sentence)
""" 
train_tweets_copy['tweet'] = train_tweets['tweet'] \
                        .apply(cleanUpTweet) \
                        .apply(removePunSymNum) \
                        .apply(normalize) \
                        .apply(tokenize) \
                        .apply(remove_stopwords) \
                        .apply(correct_spellings) \
                        .apply(pos_tagging) \
                        .apply(lemmatize_words) \
                        .apply(remove_none_en_word) \
                        .apply(clean_single_char)
                        .apply(back_to_sentence)
"""

print(train_tweets_copy.head(10)["tweet"])
print(test_tweets_copy.head(10)["tweet"])

0    bring courtesy red white blue welcome home sol...
1                                         tao downtown
2                                ready celebrate happy
3                             min project  old footage
4                     usually dont put song fire thank
5      toilet talk radiate meditate life journey peace
6                                          south beach
7               red want spice red visit thats lighten
8    attempt first  together little nervous river half
9                                         hell kitchen
Name: tweet, dtype: object
0                    perfect look act like jean
1      like call tandem decide would share love
2                      crab dip toast miss miss
3              happy thanksgiving family county
4                               soho house west
5             love trash talk thing black coach
6                  beautiful day today downtown
7      go ignore fact look like video miss much
8    run little man today huge help fam

### Word Count

In [41]:
li_words = list()
for i in range(len(train_tweets_copy)):
    for k in  train_tweets_copy.tweet[i].split():
        li_words.append(k)

word_cnt = Counter(li_words)
dict_cnt_words = dict(word_cnt)
dict_sorted_most = sorted(dict_cnt_words.items(),key = lambda x : x[1],reverse=True)
dict_sorted_least = sorted(dict_cnt_words.items(),key = lambda x : x[1],reverse=False)


#output 20 most and least frequent words
dict_sorted_most[0:20],dict_sorted_least[0:20]


([('love', 3787),
  ('new', 2132),
  ('get', 2109),
  ('day', 2085),
  ('happy', 2074),
  ('beach', 1418),
  ('time', 1386),
  ('night', 1293),
  ('one', 1187),
  ('la', 1170),
  ('today', 1164),
  ('go', 1119),
  ('good', 1117),
  ('like', 1093),
  ('park', 1077),
  ('san', 1056),
  ('family', 1014),
  ('make', 1014),
  ('last', 976),
  ('come', 970)],
 [('radiate', 1),
  ('worthless', 1),
  ('enrol', 1),
  ('slink', 1),
  ('hogan', 1),
  ('artillery', 1),
  ('purslane', 1),
  ('whiz', 1),
  ('erika', 1),
  ('deteriorate', 1),
  ('deceitful', 1),
  ('anticrepuscular', 1),
  ('woodside', 1),
  ('priory', 1),
  ('printed', 1),
  ('desi', 1),
  ('historicalness', 1),
  ('welfare', 1),
  ('distract', 1),
  ('drench', 1)])

In [None]:
train_tweets = train_tweets_copy["tweet"].copy()
test_tweets = test_tweets_copy["tweet"].copy()

In [6]:
"""
#clean data
for i in range(len(train_tweets)):
    train_tweets[i] = re.compile(r'[^a-z0-9\s]').sub(r'', re.compile(r'[\W]').sub(r' ', train_tweets[i].lower()))
for i in range(len(test_tweets)):
    test_tweets[i] = re.compile(r'[^a-z0-9\s]').sub(r'', re.compile(r'[\W]').sub(r' ', test_tweets[i].lower()))
"""

max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_tweets)
train_tweets = tokenizer.texts_to_sequences(train_tweets)
test_tweets = tokenizer.texts_to_sequences(test_tweets)

In [7]:
print("Text preprocessing ...")
max_length = max(max(len(train_r) for train_r in train_tweets), max(len(train_r) for train_r in test_tweets))
train_tweets = tf.keras.preprocessing.sequence.pad_sequences(train_tweets, maxlen=max_length)
test_tweets = tf.keras.preprocessing.sequence.pad_sequences(test_tweets, maxlen=max_length)

Text preprocessing ...


In [8]:
print("Splitting dataset ...")
train_tweets, validation_tweets, train_emojis, validation_emojis = train_test_split(train_tweets, train_emojis, test_size=0.2)

input = layers.Input(shape=(max_length,))
x = layers.Embedding(max_features, 128)(input)
x = layers.LSTM(128, return_sequences=False)(x)
x = layers.Dropout(0.5)(x)
# x = layers.LSTM(128, return_sequences=False)(x)
# x = layers.Dropout(0.5)(x)
output = layers.Dense(7, activation="softmax")(x)

model = models.Model(inputs=input, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


try:
    model.load_weights('weights/emojis.hdf5')
    print("\nLoading previous model weights:\n")
except:
    print("\nNo weights found. Training new model\n")

print("Training Model:\n")
model.fit(train_tweets, to_categorical(train_emojis), batch_size=128, epochs=2, validation_data=(validation_tweets, to_categorical(validation_emojis)))

preds = model.predict(test_tweets)
print("\nPredictions:\n")
print(preds, "\n\n")

print('Accuracy score: {:0.4}'.format(accuracy_score(test_emojis, np.argmax(preds, axis=1))))

print("\nTesting model: ")

predict_emoji("I love this picture")
predict_emoji("Look at the sunset")
predict_emoji("This is so sad")
predict_emoji("Bolt won the race again! First place")

print("\n\nSaving model weights ...")
model.save_weights('weights/emojis.hdf5')


Splitting dataset ...

No weights found. Training new model

Training Model:

Epoch 1/2
Epoch 2/2

Predictions:

[[0.2602802  0.2907211  0.3418643  ... 0.02481917 0.01989494 0.02661566]
 [0.5523219  0.28275636 0.06769247 ... 0.02009727 0.01078538 0.00885693]
 [0.89823985 0.01107103 0.03667618 ... 0.00526081 0.00897477 0.00517573]
 ...
 [0.24679469 0.11362917 0.16721694 ... 0.00993871 0.01319369 0.00731013]
 [0.15840226 0.12648988 0.54765606 ... 0.0113587  0.03268366 0.02481087]
 [0.65842324 0.16152276 0.05382883 ... 0.05396543 0.01665795 0.013126  ]] 


Accuracy score: 0.5342

Testing model: 
'I love this picture' got the emoji:  0
'Look at the sunset' got the emoji:  0
'This is so sad' got the emoji:  0
'Bolt won the race again! First place' got the emoji:  0


Saving model weights ...


OSError: Unable to create file (unable to open file: name = 'weights/emojis.hdf5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 302)