In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.python.keras import models, layers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score
import re
import pandas
from keras.utils import to_categorical
import talos
from pandas_profiling import ProfileReport
from tokenizer import tokenizer as tknzr
from nltk.corpus import stopwords
import nltk
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chieflaki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def predict_emoji(tweet):
    """Print the emoji class the trained model predicts for one tweet.

    The tweet is lower-cased, every non-word character is turned into a space,
    and anything that is not a-z or whitespace is then removed. The cleaned
    text is encoded with the notebook-level ``tokenizer``, padded to
    ``max_length`` and passed to ``model``; the index of the highest score is
    printed next to the original tweet.

    Args:
        tweet: Raw tweet text (str).
    """
    spaced = re.sub(r'[\W]', r' ', tweet.lower())
    letters_only = re.sub(r'[^a-z\s]', r'', spaced)

    sequences = tokenizer.texts_to_sequences(np.array([letters_only]))
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length)

    scores = model.predict(padded)
    predicted_class = np.argmax(scores, axis=1)[0]
    print("'"+tweet+"' got the emoji: ",  predicted_class)

In [6]:
# Read each pickle once and take both columns from the same frame, instead of
# deserialising every file twice.
# NOTE(review): pandas.read_pickle can execute arbitrary code while
# unpickling; only load files that come from a trusted source.
emoji_train_frame = pandas.read_pickle("emoji_train.pkl")
emoji_test_frame = pandas.read_pickle("emoji_test.pkl")

train_tweets = emoji_train_frame["tweet"]
train_emojis = emoji_train_frame["emoji_class"]
test_tweets = emoji_test_frame["tweet"]
test_emojis = emoji_test_frame["emoji_class"]

# print(train_tweets)

# print("Text preprocessing ...")

# T = tknzr.TweetTokenizer(regularize=True, preserve_handles=False, preserve_hashes=False, preserve_case=False, preserve_url=False)
# english_stops = set(stopwords.words('english'))
# for i in range(len(train_tweets)):
#     tokens = T.tokenize(train_tweets[i].lower())
#     train_tweets[i] = " ".join(tokens)
# for i in range(len(test_tweets)):
#     tokens = T.tokenize(test_tweets[i].lower())
#     test_tweets[i] = " ".join(tokens)


import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, WordNetLemmatizer

# English stop words from NLTK, held in a set; remove_stopwords tests tokens against it.
stopword_set = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stopword_set``.

    Args:
        text: Iterable of word tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in text:
        if token not in stopword_set:
            kept.append(token)
    return kept

# NOTE(review): the traceback captured at the end of this notebook
# ("No module named 'indexer'") is consistent with the unrelated PyPI package
# `spellchecker` being installed instead of `pyspellchecker`, which is the
# distribution that provides `SpellChecker` — confirm the environment.
from spellchecker import SpellChecker

# Shared spell-checker instance used by correct_spellings.
spell = SpellChecker()
def correct_spellings(text):
    """Replace misspelled tokens with the spell checker's suggestion.

    Args:
        text: List of word tokens.

    Returns:
        A new list with the same length and order as ``text``. Tokens that the
        module-level ``spell`` checker reports as unknown are replaced by its
        suggested correction; every other token is copied through unchanged.
    """
    misspelled_words = spell.unknown(text)
    corrected_text = []
    for word in text:
        if word in misspelled_words:
            # correction() may return None when it has no candidate; keep the
            # original token so later steps never receive a None entry.
            suggestion = spell.correction(word)
            corrected_text.append(suggestion if suggestion else word)
        else:
            corrected_text.append(word)
    return corrected_text

def pos_tagging(text):
    """Tag the tokens in ``text`` with ``nltk.pos_tag`` and return its result as-is.

    Args:
        text: List of word tokens.

    Returns:
        Whatever ``nltk.pos_tag`` returns for ``text``; ``lemmatize_words``
        unpacks each element as a ``(word, tag)`` pair.
    """
    tagged_tokens = nltk.pos_tag(text)
    return tagged_tokens

# Shared WordNet lemmatizer used by lemmatize_words.
lemmatizer = WordNetLemmatizer()
# Maps the first letter of a POS tag to the matching WordNet part-of-speech
# constant (N -> noun, V -> verb, J -> adjective, R -> adverb).
tag_dict = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(pos_tagged_text):
    """Lemmatize every word of a POS-tagged token list.

    Args:
        pos_tagged_text: Iterable of ``(word, tag)`` pairs.

    Returns:
        List of lemmas in input order. The first character of each tag is
        looked up in ``tag_dict``; tags without an entry are lemmatized as
        nouns.
    """
    lemmas = []
    for token, tag in pos_tagged_text:
        wordnet_pos = tag_dict.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

# Vocabulary of NLTK's `words` corpus, held in a set; remove_none_en_word keeps
# only tokens found here.
# NOTE(review): presumably the `words` corpus has to be downloaded beforehand
# (nltk.download('words')) — confirm for the target environment.
en_words_set = set(nltk.corpus.words.words())

def remove_none_en_word(text):
    """Keep only the tokens of ``text`` that appear in ``en_words_set``.

    Args:
        text: Iterable of word tokens.

    Returns:
        A new list with the matching tokens in their original order.
    """
    return list(filter(lambda token: token in en_words_set, text))

def clean_single_char(text):
    """Remove single-character words from a list of tokens.

    The substitution deletes runs of space-separated single characters. When
    it is applied to one token at a time, a one-character token becomes an
    empty string; those emptied tokens are dropped here so they do not stay in
    the list as ``''`` placeholders (which turned into doubled spaces once the
    tokens were joined back into a sentence).

    Args:
        text: Iterable of word tokens.

    Returns:
        List of the tokens that are still non-empty after the substitution,
        in their original order.
    """
    stripped = (re.sub(r'(^| ).(( ).)*( |$)', '', word) for word in text)
    return [word for word in stripped if word]

def back_to_sentence(text):
    """Join a sequence of tokens into one string separated by single spaces.

    Args:
        text: Iterable of string tokens.

    Returns:
        The tokens concatenated with one space between neighbours.
    """
    separator = " "
    sentence = separator.join(text)
    return sentence

def preprocess_tweets(tweets, spellcheck=False):
    """Run the text-cleaning pipeline over a Series of raw tweets.

    Each step is applied element-wise with ``Series.apply`` in this order:
    cleanUpTweet, removePunSymNum, normalize, tokenize, remove_stopwords,
    (optionally correct_spellings), pos_tagging, lemmatize_words,
    remove_none_en_word, clean_single_char, back_to_sentence.

    Args:
        tweets: pandas Series holding one tweet per element.
        spellcheck: When True, ``correct_spellings`` runs right after
            stop-word removal. It is off by default, matching the pipeline
            that was actually executed (the spell-checked variant was only
            kept as disabled code).

    Returns:
        A new Series with the same index whose elements are the cleaned,
        space-joined tweets. The input Series is not modified.
    """
    # NOTE(review): cleanUpTweet, removePunSymNum, normalize and tokenize are
    # not defined in the visible part of this notebook; they have to be
    # defined in an earlier cell before this function is called.
    steps = [cleanUpTweet, removePunSymNum, normalize, tokenize, remove_stopwords]
    if spellcheck:
        steps.append(correct_spellings)
    steps += [
        pos_tagging,
        lemmatize_words,
        remove_none_en_word,
        clean_single_char,
        back_to_sentence,
    ]

    cleaned = tweets
    for step in steps:
        cleaned = cleaned.apply(step)
    return cleaned


# train_tweets / test_tweets already are the "tweet" column (a Series), so the
# pipeline is applied to them directly; indexing them with ['tweet'] again
# would be a row-label lookup rather than a column selection.
train_tweets_copy = preprocess_tweets(train_tweets)
test_tweets_copy = preprocess_tweets(test_tweets)

print(train_tweets_copy.head(10))
print(test_tweets_copy.head(10))

train_tweets = train_tweets_copy.copy()
test_tweets = test_tweets_copy.copy()

# ---- Configuration ---------------------------------------------------------
max_features = 3000   # vocabulary size kept by the tokenizer / embedding rows
NUM_CLASSES = 7       # width of the output layer and of the one-hot labels
SEED = 42             # makes the train/validation split repeatable

# ---- Text -> padded integer sequences --------------------------------------
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_tweets)

# (An earlier experiment that pruned words seen fewer than 5 times from
# tokenizer.word_counts is disabled.)

train_tweets = tokenizer.texts_to_sequences(train_tweets)
test_tweets = tokenizer.texts_to_sequences(test_tweets)

# Pad everything to the longest sequence found in either set.
max_length = max(
    max(len(sequence) for sequence in train_tweets),
    max(len(sequence) for sequence in test_tweets),
)
train_tweets = tf.keras.preprocessing.sequence.pad_sequences(train_tweets, maxlen=max_length)
test_tweets = tf.keras.preprocessing.sequence.pad_sequences(test_tweets, maxlen=max_length)

print("Splitting dataset ...")
train_tweets, validation_tweets, train_emojis, validation_emojis = train_test_split(
    train_tweets, train_emojis, test_size=0.2, random_state=SEED
)

# ---- Model -----------------------------------------------------------------
# Named model_input / hidden so the builtin input() is not shadowed.
model_input = layers.Input(shape=(max_length,))
hidden = layers.Embedding(max_features, 160)(model_input) #128
hidden = layers.Dropout(0.4)(hidden)
hidden = layers.LSTM(224, return_sequences=False)(hidden) #128
hidden = layers.Dense(96, activation='sigmoid')(hidden) #96

output = layers.Dense(NUM_CLASSES, activation="sigmoid")(hidden)

model = models.Model(inputs=model_input, outputs=output)
# NOTE(review): a sigmoid output trained with Poisson loss on one-hot labels is
# unusual for single-label classification; softmax + categorical_crossentropy
# (the alternative named in the trailing comment) is the conventional pairing.
# Left unchanged so the existing training behaviour is preserved.
model.compile(loss='poisson', optimizer='adam', metrics=['accuracy']) # categorical_crossentropy

# Resuming from earlier weights is best-effort, but the reason for a failure
# is reported instead of being swallowed by a bare except.
try:
    model.load_weights('weights/emojis.hdf5')
    print("\nLoading previous model weights:\n")
except Exception as err:
    print("\nNo weights found. Training new model\n")
    print("Reason:", err)

# ---- Training --------------------------------------------------------------
print("Training Model:\n")
my_callbacks = [
    # restore_best_weights: evaluate and save the best epoch rather than the
    # weights from `patience` epochs after it.
    tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_accuracy', restore_best_weights=True),
]
# use_multiprocessing was dropped: it only applies to generator / Sequence
# inputs, and these are in-memory arrays.
# num_classes pins the one-hot width to the output layer even if a split
# happens not to contain the highest class id.
model.fit(
    train_tweets,
    to_categorical(train_emojis, num_classes=NUM_CLASSES),
    callbacks=my_callbacks,
    batch_size=128,
    epochs=30,
    validation_data=(
        validation_tweets,
        to_categorical(validation_emojis, num_classes=NUM_CLASSES),
    ),
)

# ---- Evaluation ------------------------------------------------------------
preds = model.predict(test_tweets)
print("\nPredictions:\n")
print(preds, "\n\n")

print('Accuracy score: {:0.4}'.format(accuracy_score(test_emojis, np.argmax(preds, axis=1))*100))

print("\nTesting model: ")

# ---- Persist weights -------------------------------------------------------
print("\n\nSaving model weights ...")
# Create the target directory first so a missing folder cannot discard the
# result of a full training run.
tf.io.gfile.makedirs('weights')
# NOTE(review): weights are loaded from 'weights/emojis.hdf5' above but saved
# to a different file here, so a later run will not resume from this save —
# confirm that this is intended.
model.save_weights('weights/noPrepEmojis.hdf5')

ModuleNotFoundError: No module named 'indexer'