https://www.analyticsvidhya.com/blog/2019/03/learn-to-use-elmo-to-extract-features-from-text/

In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# Show up to 200 characters per column so long tweets are not cut off in displayed frames
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load the training and test splits from their CSV files (train first, then test)
train, test = (pd.read_csv(csv_path) for csv_path in ("train_2kmZucJ.csv", "test_oJQbWVk.csv"))

# (rows, columns) of each split
train.shape, test.shape

((7920, 3), (1953, 2))

In [3]:
# Class balance: share of rows per label value (normalize=True returns proportions instead of counts)
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [4]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [5]:
# Strip URLs from both splits: delete every "http" together with the
# non-whitespace characters that follow it, storing the result in 'clean_tweet'
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].map(lambda tweet: re.sub(r'http\S+', '', tweet))

In [6]:
# remove punctuation marks
# (comma, period and apostrophe are not in this list, so they are kept)
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# build the lookup set once instead of once per character
punctuation_chars = set(punctuation)

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in punctuation_chars))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in punctuation_chars))

# convert text to lowercase
train['clean_tweet'] = train['clean_tweet'].str.lower()
test['clean_tweet'] = test['clean_tweet'].str.lower()

# remove numbers (each digit becomes a space)
# regex=True is explicit on purpose: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave every digit in place
train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ", regex=True)
test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ", regex=True)

# remove whitespaces: collapse runs of whitespace to one space and trim both ends
train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ' '.join(x.split()))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))


In [7]:
# Check the cleaning result: compare 'tweet' with the new 'clean_tweet' column
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect...
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i'm wired i know i'm george i was made that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple won't even talk to me about a question i have unless i pay them . for their stupid support


In [18]:
# import spaCy's language model
import en_core_web_sm

# Load the pipeline through the imported model package. The 'en' shortcut
# link previously passed to spacy.load() no longer exists in spaCy 3, while
# the package's own load() works in both spaCy 2 and spaCy 3.
# The parser and NER components are disabled because only lemmas are used.
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Lemmatize each text and return the results as space-joined strings.

    Args:
        texts: Iterable of strings (the notebook passes the 'clean_tweet'
            column of a DataFrame).

    Returns:
        list[str]: One string per input text, in input order, built from the
        lemma of every token joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of nlp(text) while producing the same documents.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

In [19]:
# Replace the cleaned tweets of each split with their lemmatized form
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])

In [20]:
# Spot-check 10 random rows after lemmatization (no random_state is passed, so the rows differ between runs)
train.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
1323,1324,0,"RT @FollowBacg: #i #justinbieber #apple #ipad #iphone June 08, 2011 at 03:01PMJune 08, 2011 at 03:01PM","rt followbacg i justinbieber apple ipad iphone june , at pmjune , at pm"
1573,1574,0,Take a glimpse at my phone ;)) Samsung Softbank . Thanks kay Father :* :* :* #fromjapan #samsung https://www.facebook.com/photo.php?fbid=507135112678890&set=a.220037171388687.54278.100001472187182...,take a glimpse at -PRON- phone samsung softbank . thank kay father fromjapan samsung …
687,688,0,Enjoy a Cup Of Happiness #instadrink #photooftheday #samsung #samsunggalaxytab #tab2 #p3100… http://instagram.com/p/c9SKyfSM8L/,enjoy a cup of happiness instadrink photooftheday samsung samsunggalaxytab tab p …
2353,2354,1,FUCK YOU WITH ALL MY HEART MOTHER FUCKERS!! #bo2 #activision #ps3 #sony #playstation #blackops2,fuck -PRON- with all -PRON- heart mother fucker bo activision p sony playstation blackop
2948,2949,0,Look I made #art with my #iphone! #art #photography #edited #photoshop #mobile #usa #crafty #nifty #50pic.twitter.com/LJfbnxw6RE,look i make art with -PRON- iphone art photography edit photoshop mobile usa crafty nifty pic.twitter.comljfbnxw re
3997,3998,0,"Good morning rm #Japan 4 whole za #world friends,have a nice day! No hunger no poverty no war no nuclear w/ #twinagoya #iPad #iPhone","good morning rm japan whole za world friend , have a nice day no hunger no poverty no war no nuclear w twinagoya ipad iphone"
5914,5915,0,Love to find pics of me! (not my pic) Ice cream time!! I missed it!! #igers #hull #unitedkingdom #icecream #happiness #smile #iphone…,love to find pic of -PRON- not -PRON- pic ice cream time i miss -PRON- iger hull unitedkingdom icecream happiness smile iphone …
4795,4796,0,Coral Roses #tapestry #iphone #duvet #comforters #art #odjects #popart #pop #spring #queenofspring #queenhttp://buff.ly/2pP2lU9,coral rose tapestry iphone duvet comforter art odject popart pop spring queenofspr queen
4257,4258,1,#fuckYou #apple I'll #buy another though #lightningcable #fail 6'ter isn't #cheap https://www.instagram.com/p/BCT4r-fCgJZ/,fuckyou apple -PRON- will buy another though lightningcable fail ' ter be not cheap
5822,5823,1,I hate turning my phone to the side to type because of the crack. Can I just get a new phone already Iphone,i hate turn -PRON- phone to the side to type because of the crack . can i just get a new phone already iphone


In [22]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo module (version 2) from TF Hub.
# NOTE(review): in the recorded run this cell failed with
# "AttributeError: module 'tensorflow' has no attribute 'init_scope'", right
# after tensorflow_hub warned that the installed TensorFlow is older than 1.14.
# The environment presumably needs a newer TensorFlow 1.x before this cell and
# the cells that use `elmo` can run — confirm.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

W0421 10:16:35.660286  7264 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


AttributeError: module 'tensorflow' has no attribute 'init_scope'

In [None]:
# One example sentence to try the module on
x = ["Roasted ants are a popular snack in Columbia"]

# Run the module, asking for its outputs as a dict, and keep the "elmo" entry
elmo_outputs = elmo(x, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]

# Shape of the extracted embeddings
embeddings.shape

In [None]:
def elmo_vectors(x):
    """Return the ELMo features of the texts in ``x``, averaged over axis 1.

    Args:
        x: Object exposing ``tolist()`` that yields the texts (the notebook
            passes a slice of the 'clean_tweet' column).

    Returns:
        The evaluated value of ``tf.reduce_mean(embeddings, 1)``
        (presumably a NumPy array with one row per text — TODO confirm).
    """
    # NOTE(review): every call builds new ops and opens a fresh session; when
    # called for many batches this likely keeps growing the default graph —
    # consider building the ops once around a placeholder. Confirm.
    texts = x.tolist()
    embeddings = elmo(texts, signature="default", as_dict=True)["elmo"]

    # average of ELMo features
    # NOTE(review): the mean runs over every position on axis 1; if the module
    # pads shorter texts in a batch, the padded positions presumably count
    # toward the average — confirm.
    averaged = tf.reduce_mean(embeddings, 1)

    with tf.Session() as sess:
        # run the variable and table initializers before evaluating the features
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(averaged)

In [None]:
# Cut each split into consecutive chunks of at most 100 rows so the ELMo
# features can be extracted one chunk at a time
list_train = [train.iloc[start:start + 100] for start in range(0, len(train), 100)]
list_test = [test.iloc[start:start + 100] for start in range(0, len(test), 100)]

In [None]:
# Extract ELMo embeddings chunk by chunk: one averaged-feature result per
# 100-row chunk of cleaned tweets, kept in chunk order
elmo_train = [elmo_vectors(batch['clean_tweet']) for batch in list_train]
elmo_test = [elmo_vectors(batch['clean_tweet']) for batch in list_test]

In [None]:
# Join the per-chunk results along the first axis (np.concatenate's default)
# into a single array per split
elmo_train_new = np.concatenate(elmo_train)
elmo_test_new = np.concatenate(elmo_test)

In [None]:
# save elmo_train_new
# (the with-block closes the file even if pickle.dump raises)
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [None]:
# NOTE: pickle.load can execute arbitrary code — only load files that were
# written by the save cell of this notebook.

# load elmo_train_new (the with-block closes the file once it has been read)
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training features and labels for validation;
# random_state is fixed so the split is the same on every run
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new, train['label'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic-regression classifier, constructed with no arguments (library
# defaults), on the training part of the ELMo features
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)

In [None]:
# Predict labels for the held-out validation features
preds_valid = lreg.predict(xvalid)

In [None]:
# F1 score of the validation predictions against the true validation labels
f1_score(yvalid, preds_valid)

In [None]:
# make predictions on test set: apply the fitted model to the test-set ELMo features
preds_test = lreg.predict(elmo_test_new)

In [None]:
# Build the submission frame: the test ids next to their predicted labels
submission_columns = {'id': test['id'], 'label': preds_test}
sub = pd.DataFrame(submission_columns)

# Save it as CSV, leaving out the index column
sub.to_csv("sub_lreg.csv", index=False)