In [2]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
pd.set_option('display.max_colwidth', 200)

import io

In [3]:
# Load the labelled training set; the cell output is its (rows, columns) shape.
train = pd.read_csv('data/train_2kmZucJ.csv')
train.shape

(7920, 3)

In [4]:
# Load the test set; the cell output is its (rows, columns) shape.
# It has one column fewer than train (presumably the missing label - verify).
test = pd.read_csv('data/test_oJQbWVk.csv')
test.shape

(1953, 2)

## Read and inspect data

In [5]:
# Class balance: fraction of rows per label value rather than raw counts.
train['label'].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

- Label 1 marks a negative tweet and label 0 marks a non-negative tweet, so roughly a quarter of the training tweets are negative (the classes are imbalanced).

In [6]:
# Show the first five rows transposed, so each row becomes a column and long tweets stay readable.
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
label,0,0,0,0,1
tweet,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


## Text Processing

In [7]:
# Strip URLs from train and test: every run of non-whitespace characters
# that starts with "http" is deleted, and the result goes to `clean_tweet`.
url_pattern = re.compile(r'http\S+')

for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].apply(lambda text: url_pattern.sub('', text))

In [8]:
# Re-inspect the first rows to compare `tweet` with the new `clean_tweet` column.
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
label,0,0,0,0,1
tweet,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!
clean_tweet,#fingerprint #Pregnancy Test #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias…,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect...,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [9]:
# remove punctuation marks
# NOTE(review): '.', ',' and "'" are not in this list, so they survive cleaning
# (e.g. "$19.95" ends up as a lone ".") - confirm this is intended.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
punctuation_chars = set(punctuation)  # built once instead of once per character


def clean_text_column(tweets):
    """Normalise a Series of tweet strings.

    Steps, in order: drop every character listed in `punctuation`, lowercase,
    replace each digit with a space, and collapse runs of whitespace into
    single spaces (which also trims both ends).

    Args:
        tweets: pandas Series of str.

    Returns:
        A new pandas Series of cleaned strings with the same index.
    """
    without_punct = tweets.apply(
        lambda x: ''.join(ch for ch in x if ch not in punctuation_chars)
    )
    lowered = without_punct.str.lower()
    # regex=True must be explicit: from pandas 2.0 the default is a literal
    # match, which would leave every digit in place.
    without_digits = lowered.str.replace("[0-9]", " ", regex=True)
    return without_digits.apply(lambda x: ' '.join(x.split()))


# apply the same cleaning to both splits
train['clean_tweet'] = clean_text_column(train['clean_tweet'])
test['clean_tweet'] = clean_text_column(test['clean_tweet'])

In [10]:
# Re-inspect the first rows to check the effect of the cleaning steps on `clean_tweet`.
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
label,0,0,0,0,1
tweet,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!
clean_tweet,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…,we love this would you go talk makememories unplug relax iphone smartphone wifi connect...,i'm wired i know i'm george i was made that way iphone cute daventry home,what amazing service apple won't even talk to me about a question i have unless i pay them . for their stupid support


## SpaCy and TensorFlow

In [12]:
# Load spaCy's 'en_core_web_sm' model with the parser and NER components disabled;
# the lemmatization helper below only reads token lemmas.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Return the lemmatized form of every string in ``texts``.

    Each text is run through the notebook-level spaCy pipeline ``nlp`` and
    the lemmas of its tokens are re-joined with single spaces.

    Args:
        texts: iterable of str.

    Returns:
        list of str, one lemmatized string per input text, in input order.
    """
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the docs in input order; same result as nlp(text) per item, less overhead.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

In [11]:
import tensorflow as tf

In [15]:
import tensorflow_hub as hub

# Load the ELMo v2 module from its TF Hub URL.
# NOTE(review): trainable=True is requested, but no cell shown here trains the
# module's weights - confirm the flag is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [16]:
# just a random sentence, used to check the module's output shape
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features: take the "elmo" entry of the module's output dict
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

# shape of that output; for this one sentence it is shown below as (1, 8, 1024)
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

- The output shape is (number of input sentences) x (maximum number of tokens in any input sentence) x (length of the ELMo vector); here that is 1 x 8 x 1024.
- Every word in the input sentence has an ELMo vector of size 1024.
- To represent a whole sentence with a single vector, take the mean of the ELMo vectors of all the words in that sentence.


In [19]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per string in ``x``.

    Args:
        x: object exposing ``tolist()`` that yields the sentences to embed
           (the notebook passes a slice of the ``clean_tweet`` column).

    Returns:
        The module's "elmo" output averaged over axis 1, as evaluated by
        ``sess.run``; presumably shape (len(x), 1024) - TODO confirm.

    NOTE(review): every call builds new ops on the default graph and re-runs
    the variable and table initializers in a fresh session.
    NOTE(review): the mean is taken over the whole of axis 1; if that axis is
    padded to the longest sentence in the batch, shorter sentences are
    averaged over padding positions too - verify (the TF Hub module also
    exposes a pooled "default" output).
    """
    token_vectors = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
    # average of ELMo features over axis 1 (the per-word axis)
    sentence_means = tf.reduce_mean(token_vectors, 1)

    with tf.Session() as sess:
        # variables and lookup tables are initialised before anything is evaluated
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        return sess.run(sentence_means)

In [20]:
# To prevent overloading, cut both frames into consecutive chunks of at most 100 rows each
def _row_chunks(frame, size=100):
    """Return consecutive row slices of ``frame`` holding at most ``size`` rows each."""
    starts = range(0, frame.shape[0], size)
    return [frame[start:start + size] for start in starts]


list_train = _row_chunks(train)
list_test = _row_chunks(test)

In [1]:
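# Extract ELMo embeddings. This takes a very long time, so the calls below stay commented out;
# the next cell loads pickled embeddings instead (presumably saved from an earlier run of these lines).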
# elmo_train = [elmo_vectors(x['clean_tweet']) for x in list_train]
# elmo_test = [elmo_vectors(x['clean_tweet']) for x in list_test]

In [13]:
# load in the averaged elmo embeddings over all words in each sentence
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# were produced locally / come from a trusted source.

# `with` closes each file as soon as it has been read (the handles were
# previously left open).
with open("data/elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train = pickle.load(pickle_in)

with open("data/elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test = pickle.load(pickle_in)

In [15]:
# Join the loaded arrays along axis 0 into a single array per split
# (presumably one array per 100-row batch - verify against how the pickles were written).
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [17]:
# Sanity check: the row count should equal the number of training tweets (7920).
elmo_train_new.shape

(7920, 1024)

In [18]:
# Sanity check: the row count should equal the number of test tweets (1953).
elmo_test_new.shape

(1953, 1024)

In [19]:
# Hold out 20% of the training embeddings (with their labels) as a validation set.
# NOTE(review): the split is not stratified although the labels are imbalanced (~74% / ~26%).

from sklearn.model_selection import train_test_split

xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit a logistic-regression classifier with default settings on the ELMo features

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lreg = LogisticRegression()
# fit() is the cell's last expression, so the fitted estimator's repr is displayed below
lreg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
# Predict labels for the held-out validation features
preds_valid = lreg.predict(xvalid)

In [24]:
# Despite the wording of the message, the number printed is the F1 score
# (f1_score called with its default arguments) on the validation split.
print('Prediction on the validation set is {}'.format(f1_score(yvalid, preds_valid)))

Prediction on the validation set is 0.7831325301204819


In [25]:
# Predict labels for the test-set ELMo features
preds_test = lreg.predict(elmo_test_new)