In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# show up to 200 characters of each column so whole tweets are readable
pd.options.display.max_colwidth = 200

In [2]:
# read the training and test CSV files
train = pd.read_csv(filepath_or_buffer="data/train_2kmZucJ.csv")
test = pd.read_csv(filepath_or_buffer="data/test_oJQbWVk.csv")

In [3]:
# (rows, columns) of each frame, one per line
print(train.shape, test.shape, sep="\n")

(7920, 3)
(1953, 2)


In [4]:
# relative frequency of each class in the training labels
train.loc[:, 'label'].value_counts(normalize=True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [5]:
# strip hyperlinks from train and test: "http" followed by any run of
# non-whitespace characters is deleted
url_pattern = re.compile(r'http\S+')

train['clean_tweet'] = train['tweet'].map(lambda text: url_pattern.sub('', text))
test['clean_tweet'] = test['tweet'].map(lambda text: url_pattern.sub('', text))

In [6]:
# Characters stripped from the tweets. Apostrophes, commas and periods are not
# in this list, so they survive the cleaning step.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'


def _normalize_tweets(tweets):
    """Strip punctuation, lowercase, blank out digits and collapse whitespace.

    Parameters
    ----------
    tweets : pandas.Series of str
        Text to clean; every element must be a string.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned text.
    """
    # Build the deletion table once instead of rebuilding a set per character.
    drop_table = str.maketrans('', '', punctuation)
    return (
        tweets
        # remove punctuation marks
        .apply(lambda text: text.translate(drop_table))
        # convert text to lowercase
        .str.lower()
        # replace digits with a space; regex=True is spelled out because
        # pandas 2.0 changed the default to a literal (non-regex) match,
        # which would leave every digit in place
        .str.replace("[0-9]", " ", regex=True)
        # collapse runs of whitespace and trim both ends
        .apply(lambda text: ' '.join(text.split()))
    )


train['clean_tweet'] = _normalize_tweets(train['clean_tweet'])
test['clean_tweet'] = _normalize_tweets(test['clean_tweet'])

In [7]:
# Load spaCy's English pipeline with the parser and NER components disabled.
# NOTE(review): the 'en' shortcut name is a spaCy v2 convention - confirm the
# installed spaCy version still resolves it.
nlp = spacy.load(name='en', disable=['parser', 'ner'])

def lemmatization(texts):
    """Lemmatize every text with the module-level spaCy pipeline ``nlp``.

    Each text is run through ``nlp`` and the ``lemma_`` of its tokens are
    joined with single spaces.

    Parameters
    ----------
    texts : iterable of str
        Texts to process.

    Returns
    -------
    list of str
        One lemmatized string per input text, in input order.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in texts
    ]

In [8]:
# overwrite the cleaned tweets with their lemmatized form
train['clean_tweet'] = lemmatization(texts=train['clean_tweet'])
test['clean_tweet'] = lemmatization(texts=test['clean_tweet'])

# Use the pre-trained ELMo model

In [9]:
import tensorflow_hub as hub
import tensorflow as tf
# instantiate the ELMo v2 module from TensorFlow Hub
elmo = hub.Module(spec="https://tfhub.dev/google/elmo/2", trainable=True)

In [10]:
# a single example sentence to sanity-check the module
x = ["Roasted ants are a popular snack in Columbia"]

# run the module and keep its "elmo" output
elmo_outputs = elmo(x, signature="default", as_dict=True)
embeddings = elmo_outputs["elmo"]

embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [11]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per input text.

    The texts are passed to the module-level ``elmo`` hub module, the symbolic
    shape of its "elmo" output is printed, and that output is averaged over
    axis 1 before being evaluated in a TensorFlow session.

    Parameters
    ----------
    x : pandas.Series, numpy.ndarray or any iterable of str
        Texts to embed. Objects with a ``tolist`` method are converted with
        it; anything else (list, tuple, generator) is converted with ``list``.

    Returns
    -------
    The value produced by ``sess.run`` for the axis-1 mean of the "elmo"
    output.

    Notes
    -----
    Every call opens a new session and re-runs the initializers, and every
    call adds new ops for ``elmo(...)`` and ``reduce_mean`` to the graph.

    NOTE(review): the mean is taken over every position of the batch tensor.
    If shorter texts are zero-padded to the longest one in the batch, their
    vectors are scaled by len/max_len and depend on batch composition. The
    module's "default" output may be the better choice - TODO confirm.
    """
    sentences = x.tolist() if hasattr(x, "tolist") else list(x)
    embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]
    print(embeddings.shape)

    # average of ELMo features along axis 1
    pooled = tf.reduce_mean(embeddings, 1)

    with tf.Session() as sess:
        # Initialize variables and lookup tables together, as in the TF-Hub
        # usage recipe for TF1 modules.
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(pooled)

In [12]:
# cut both frames into consecutive chunks of at most 100 rows
list_train = [train.iloc[start:start + 100] for start in range(0, len(train), 100)]
list_test = [test.iloc[start:start + 100] for start in range(0, len(test), 100)]

In [13]:
# Run every batch of cleaned tweets through elmo_vectors, one result per batch
elmo_train = [elmo_vectors(batch['clean_tweet']) for batch in list_train]
elmo_test = [elmo_vectors(batch['clean_tweet']) for batch in list_test]

(100, 51, 1024)
(100, 41, 1024)
(100, 37, 1024)
(100, 45, 1024)
(100, 39, 1024)
(100, 37, 1024)
(100, 36, 1024)
(100, 53, 1024)
(100, 36, 1024)
(100, 40, 1024)
(100, 45, 1024)
(100, 50, 1024)
(100, 51, 1024)
(100, 50, 1024)
(100, 36, 1024)
(100, 53, 1024)
(100, 39, 1024)
(100, 54, 1024)
(100, 32, 1024)
(100, 40, 1024)
(100, 37, 1024)
(100, 34, 1024)
(100, 53, 1024)
(100, 48, 1024)
(100, 52, 1024)
(100, 32, 1024)
(100, 38, 1024)
(100, 42, 1024)
(100, 42, 1024)
(100, 33, 1024)
(100, 36, 1024)
(100, 36, 1024)
(100, 36, 1024)
(100, 30, 1024)
(100, 40, 1024)
(100, 41, 1024)
(100, 40, 1024)
(100, 36, 1024)
(100, 41, 1024)
(100, 38, 1024)
(100, 35, 1024)
(100, 37, 1024)
(100, 43, 1024)
(100, 40, 1024)
(100, 35, 1024)
(100, 39, 1024)
(100, 36, 1024)
(100, 51, 1024)
(100, 37, 1024)
(100, 41, 1024)
(100, 31, 1024)
(100, 36, 1024)
(100, 41, 1024)
(100, 41, 1024)
(100, 33, 1024)
(100, 39, 1024)
(100, 43, 1024)
(100, 34, 1024)
(100, 36, 1024)
(100, 43, 1024)
(100, 46, 1024)
(100, 35, 1024)
(100, 48

In [14]:
# join the per-batch arrays along axis 0 into one array per dataset
elmo_train_new, elmo_test_new = (
    np.concatenate(batches, axis=0) for batches in (elmo_train, elmo_test)
)

In [15]:
# save elmo_train_new; the with-block closes the file even if dump() raises
with open("elmo_train_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [16]:
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself.

# load elmo_train_new; the with-block closes the file handle after reading
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

In [17]:
from sklearn.model_selection import train_test_split

# hold out 20% of the training rows for validation (fixed seed)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# logistic regression with scikit-learn's default settings
lreg = LogisticRegression()
lreg.fit(X=xtrain, y=ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# predicted labels for the held-out validation rows
preds_valid = lreg.predict(X=xvalid)

In [20]:
# F1 score of the validation predictions
f1_score(y_true=yvalid, y_pred=preds_valid)

0.7752675386444708

In [21]:
# predicted labels for the test-set embeddings
preds_test = lreg.predict(X=elmo_test_new)

In [22]:
# build the submission frame: test ids next to their predicted labels
sub = test[['id']].assign(label=preds_test)

# write it out as CSV without the index column
sub.to_csv(path_or_buf="sub_lreg.csv", index=False)