In [9]:
%load_ext autoreload
%autoreload 2
import os

import numpy as np
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer # for tokenization only

from utilities import data_handler

# THE CELL BELOW CONTAINS THE FILENAMES TO CHANGE

In [None]:
# Input dataset locations; both are passed to data_handler.DataHandler in a later cell.
# NOTE(review): these are absolute paths on the original author's machine — edit them
# to point at your local copies before running.
# presumably the extra (unlabeled) tweets retrieved for the same users — confirm
UNLABLED_DATA = '/Users/ianmagnusson/IITUDND/data/retrieved_data/harvey_extras.json'
# presumably the labeled CrisisMMD Hurricane Harvey tweets — confirm
LABLED_DATA = '/Users/ianmagnusson/IITUDND/data/CrisisMMD_v1.0/json/hurricane_harvey_final_data.json'

## Set up glove

In [None]:
# convert glove format to work with gensim. tutorial here https://radimrehurek.com/gensim/scripts/glove2word2vec.html
# _ = glove2word2vec('/Users/ianmagnusson/IITUDND/models/glove.twitter.27B.200d.txt', "gensim_glove.txt")

In [3]:
# load model, NOTE this is very slow!
# Reads the word vectors from "gensim_glove.txt", resolved relative to the current
# working directory. The commented-out glove2word2vec call in the previous cell
# appears to be what produces this file — run it once if the file is missing.
glove = KeyedVectors.load_word2vec_format("gensim_glove.txt")

# Extract glove embeddings

In [None]:
# Build the project's DataHandler from the two dataset paths configured above.
datahandler = data_handler.DataHandler(UNLABLED_DATA,LABLED_DATA)

# Split into train/test. Later cells read item['text'] from each labeled element and
# treat each history as an iterable of such elements.
# NOTE(review): the split logic lives in utilities.data_handler — confirm whether it
# is seeded, otherwise the saved checkpoints will differ between runs.
train_labeled, train_histories, test_labeled, test_histories = datahandler.get_train_test_split()

In [5]:
EMBED_DIM = 200  # row width of every embedding matrix; must equal the word-vector size


def _mean_word_vectors(texts, embeddings=None, stop_words=None, tokenizer=None):
    """Embed each text as the mean of its in-vocabulary, non-stop-word vectors.

    Shared implementation behind ``embed_tweets`` and ``embed_histories``.

    Args:
        texts: list of already lower-cased strings, one per output row.
        embeddings: object supporting ``token in embeddings`` and
            ``embeddings[token]``; defaults to the notebook-level ``glove``.
        stop_words: container of tokens to discard; defaults to
            ``ENGLISH_STOP_WORDS``.
        tokenizer: callable mapping a string to a list of tokens; defaults to
            ``CountVectorizer().build_tokenizer()``.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), EMBED_DIM)``. A text without any
        usable token keeps its all-zero row instead of turning into NaN.
    """
    # Defaults are resolved at call time so the notebook globals only need to
    # exist when the caller does not supply replacements.
    if embeddings is None:
        embeddings = glove
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    if tokenizer is None:
        tokenizer = CountVectorizer().build_tokenizer()

    X_embedded = np.zeros((len(texts), EMBED_DIM))
    for i, text in enumerate(texts):
        num_in_vocab = 0
        for token in tokenizer(text):
            if token in stop_words or token not in embeddings:
                continue
            X_embedded[i] += embeddings[token]
            num_in_vocab += 1
        # Guard the average: dividing the zero row by 0 would yield NaN and
        # poison every downstream model fitted on this matrix.
        if num_in_vocab:
            X_embedded[i] /= num_in_vocab
    return X_embedded


def embed_tweets(tweet_jsons, embeddings=None, stop_words=None, tokenizer=None):
    """Return one averaged word-vector row per tweet.

    Args:
        tweet_jsons: sequence of dict-like tweets; only ``tweet['text']`` is read.
        embeddings, stop_words, tokenizer: optional overrides, see
            ``_mean_word_vectors``. Omit them to use ``glove``,
            ``ENGLISH_STOP_WORDS`` and the CountVectorizer tokenizer.

    Returns:
        ``np.ndarray`` of shape ``(len(tweet_jsons), EMBED_DIM)``; tweets with no
        in-vocabulary token map to a zero vector.
    """
    texts = [tweet_json['text'].lower() for tweet_json in tweet_jsons]
    return _mean_word_vectors(texts, embeddings, stop_words, tokenizer)


def embed_histories(histories, embeddings=None, stop_words=None, tokenizer=None):
    """Return one averaged word-vector row per history.

    The texts of all tweets in a history are joined with spaces and embedded as
    a single document.

    Args:
        histories: sequence of histories, each an iterable of dict-like tweets;
            only ``tweet['text']`` is read.
        embeddings, stop_words, tokenizer: optional overrides, see
            ``_mean_word_vectors``.

    Returns:
        ``np.ndarray`` of shape ``(len(histories), EMBED_DIM)``; empty histories
        and histories with no in-vocabulary token map to a zero vector.
    """
    texts = [
        ' '.join([tweet_json['text'].lower() for tweet_json in history])
        for history in histories
    ]
    return _mean_word_vectors(texts, embeddings, stop_words, tokenizer)

In [6]:
# Sizes of the labeled splits (not used further in this cell).
train_len = len(train_labeled)
test_len = len(test_labeled)

# Embed every split: each call returns a matrix with one EMBED_DIM-wide row per
# labeled tweet (embed_tweets) or per history (embed_histories).
X_labeled_train = embed_tweets(train_labeled)
X_histories_train = embed_histories(train_histories)
X_labeled_test = embed_tweets(test_labeled)
X_histories_test = embed_histories(test_histories)
    

In [7]:
# checkpoint!
# Persist the four embedding matrices so a later session can reload them instead
# of repeating the slow GloVe load and embedding cells.
CHECKPOINT_DIR = './saved_variables/GloveExtraction_variables'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # np.save does not create missing folders

checkpoint_arrays = {
    'X_labeled_train': X_labeled_train,
    'X_histories_train': X_histories_train,
    'X_labeled_test': X_labeled_test,
    'X_histories_test': X_histories_test,
}
for array_name, array in checkpoint_arrays.items():
    np.save(os.path.join(CHECKPOINT_DIR, array_name + '.npy'), array)

# To restore from the checkpoint instead of recomputing, uncomment the lines below.
#X_labeled_train = np.load('./saved_variables/GloveExtraction_variables/X_labeled_train.npy')
#X_histories_train = np.load('./saved_variables/GloveExtraction_variables/X_histories_train.npy')
#X_labeled_test = np.load('./saved_variables/GloveExtraction_variables/X_labeled_test.npy')
#X_histories_test  = np.load('./saved_variables/GloveExtraction_variables/X_histories_test.npy')

# Reduce dimensionality with SVD (TBD)

In [14]:
# Fit a 2-component truncated SVD on the labeled training embeddings and keep the
# projected coordinates in X_SVD. random_state is fixed so reruns are repeatable.
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
X_SVD =svd.fit_transform(X_labeled_train)

  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


In [15]:
# Singular values of the two retained components.
# NOTE(review): the recorded output shows a second value of ~8e-33, i.e. effectively
# zero — worth checking whether X_labeled_train is degenerate before relying on X_SVD.
svd.singular_values_

array([5.29150262e+00, 8.23780443e-33])