In [38]:
%reload_ext autoreload
%autoreload 2
import os

import numpy as np
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer  # for tokenization only
from sklearn.feature_extraction.text import TfidfVectorizer

from utilities import data_handler

# THE CELL BELOW CONTAINS THE FILENAMES TO CHANGE

In [39]:
# All inputs and outputs live under one local data root; edit these prefixes
# to point the notebook at a different checkout or event.
_DATA_ROOT = '/Users/ianmagnusson/IITUDND/data/'
_CRISIS_MMD_ROOT = _DATA_ROOT + 'CrisisMMD_v1.0/'
_FEATURES_ROOT = _DATA_ROOT + 'extracted_features/'

# Directory holding the image-feature archives (labeled.npz / unlabeled.npz).
IMAGE_FEAT_DIR = _FEATURES_ROOT + 'resnet/maria/'
# Tweet JSON files and the class annotation table handed to the data handler.
UNLABLED_DATA = _DATA_ROOT + 'retrieved_data/tweets/maria_extras.json'
LABLED_DATA = _CRISIS_MMD_ROOT + 'json/hurricane_maria_final_data.json'
CLASS_DATA = _CRISIS_MMD_ROOT + 'annotations/hurricane_maria_final_data.tsv'
# Root directory under which one sub-directory per fold is written.
NPY_OUTPUT_DIR = _FEATURES_ROOT + 'combined_NLP/maria/kfold/'

# Get tweets from file and split into k-fold train/test sets

In [40]:
N_FOLDS = 10  # value passed to get_k_fold_split

# Build the handler from the unlabeled tweets, labeled tweets and class annotations.
data = data_handler.DataHandler(UNLABLED_DATA, LABLED_DATA, CLASS_DATA)

# Only the second element of the k-fold result is used by the rest of the notebook.
_, validation_sets = data.get_k_fold_split(N_FOLDS)

# Unused single-split alternative, kept for reference:
# test_labeled, test_histories, test_histories_by_target, train_merged, train_classes, test_classes = data.get_train_test_split()


FileNotFoundError: [Errno 2] No such file or directory: '/Users/ianmagnusson/IITUDND/data/CrisisMMD_v1.0/annotations/hurricane_maria_final_data.tsv'

In [None]:
# Build one lookup of image feature vectors from the labeled and unlabeled
# archives. Labeled entries go in first; an unlabeled entry that maps to the
# same key replaces it, so an image present in both archives is stored once.
image_features = {}

# The archives are opened as context managers so the underlying files are
# closed once their arrays have been read into the dictionary.
with np.load(IMAGE_FEAT_DIR + 'labeled.npz') as labeled_npz:
    for name in labeled_npz.files:
        image_features[name] = labeled_npz[name]

with np.load(IMAGE_FEAT_DIR + 'unlabeled.npz') as unlabeled_npz:
    for name in unlabeled_npz.files:
        # Unlabeled names presumably look like '<tweet id>_<username>_<index>.jpg'
        # (TODO confirm). Cut out the username by keeping only the first and last
        # '_'-separated pieces, so usernames that themselves contain underscores
        # are still removed in full.
        parts = name.split('_')
        image_features[parts[0] + '_' + parts[-1]] = unlabeled_npz[name]



## Set up glove

In [None]:
# Path of the GloVe vectors after conversion to the word2vec text format that gensim loads.
conversion_file = '../models/gensim_glove.txt'
# One-time conversion of the raw GloVe file into that format (left commented out;
# uncomment and run once if conversion_file has not been generated yet).
# Tutorial: https://radimrehurek.com/gensim/scripts/glove2word2vec.html
# _ = glove2word2vec('/Users/ianmagnusson/IITUDND/models/glove.twitter.27B.200d.txt', conversion_file)

In [None]:
# load model, NOTE this is very slow!
# `glove` is read as a module-level global by embed_tweets and embed_histories.
glove = KeyedVectors.load_word2vec_format(conversion_file)

# Extract features


In [None]:
# CODE FOR EXTRACTIONS

# Width of the GloVe embedding rows allocated by embed_tweets / embed_histories.
# Must match the loaded model (the conversion cell references a 200d GloVe file).
EMBED_DIM = 200

def embed_tweets(tweet_jsons):
    """Embed each tweet as the mean GloVe vector of its in-vocabulary tokens.

    Parameters
    ----------
    tweet_jsons : sequence of dict
        Tweets; only the 'text' entry of each one is read.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(tweet_jsons), EMBED_DIM). Row i is the average of the
        GloVe vectors of tweet i's lower-cased tokens, after dropping English
        stop words and tokens missing from the GloVe vocabulary. A tweet with no
        in-vocabulary token keeps an all-zero row.
    """
    X_embedded = np.zeros((len(tweet_jsons), EMBED_DIM))
    tokenizer = CountVectorizer().build_tokenizer()
    for i, tweet_json in enumerate(tweet_jsons):
        text = tweet_json['text'].lower()
        tokens = [token for token in tokenizer(text) if token not in ENGLISH_STOP_WORDS]
        num_in_vocab = 0
        for token in tokens:
            if token in glove:
                X_embedded[i] += glove[token]
                num_in_vocab += 1
        # Only average when something was summed; dividing the zero row by zero
        # would fill it with NaN and poison every downstream model.
        if num_in_vocab > 0:
            X_embedded[i] /= num_in_vocab
    return X_embedded

def embed_histories(histories):
    """Embed each tweet history as the mean GloVe vector of all its tokens.

    Parameters
    ----------
    histories : sequence of sequence of dict
        One history per row; each history is a sequence of tweets of which only
        the 'text' entry is read.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(histories), EMBED_DIM). Row i averages the GloVe
        vectors of the tokens of history i's tweets (texts lower-cased and joined
        with spaces, English stop words and out-of-vocabulary tokens dropped).
        A history with no in-vocabulary token keeps an all-zero row.
    """
    X_embedded = np.zeros((len(histories), EMBED_DIM))
    tokenizer = CountVectorizer().build_tokenizer()
    for i, history in enumerate(histories):
        text = ' '.join([tweet_json['text'].lower() for tweet_json in history])
        tokens = [token for token in tokenizer(text) if token not in ENGLISH_STOP_WORDS]
        num_in_vocab = 0
        for token in tokens:
            if token in glove:
                X_embedded[i] += glove[token]
                num_in_vocab += 1
        # Empty histories (or ones without any known token) would otherwise be
        # divided by zero and become NaN rows; leave them as zeros instead.
        if num_in_vocab > 0:
            X_embedded[i] /= num_in_vocab
    return X_embedded

def proccess_seq(histories_by_target):
    """Embed every history separately with embed_tweets.

    Returns a list with one 2d array per history, each of shape
    (len(history), embed_dim), in the same order as histories_by_target.
    """
    return [embed_tweets(target_history) for target_history in histories_by_target]

# Number of components of the truncated SVD fitted on the TF-IDF text features.
SVD_COMPONENTS = 200
# Number of components of the truncated SVD fitted on the image feature vectors.
IMG_SVD_COMPONENTS = 400

def construct_vectorizer_and_SVD(merged):
    """Fit the TF-IDF vectorizer and its truncated SVD on the given tweets.

    The 'text' of every tweet in `merged` is lower-cased and used as one
    document. Returns the fitted (vectorizer, svd) pair.
    """
    corpus = [tweet['text'].lower() for tweet in merged]
    tfidf = TfidfVectorizer(stop_words='english')
    reducer = TruncatedSVD(n_components=SVD_COMPONENTS, n_iter=7, random_state=42)
    reducer.fit(tfidf.fit_transform(corpus))
    return tfidf, reducer


def construct_image_svd(image_features, train_merged):
    '''
    Fits image SVD on all images from train_merged

    Parameters
    ----------
    image_features : mapping
        Image feature vectors keyed by '<tweet id>_<image index>.jpg'.
    train_merged : iterable of dict
        Tweets; images are enumerated from tweet['extended_entities']['media'].

    Returns
    -------
    TruncatedSVD
        SVD with IMG_SVD_COMPONENTS components fitted on every image vector found.

    Raises
    ------
    ValueError
        If no image vector could be found for any tweet, since there is then
        nothing to fit the SVD on.
    '''
    image_features_error = 0  # media entries with no stored feature vector
    lookup_error = 0          # tweets without an 'extended_entities' entry
    found_vectors = []
    for tweet in train_merged:
        if 'extended_entities' not in tweet:
            lookup_error += 1
            continue
        for i in range(len(tweet['extended_entities']['media'])):
            imageID = str(tweet['id']) + '_' + str(i) + '.jpg'
            try:
                found_vectors.append(image_features[imageID])
            except KeyError:
                image_features_error += 1
    print(image_features_error, "cant-find-vector errors and ", lookup_error, "lookup errors for extended entities")
    # Fail with an explicit message instead of an opaque indexing/stacking error.
    if not found_vectors:
        raise ValueError("construct_image_svd: no image feature vectors found for the given tweets")
    X_img_seq = np.stack(found_vectors, axis=0)  # one row per image
    print("Shape of SVD input:", X_img_seq.shape)
    svd = TruncatedSVD(n_components=IMG_SVD_COMPONENTS, n_iter=7, random_state=42)
    svd.fit(X_img_seq)
    return svd

def _labeled_image_svds(ImgSVD, tweets, features, split_name):
    """Return one SVD-reduced image entry per tweet (input order) and print a summary."""
    svds = []
    noKeyMatch = 0
    noEntities = 0
    for tweet in tweets:
        images = []
        if 'extended_entities' in tweet:
            # Index by the media position j itself: a missing image must not
            # prevent the remaining images of the same tweet from being found.
            for j in range(len(tweet['extended_entities']['media'])):
                try:
                    images.append(features[str(tweet['id']) + '_' + str(j) + '.jpg'])
                except KeyError:
                    noKeyMatch += 1
        else:
            noEntities += 1
        if images:
            # stack the tweet's image vectors into a matrix and reduce it
            svds.append(ImgSVD.transform(np.stack(images, axis=0)))
        else:
            # Placeholder so the output stays aligned with the input tweets.
            # NOTE(review): the placeholder is a list wrapping a (1, n_components)
            # array, unlike the bare 2d arrays above -- kept as in the original;
            # confirm what downstream consumers expect.
            svds.append([np.zeros((1, ImgSVD.n_components))])
    print("SVDs composed for labeled " + split_name + " tweets with", noEntities, "cases without images and",
          noKeyMatch, "failed image matches")
    return svds


def vectorize_labeledimgs(ImgSVD, train_labeled, test_labeled, features=None):
    """SVD-reduce the images attached to each labeled train and test tweet.

    Parameters
    ----------
    ImgSVD : fitted SVD
        Object exposing transform() and n_components (e.g. the result of
        construct_image_svd).
    train_labeled, test_labeled : iterable of dict
        Labeled tweets; images are enumerated from
        tweet['extended_entities']['media'] and looked up under the key
        '<tweet id>_<image index>.jpg'.
    features : mapping, optional
        Image feature lookup. Defaults to the notebook-level `image_features`.

    Returns
    -------
    (train_svds, test_svds) : tuple of list
        One entry per input tweet, in order. A tweet with images yields an
        array of shape (#images found, n_components); a tweet without any
        usable image yields a zero placeholder.
    """
    if features is None:
        features = image_features
    train_svds = _labeled_image_svds(ImgSVD, train_labeled, features, "train")
    test_svds = _labeled_image_svds(ImgSVD, test_labeled, features, "test")
    return train_svds, test_svds  # Each returned list has the same length as train/test.

def vectorize_images(image_features, histories_by_target, ImgSVD):
    """Build the three image representations for every target's tweet history.

    Parameters
    ----------
    image_features : dict
        Image feature vectors keyed by '<tweet id>_<image index>.jpg'.
    histories_by_target : iterable of iterable of dict
        One history (sequence of tweets) per target; images are enumerated
        from tweet['extended_entities']['media'].
    ImgSVD : fitted SVD
        Object exposing transform() and n_components.

    Returns
    -------
    lstmHistories : list
        Per target, a list with one SVD-reduced (#images, n_components) matrix
        for each history tweet that had at least one image found; a single
        zero matrix of shape (1, n_components) if there was none.
    nMatrices : list
        Per target, one SVD-reduced matrix of all images found in its history,
        or a zero placeholder if there was none.
    nVectors : numpy.ndarray
        One row per target: the SVD-reduced mean of its raw image vectors
        (the reduced zero vector for targets without images).
    """
    reduced_dim = ImgSVD.n_components
    # Width of a raw feature vector, used for the zero mean-vector placeholder.
    # Taken from the data when possible; 2048 is the original hard-coded value.
    sample = next(iter(image_features.values()), None)
    raw_dim = len(sample) if sample is not None else 2048

    lstmHistories = []  # first output: for the LSTMs
    nMatrices = []      # second output: matrix of all images for each target
    nVectors = []       # third output: mean raw image vector for each target
    noEntities = 0
    noKeyMatch = 0
    for target in histories_by_target:
        allTargetImages = []
        historyTweets = []
        for historicTweet in target:
            if 'extended_entities' not in historicTweet:
                noEntities += 1
                continue
            tweetImages = []
            # Index by the media position j itself: a missing image must not
            # prevent the remaining images of the same tweet from being found.
            for j in range(len(historicTweet['extended_entities']['media'])):
                try:
                    vector = image_features[str(historicTweet['id']) + '_' + str(j) + '.jpg']
                except KeyError:
                    noKeyMatch += 1
                    continue
                tweetImages.append(vector)      # for LSTM output
                allTargetImages.append(vector)  # for MLP output
            if tweetImages:
                historyTweets.append(ImgSVD.transform(np.stack(tweetImages, axis=0)))

        if allTargetImages:
            stacked = np.stack(allTargetImages, axis=0)
            nVectors.append(stacked.mean(axis=0))  # mean is taken before the SVD
            nMatrices.append(ImgSVD.transform(stacked))
        else:
            # NOTE(review): placeholder is a list wrapping a (1, n_components)
            # array, unlike the bare 2d arrays above -- kept as in the original.
            nMatrices.append([np.zeros((1, reduced_dim))])
            nVectors.append(np.zeros(raw_dim))

        if historyTweets:
            lstmHistories.append(historyTweets)
        else:
            lstmHistories.append([np.zeros((1, reduced_dim))])

    print("Image vectors composed for", len(lstmHistories), "targets with", noEntities,
          "tweets without images and", noKeyMatch, "failed image matches")

    # Nothing to stack or transform when there are no targets at all.
    if not nVectors:
        return lstmHistories, nMatrices, np.zeros((0, reduced_dim))

    # stack up the mean image vectors and reduce them in one go
    nVectors = ImgSVD.transform(np.stack(nVectors, axis=0))
    return lstmHistories, nMatrices, nVectors


def vectorize_histories(histories, vectorizer, svd):
    """Turn each history into one feature row via the vectorizer and the SVD.

    The lower-cased texts of a history's tweets are joined with single spaces
    into one document per history; documents keep the order of `histories`.
    Returns svd.transform(vectorizer.transform(documents)).
    """
    documents = [
        ' '.join(tweet['text'].lower() for tweet in history)
        for history in histories
    ]
    return svd.transform(vectorizer.transform(documents))


def vectorize_tweets(tweets, vectorizer, svd):
    """Turn each tweet into one feature row via the vectorizer and the SVD.

    Every tweet's 'text' is lower-cased and used as one document, in input
    order. Returns svd.transform(vectorizer.transform(documents)).
    """
    documents = [tweet['text'].lower() for tweet in tweets]
    return svd.transform(vectorizer.transform(documents))

def proccess_seq_tfidf(histories_by_target, vectorizer, svd):
    """Vectorize every history separately with vectorize_tweets.

    Returns a list with one 2d array per history, each of shape
    (len(seq), SVD_COMPONENTS), in the same order as histories_by_target.
    """
    return [
        vectorize_tweets(target_history, vectorizer, svd)
        for target_history in histories_by_target
    ]

In [None]:
# Extract and save every feature set for each cross-validation fold.
for i, val_set in enumerate(validation_sets):
    print(i)

    # np.save / np.savez do not create missing directories, so make sure the
    # fold's output directory exists before anything is written to it.
    out_dir = NPY_OUTPUT_DIR + str(i) + '/'
    os.makedirs(out_dir, exist_ok=True)

    train_labeled, train_histories, train_histories_by_target, train_classes, train_merged = val_set[0]
    test_labeled, test_histories, test_histories_by_target, test_classes, _ = val_set[1]

    # image features: the SVD is fitted on the fold's training tweets only
    imageSVD = construct_image_svd(image_features, train_merged)
    trainMatrix_LSTM, trainMatrix_AllImgsMLP, trainMatrix_MeanImgMLP = vectorize_images(image_features, train_histories, imageSVD)
    testMatrix_LSTM, testMatrix_AllImgsMLP, testMatrix_MeanImgMLP = vectorize_images(image_features, test_histories, imageSVD)

    # classes
    y_train = np.array(train_classes)
    y_test = np.array(test_classes)

    # glove baseline
    X_labeled_train = embed_tweets(train_labeled)
    X_histories_train = embed_histories(train_histories)
    X_labeled_test = embed_tweets(test_labeled)
    X_histories_test = embed_histories(test_histories)

    # glove sequence features
    X_seq_train = proccess_seq(train_histories_by_target)
    X_seq_test = proccess_seq(test_histories_by_target)

    # tfidf baseline: vectorizer and SVD are fitted on the fold's training tweets only
    vectorizer, svd = construct_vectorizer_and_SVD(train_merged)
    trainHistories = vectorize_histories(train_histories, vectorizer, svd)
    trainTweets = vectorize_tweets(train_labeled, vectorizer, svd)
    testHistories = vectorize_histories(test_histories, vectorizer, svd)
    testTweets = vectorize_tweets(test_labeled, vectorizer, svd)

    # tfidf sequence features
    X_seq_tfidf_train = proccess_seq_tfidf(train_histories_by_target, vectorizer, svd)
    X_seq_tfidf_test = proccess_seq_tfidf(test_histories_by_target, vectorizer, svd)

    # save data
    np.save(out_dir + 'y_train.npy', y_train)
    np.save(out_dir + 'y_test.npy', y_test)

    np.save(out_dir + 'X_labeled_train.npy', X_labeled_train)
    np.save(out_dir + 'X_histories_train.npy', X_histories_train)
    np.save(out_dir + 'X_labeled_test.npy', X_labeled_test)
    np.save(out_dir + 'X_histories_test.npy', X_histories_test)

    # sequence features are lists of per-target arrays: one archive entry per target
    np.savez(out_dir + 'X_seq_glove_train.npz', *X_seq_train)
    np.savez(out_dir + 'X_seq_glove_test.npz', *X_seq_test)

    np.save(out_dir + 'trainHistories.npy', trainHistories)
    np.save(out_dir + 'trainTweets.npy', trainTweets)
    np.save(out_dir + 'testHistories.npy', testHistories)
    np.save(out_dir + 'testTweets.npy', testTweets)
    np.save(out_dir + 'trainClassifications.npy', train_classes)
    np.save(out_dir + 'testClassifications.npy', test_classes)

    np.savez(out_dir + 'X_seq_tfidf_train.npz', *X_seq_tfidf_train)
    np.savez(out_dir + 'X_seq_tfidf_test.npz', *X_seq_tfidf_test)

    np.savez(out_dir + 'images_lstm_train.npz', *trainMatrix_LSTM)
    np.savez(out_dir + 'images_lstm_test.npz', *testMatrix_LSTM)
    # NOTE(review): the per-target matrix lists can have differing row counts;
    # np.save on such a ragged list may be rejected or pickled depending on the
    # NumPy version -- confirm against the environment and the downstream loader.
    np.save(out_dir + 'images_matrixlists_train.npy', trainMatrix_AllImgsMLP)
    np.save(out_dir + 'images_matrixlists_test.npy', testMatrix_AllImgsMLP)
    # the mean-vector matrix is splatted row by row: one archive entry per target
    np.savez(out_dir + 'images_meanvecs_train.npz', *trainMatrix_MeanImgMLP)
    np.savez(out_dir + 'images_meanvecs_test.npz', *testMatrix_MeanImgMLP)




