In [65]:
%reload_ext autoreload
%autoreload 2
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from utilities import data_handler_old as data_handler
import numpy as np
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer # for tokenization only



from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# THE CELL BELLOW CONTAINS THE FILENAMES TO CHANGE

In [66]:
IMAGE_FEAT_DIR = '/Users/chloelarkin/Desktop/calfire/'
UNLABLED_DATA = '/Users/chloelarkin/Desktop/calfire_extras.json'
LABLED_DATA = '/Users/chloelarkin/Desktop/california_wildfires_final_data.json'
CLASS_DATA = '/Users/chloelarkin/Desktop/california_wildfires_final_data.tsv'
NPY_OUTPUT_DIR = '/Users/chloelarkin/IITUDND/data/extracted_features/images/calfire/'

In [85]:
# get img data; only enters overlapping img once
labeled_npz = np.load(IMAGE_FEAT_DIR + 'labeled.npz')
unlabeled_npz = np.load(IMAGE_FEAT_DIR + 'unlabeled.npz')

image_features = {}
for file in labeled_npz.files:
    image_features[file] = labeled_npz[file]
for file in unlabeled_npz.files:
    image_features[file] = unlabeled_npz[file]

KeyError: '917792158584922117_0.jpg is not a file in the archive'

# get tweets from file and split into test/train

In [68]:
data = data_handler.DataHandler(UNLABLED_DATA,LABLED_DATA,CLASS_DATA)

train_labeled, train_histories, train_histories_by_target, test_labeled, test_histories, test_histories_by_target, train_merged, train_classes, test_classes = data.get_train_test_split()


## Set up glove

In [69]:
conversion_file = '../models/gensim_glove.txt'
# convert glove format to work with gensim. tutorial here https://radimrehurek.com/gensim/scripts/glove2word2vec.html
# _ = glove2word2vec('/Users/ianmagnusson/IITUDND/models/glove.twitter.27B.200d.txt', conversion_file)

In [70]:
# load model, NOTE this is very slow!
# glove = KeyedVectors.load_word2vec_format(conversion_file)
glove = []

# Extract features

# Class labels



In [71]:
# build class labels

y_train = np.array(train_classes)
y_test = np.array(test_classes)

In [72]:
# checkpoint!

np.save(NPY_OUTPUT_DIR + 'y_train.npy', y_train)
np.save(NPY_OUTPUT_DIR + 'y_test.npy', y_test)


#y_train = np.load(NPY_OUTPUT_DIR + 'y_train.npy')
#y_test = np.load(NPY_OUTPUT_DIR + 'y_test.npy')

# GLOVE

In [73]:
EMBED_DIM = 200

def embed_tweets(tweet_jsons):
    X_embedded = np.zeros((len(tweet_jsons),EMBED_DIM))
    tokenizer = CountVectorizer().build_tokenizer()
    for i, tweet_json in enumerate(tweet_jsons):
        text = tweet_json['text'].lower()
        tokens = [token for token in tokenizer(text) if token not in ENGLISH_STOP_WORDS]
        num_in_vocab = 0
        for token in tokens:
            if token in glove:
                X_embedded[i] += glove[token]
                num_in_vocab += 1
        X_embedded[i] = X_embedded[i] / num_in_vocab
    return X_embedded

def embed_histories(histories):
    X_embedded = np.zeros((len(histories),EMBED_DIM))
    tokenizer = CountVectorizer().build_tokenizer()
    for i, history in enumerate(histories):
        text = ' '.join([tweet_json['text'].lower() for tweet_json in history])
        tokens = [token for token in tokenizer(text) if token not in ENGLISH_STOP_WORDS]
        num_in_vocab = 0
        for token in tokens:
            if token in glove:
                X_embedded[i] += glove[token]
                num_in_vocab += 1
        X_embedded[i] = X_embedded[i] / num_in_vocab
    return X_embedded



In [74]:
# baseline features

X_labeled_train = embed_tweets(train_labeled)
X_histories_train = embed_histories(train_histories)
X_labeled_test = embed_tweets(test_labeled)
X_histories_test = embed_histories(test_histories)

  


In [75]:
# checkpoint!
np.save(NPY_OUTPUT_DIR + 'X_labeled_train.npy', X_labeled_train)
np.save(NPY_OUTPUT_DIR + 'X_histories_train.npy', X_histories_train)
np.save(NPY_OUTPUT_DIR + 'X_labeled_test.npy', X_labeled_test)
np.save(NPY_OUTPUT_DIR + 'X_histories_test.npy', X_histories_test)



#X_labeled_train = np.load(NPY_OUTPUT_DIR + 'X_labeled_train.npy')
#X_histories_train = np.load(NPY_OUTPUT_DIR + 'X_histories_train.npy')
#X_labeled_test = np.load(NPY_OUTPUT_DIR + 'X_labeled_test.npy')
#X_histories_test = np.load(NPY_OUTPUT_DIR + 'X_histories_test.npy')


In [76]:
def proccess_seq(histories_by_target):
    X_seq = [] # a list of 2d tensors of shape (len(seq), embed_dim)
    for history in histories_by_target: # One specific tweet history
        X_seq.append(embed_tweets(history))
    return X_seq

X_seq_train = proccess_seq(train_histories_by_target)
X_seq_test = proccess_seq(test_histories_by_target)

  


In [77]:
# checkpoint!
np.savez(NPY_OUTPUT_DIR + 'X_seq_glove_train.npz', *X_seq_train)
np.savez(NPY_OUTPUT_DIR + 'X_seq_glove_test.npz', *X_seq_test)



#X_seq_glove_train = np.load(NPY_INPUT_DIR + 'X_seq_glove_train.npz')
#X_seq_glove_test = np.load(NPY_INPUT_DIR + 'X_seq_glove_test.npz')

# TF-IDF

In [78]:
SVD_COMPONENTS = 200

def construct_vectorizer_and_SVD(merged):
    allTweets = []
    for i, tweet_json in enumerate(merged):
        text = tweet_json['text'].lower()
        allTweets.append(text)
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_merged = vectorizer.fit_transform(allTweets)
    svd = TruncatedSVD(n_components=SVD_COMPONENTS, n_iter=7, random_state=42)
    svd.fit(tfidf_merged)    
    return vectorizer, svd

def vectorize_histories(histories, vectorizer, svd):
    rawHistories = [] # Will be in order
    for i, history in enumerate(histories):
        text = ' '.join([tweet_json['text'].lower() for tweet_json in history])
        rawHistories.append(text)
        
        # loop thru all tweets in hist, all img in tweet and add to vector and count and div
    histArr = vectorizer.transform(rawHistories)
    histFeatureArr = svd.transform(histArr)
    return histFeatureArr


def vectorize_tweets(tweets, vectorizer, svd):
    labeledTweets = []  # Will be in order
    for i, tweet_json in enumerate(tweets):
        text = tweet_json['text'].lower()
        labeledTweets.append(text)
    tweetArr = vectorizer.transform(labeledTweets)
    tweetFeatureArr = svd.transform(tweetArr)
    return tweetFeatureArr    

Images

In [130]:
def construct_image_svd(image_features, train_merged):
    '''
    Fits image SVD on all images from train_merged
    '''
    image_features_error = 0
    lookup_error = 0
    print(len(image_features))
    print("Example image_features key:", next(iter(image_features.keys()))) # sanity check for format of keys in img_feats
    X_img_seq = [] # create n by (SVD length) matrix - stores one vector per target tweet
    for tweet in train_merged: # specific tweet history for each item in merged
        if 'extended_entities' in tweet.keys():
            for j in range(len(tweet['extended_entities']['media'])):
                count = 0
                imageID = (str(tweet['extended_entities']['media'][j]['id']) +'_'+str(count)+'.jpg')
                print("Image ID from extended entities:", imageID)
                try:
                    # Extract the image ID for each image, and look up the image_features corresponding vec
                    X_img_seq.append(image_features[imageID])
                    print(image_features[imageID])
                    print(X_img_seq[-1])
                    count += 1
                except KeyError:
                    image_features_error += 1
                                
        else:
            lookup_error += 1
    print(image_features_error, "cant-find-vector errors and ", lookup_error, "lookup errors for extended entities")
    print(len(X_img_seq)) # checking num of successful identifications
    X_img_seq = np.hstack(X_img_seq)
    svd = TruncatedSVD(n_components=SVD_COMPONENTS, n_iter=7, random_state=42)
    svd.fit(X_img_seq)
    return svd

def vectorize_images(image_features, histories_by_target, ImgSVD):
    '''
    Method that returns n x SVD matrix. Each row is a list of matrices representing each 
    tweet in the target's history, up to and including the target
    '''
    allImageHistories = []
    for target in histories_by_target: # specific tweet history from keys:TweetID, vals=vectors
        historyTweets = []
        for historicTweet in target: # iterate through each tweet in history
            historicTweetImages = [] # start list of images in the historic tweet
            for j in range(len(historicTweet['extended_entities']['media'])):
                count = 0
                # Extract the image ID for each image, and look up the image_features corresponding vec
                historicTweetImages.append(str(image_features[(historicTweet['extended_entities']['media'][j]['id'])])+'_'+str(count)+'.jpg')
                count += 1
            historicTweetImages = ImgSVD.transform(historicTweetImages) # transform img vectors
            historicTweetImages = np.hstack(historicTweetImages) # stack img vectors into matrix
            historyTweets.append(historicTweetImages) # add img matrix to the target's hist list
        allImageHistories.append(historyTweets)
    return allImageHistories
    # Return n by (SVD length) matrix: each row contains list of matrices
    # Each matrix represents one historic tweet in that labeled tweet's history

In [131]:
# BASELINE
# assuming train_histories is a list of history IDs for each target

vectorizer, svd = construct_vectorizer_and_SVD(train_merged)
trainHistories = vectorize_histories(train_histories, vectorizer, svd)
trainTweets = vectorize_tweets(train_labeled, vectorizer, svd)
testHistories = vectorize_histories(test_histories, vectorizer, svd)
testTweets = vectorize_tweets(test_labeled, vectorizer, svd)


# New image methods
imageSVD = construct_image_svd(image_features, train_merged) # fit SVD on merged
trainImages = vectorize_images(image_features, train_histories, imageSVD)  # Get image vectors for train
testImages = vectorize_images(image_features, test_histories, imageSVD) # Get image vectors for test


1589
Example image_features key: 918273788143321088_0.jpg
Image ID from extended entities: 917791125263364096_0.jpg
Image ID from extended entities: 917787633299841024_0.jpg
Image ID from extended entities: 917500026187800576_0.jpg
Image ID from extended entities: 917500025910865920_0.jpg
Image ID from extended entities: 917623249823027201_0.jpg
Image ID from extended entities: 917792142956695552_0.jpg
Image ID from extended entities: 917715620514451457_0.jpg
Image ID from extended entities: 917794230164213760_0.jpg
Image ID from extended entities: 917433156189507585_0.jpg
Image ID from extended entities: 917433156189523968_0.jpg
Image ID from extended entities: 917433156650913792_0.jpg
Image ID from extended entities: 917793728512905216_0.jpg
Image ID from extended entities: 917793877297213440_0.jpg
Image ID from extended entities: 917792404203216896_0.jpg
Image ID from extended entities: 917696663879270400_0.jpg
Image ID from extended entities: 917773597300506624_0.jpg
Image ID from 

ValueError: need at least one array to concatenate

In [None]:
# Validation checks
print("Shape of train history np array:", trainHistories.shape)
print("Shape of train tweets np array):", trainTweets.shape)
print("Shape of test history np array:", testHistories.shape)
print("Shape of test tweets np array:", testTweets.shape)
print("Shape of train images np array:", trainImages.shape)
print("Shape of test images np array:", testImages.shape)

In [None]:
# Save to outfiles
np.save(NPY_OUTPUT_DIR + 'trainHistories.npy', trainHistories)
np.save(NPY_OUTPUT_DIR + 'trainTweets.npy', trainTweets)
np.save(NPY_OUTPUT_DIR + 'testHistories.npy', testHistories)
np.save(NPY_OUTPUT_DIR + 'testTweets.npy', testTweets)
np.save(NPY_OUTPUT_DIR + 'trainClassifications.npy', train_classes)
np.save(NPY_OUTPUT_DIR + 'testClassifications.npy', test_classes)

In [None]:
# sequence features

def proccess_seq_tfidf(histories_by_target, vectorizer, svd):
    X_seq = [] # a list of 2d tensors of shape (len(seq), SVD_COMPONENTS)
    for history in histories_by_target:
        X_seq.append(vectorize_tweets(history, vectorizer, svd))
    return X_seq

X_seq_tfidf_train = proccess_seq_tfidf(train_histories_by_target, vectorizer, svd)
X_seq_tfidf_test = proccess_seq_tfidf(test_histories_by_target, vectorizer, svd)

In [None]:
#TODO: Follow this format of saving matrices

# checkpoint!
np.savez(NPY_OUTPUT_DIR + 'X_seq_tfidf_train.npz', *X_seq_tfidf_train)
np.savez(NPY_OUTPUT_DIR + 'X_seq_tfidf_test.npz', *X_seq_tfidf_test)
np.savez(NPY_OUTPUT_DIR + 'images_train.npz', *trainImages)
np.savez(NPY_OUTPUT_DIR + 'images_test.npz', *testImages)


#X_seq_tfidf_train = np.load(NPY_OUTPUT_DIR + 'X_seq_tfidf_train.npz')
#X_seq_tfidf_test = np.load(NPY_OUTPUT_DIR + 'X_seq_tfidf_test.npz')
