In [27]:
%load_ext autoreload
%autoreload 2
import csv
import json
import math
import os
from collections import defaultdict

import nltk
import numpy as np
import tweepy
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# We must use the API to determine whether the tweets are protected
from tweepy import TweepError

from utilities import data_handler

In [36]:
# Input/output locations for this notebook.
# NOTE(review): these are absolute paths on one user's machine; they must be
# edited before the notebook can run anywhere else.

# The three input paths below are handed, in this order, to
# data_handler.DataHandler when the data is loaded.
UNLABELED_DATA = '/Users/ianmagnusson/IITUDND/data/retrieved_data/harvey_extras.json'
LABELED_DATA = '/Users/ianmagnusson/IITUDND/data/CrisisMMD_v1.0/json/hurricane_harvey_final_data.json'
CLASSIFICATIONS = '/Users/ianmagnusson/IITUDND/data/CrisisMMD_v1.0/annotations/hurricane_harvey_final_data.tsv'
# Despite the name this is a directory prefix, not a file: the save cell
# appends '<name>.npy' to it, so the trailing slash is significant.
NPY_OUTFILE = '/Users/ianmagnusson/IITUDND/data/extracted_features/TFIDF/harvey/'

In [30]:
def construct_vectorizer(merged):
    """Fit a TF-IDF vectorizer on the text of every tweet in ``merged``.

    Args:
        merged: iterable of tweet dicts, each with a ``'text'`` entry.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    # One lowercase document per tweet, in input order.
    corpus = [entry['text'].lower() for entry in merged]
    tfidf = TfidfVectorizer()
    tfidf.fit(corpus)
    return tfidf

In [31]:
def vectorize_histories(histories, vectorizer, svd=None, n_components=200):
    """Turn each history into one dense, SVD-reduced TF-IDF feature row.

    Every history is collapsed into a single lowercase document (the ``'text'``
    of its tweets joined by spaces), transformed with ``vectorizer`` and then
    projected with a truncated SVD.

    Args:
        histories: iterable of histories; each history is an iterable of
            tweet dicts with a ``'text'`` entry.
        vectorizer: object exposing ``transform(list_of_str)``. It must already
            be fitted, because only ``transform`` is called here.
        svd: optional ``TruncatedSVD``-like reducer to share between calls.
            If it is not fitted yet (no ``components_`` attribute) it is
            fitted in place on this data; if it is already fitted it is only
            applied. Passing the same instance for the train split and then
            the test split keeps both in the same latent space. When omitted,
            a fresh reducer is fitted on this call's data alone (the original
            behaviour).
        n_components: output dimensionality of the reducer created when
            ``svd`` is omitted; ignored when ``svd`` is supplied.

    Returns:
        ``np.ndarray`` with one row per history, in input order.
    """
    documents = [
        ' '.join(tweet_json['text'].lower() for tweet_json in history)
        for history in histories
    ]
    tfidf_matrix = vectorizer.transform(documents)

    if svd is None:
        svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)

    # A fitted reducer must not be re-fitted, otherwise the projection basis
    # would change between splits and the features would not be comparable.
    if hasattr(svd, 'components_'):
        reduced = svd.transform(tfidf_matrix)
    else:
        reduced = svd.fit_transform(tfidf_matrix)
    return np.asarray(reduced)

In [32]:
def vectorize_tweets(tweets, vectorizer, svd=None, n_components=200):
    """Turn each tweet into one dense, SVD-reduced TF-IDF feature row.

    Each tweet's ``'text'`` is lowercased, transformed with ``vectorizer`` and
    then projected with a truncated SVD.

    Args:
        tweets: iterable of tweet dicts, each with a ``'text'`` entry.
        vectorizer: object exposing ``transform(list_of_str)``. It must already
            be fitted, because only ``transform`` is called here.
        svd: optional ``TruncatedSVD``-like reducer to share between calls.
            If it is not fitted yet (no ``components_`` attribute) it is
            fitted in place on this data; if it is already fitted it is only
            applied. Passing the same instance for the train split and then
            the test split keeps both in the same latent space. When omitted,
            a fresh reducer is fitted on this call's data alone (the original
            behaviour).
        n_components: output dimensionality of the reducer created when
            ``svd`` is omitted; ignored when ``svd`` is supplied.

    Returns:
        ``np.ndarray`` with one row per tweet, in input order.
    """
    documents = [tweet_json['text'].lower() for tweet_json in tweets]
    tfidf_matrix = vectorizer.transform(documents)

    if svd is None:
        svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)

    # A fitted reducer must not be re-fitted, otherwise the projection basis
    # would change between splits and the features would not be comparable.
    if hasattr(svd, 'components_'):
        reduced = svd.transform(tfidf_matrix)
    else:
        reduced = svd.fit_transform(tfidf_matrix)
    return np.asarray(reduced)

In [33]:
# Load the data through the project's DataHandler, configured with the three
# input paths (UNLABELED_DATA, LABELED_DATA, CLASSIFICATIONS).
datahandler = data_handler.DataHandler(UNLABELED_DATA, LABELED_DATA, CLASSIFICATIONS)
# get_train_test_split() returns seven values, unpacked here in order.
train_labeled, train_histories, test_labeled, test_histories, merged, trainClassifications, \
testClassifications = datahandler.get_train_test_split()

# One TF-IDF vectorizer is fitted on `merged` and shared by all four
# feature-extraction calls below.
vectorizer = construct_vectorizer(merged)
# NOTE(review): each vectorize_* call fits its own TruncatedSVD on the data it
# is given, so the train and test feature matrices are projected with
# different bases; a model trained on the train features will not see the test
# features in the same latent space. Consider fitting the SVD on the train
# split and reusing it for the test split.
trainHistories = vectorize_histories(train_histories, vectorizer)
trainTweets = vectorize_tweets(train_labeled, vectorizer)
testHistories = vectorize_histories(test_histories, vectorizer)
testTweets = vectorize_tweets(test_labeled, vectorizer)

In [34]:
# Convert the train and test classification labels to NumPy arrays so they can
# be saved with np.save. The labels are kept as separate arrays; they are not
# appended to the tweet feature matrices.
trainClassifications = np.array(trainClassifications)
testClassifications = np.array(testClassifications)

In [35]:
# Validation checks
# Report the shape of every array that will be written to disk.
print("Shape of train history np array:", trainHistories.shape)
print("Shape of train tweets np array:", trainTweets.shape)
print("Shape of test history np array:", testHistories.shape)
print("Shape of test tweets np array:", testTweets.shape)
print("Shape of train classifications:", trainClassifications.shape)
print("Shape of test classifications:", testClassifications.shape)

# Within a split, histories, tweets and labels must have the same number of
# rows; fail loudly here rather than saving misaligned arrays.
assert trainHistories.shape[0] == trainTweets.shape[0] == trainClassifications.shape[0], \
    "train histories, tweets and classifications have different row counts"
assert testHistories.shape[0] == testTweets.shape[0] == testClassifications.shape[0], \
    "test histories, tweets and classifications have different row counts"

Shape of train history np array: (3200, 200)
Shape of train tweets np array): (3200, 200)
Shape of test history np array: (800, 200)
Shape of test tweets np array: (800, 200)
Shape of train classifications: (3200,)
Shape of test classifications: (800,)


In [37]:
# Save to outfiles
# NPY_OUTFILE is a directory; create it if needed so np.save does not fail on
# a fresh checkout, and build each path with os.path.join so the result does
# not depend on NPY_OUTFILE ending with a slash.
os.makedirs(NPY_OUTFILE, exist_ok=True)
for out_name, out_array in (
    ('trainHistories', trainHistories),
    ('trainTweets', trainTweets),
    ('testHistories', testHistories),
    ('testTweets', testTweets),
    ('trainClassifications', trainClassifications),
    ('testClassifications', testClassifications),
):
    np.save(os.path.join(NPY_OUTFILE, out_name + '.npy'), out_array)