In [None]:
import csv
import json
import math
import os
from collections import defaultdict

import nltk
import numpy as np
import tweepy
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# We must use the API to determine whether the tweets are protected
from tweepy import TweepError

# Project-local module; the notebook references it as data_handler.DataHandler
import data_handler

In [None]:
# Directory that holds the Harvey data files. Set the HARVEY_DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original author's local location so existing setups are unaffected.
DATA_DIR = os.environ.get('HARVEY_DATA_DIR', '/Users/chloelarkin/Desktop')

UNLABELED_DATA = os.path.join(DATA_DIR, 'harvey_extras.json')
LABELED_DATA = os.path.join(DATA_DIR, 'hurricane_harvey_final_data.json')
CLASSIFICATIONS = os.path.join(DATA_DIR, 'hurricane_harvey_final_data.tsv')

# Module-level lookup tables. Missing keys default to an empty list
# (mmdUserHistories) or an empty string (the other three).
# NOTE(review): presumably keyed by user / tweet id -- they are not populated
# in this part of the notebook, so confirm against the code that fills them.
mmdUserHistories = defaultdict(list)
tweetAuthors = defaultdict(str)
mmdTweetsToText = defaultdict(str)
allTweetsToText = defaultdict(str)

In [None]:
def construct_vectorizer(merged):
    """Fit a TF-IDF vectorizer on the lower-cased text of every tweet.

    Args:
        merged: Iterable of tweet mappings; each must have a 'text' entry
            whose value supports ``.lower()``.

    Returns:
        The fitted ``TfidfVectorizer`` instance.
    """
    corpus = [entry['text'].lower() for entry in merged]
    tfidf = TfidfVectorizer()
    tfidf.fit(corpus)
    return tfidf

In [None]:
def vectorize_histories(histories, vectorizer, svd=None):
    """Turn each tweet history into one dense, SVD-reduced TF-IDF row.

    Each history is flattened into a single document by lower-casing its
    tweets' text and joining them with single spaces. The documents are
    vectorized with ``vectorizer`` and then reduced with truncated SVD.

    Args:
        histories: Iterable of histories, each an iterable of tweet mappings
            with a 'text' entry. Output rows follow the input order.
        vectorizer: Object exposing ``transform(list_of_str)``.
        svd: Optional TruncatedSVD-style reducer. If it is already fitted
            (it has a ``components_`` attribute) the documents are only
            projected with ``svd.transform``; if unfitted it is fitted on
            these documents. Pass the same instance for the train call and
            then the test call so both sets share one basis. When omitted,
            a fresh ``TruncatedSVD(n_components=200, n_iter=7,
            random_state=42)`` is fitted on this call's data, as before.

    Returns:
        numpy.ndarray with one row per history.
    """
    documents = [
        ' '.join(tweet_json['text'].lower() for tweet_json in history)
        for history in histories
    ]
    tfidf_matrix = vectorizer.transform(documents)

    if svd is None:
        svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)

    # Refitting on every call would put each data set in its own basis, so a
    # reducer that has already been fitted is reused rather than fitted again.
    if hasattr(svd, 'components_'):
        reduced = svd.transform(tfidf_matrix)
    else:
        reduced = svd.fit_transform(tfidf_matrix)
    return np.asarray(reduced)

In [None]:
def vectorize_tweets(tweets, vectorizer, svd=None):
    """Turn each tweet into one dense, SVD-reduced TF-IDF row.

    Each tweet's text is lower-cased, vectorized with ``vectorizer`` and then
    reduced with truncated SVD.

    Args:
        tweets: Iterable of tweet mappings with a 'text' entry. Output rows
            follow the input order.
        vectorizer: Object exposing ``transform(list_of_str)``.
        svd: Optional TruncatedSVD-style reducer. If it is already fitted
            (it has a ``components_`` attribute) the tweets are only
            projected with ``svd.transform``; if unfitted it is fitted on
            these tweets. Pass the same instance for the train call and then
            the test call so both sets share one basis. When omitted, a
            fresh ``TruncatedSVD(n_components=200, n_iter=7,
            random_state=42)`` is fitted on this call's data, as before.

    Returns:
        numpy.ndarray with one row per tweet.
    """
    documents = [tweet_json['text'].lower() for tweet_json in tweets]
    tfidf_matrix = vectorizer.transform(documents)

    if svd is None:
        svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)

    # Refitting on every call would put each data set in its own basis, so a
    # reducer that has already been fitted is reused rather than fitted again.
    if hasattr(svd, 'components_'):
        reduced = svd.transform(tfidf_matrix)
    else:
        reduced = svd.fit_transform(tfidf_matrix)
    return np.asarray(reduced)

In [None]:
# Load the three data files and obtain the train/test split.
datahandler = data_handler.DataHandler(
    UNLABELED_DATA,
    LABELED_DATA,
    CLASSIFICATIONS,
)
(
    train_labeled,
    train_histories,
    test_labeled,
    test_histories,
    merged,
    trainClassifications,
    testClassifications,
) = datahandler.get_train_test_split()

# One TF-IDF vocabulary, fitted on the merged tweets, is shared by all four
# feature matrices below.
vectorizer = construct_vectorizer(merged)

# NOTE(review): each call below lets the function fit its own SVD reducer on
# the data it is given, so the train and test matrices are projected onto
# separately fitted bases -- confirm whether a single train-fitted reducer
# should be shared between the train and test calls.
trainHistories = vectorize_histories(train_histories, vectorizer)
trainTweets = vectorize_tweets(train_labeled, vectorizer)
testHistories = vectorize_histories(test_histories, vectorizer)
testTweets = vectorize_tweets(test_labeled, vectorizer)

In [None]:
# Append the class label as the last column of the labeled train and test
# tweet feature matrices (200 SVD features + 1 label column).
# np.hstack accepts a single sequence of arrays and cannot join a 1-D label
# vector onto a 2-D matrix; np.column_stack takes the arrays as one tuple and
# turns the 1-D labels into a column before stacking.
# NOTE: re-running this cell appends another label column each time; re-run
# the vectorization cell first to start from the 200-column matrices.
trainClassifications = np.array(trainClassifications)
trainTweets = np.column_stack((trainTweets, trainClassifications))
testClassifications = np.array(testClassifications)
testTweets = np.column_stack((testTweets, testClassifications))

In [None]:
# Validation checks: report the shape of every feature matrix built above.
for _message, _matrix in (
    ("Shape of train history np array:", trainHistories),
    ("Shape of train tweets np array (should be 201 including classification value):", trainTweets),
    ("Shape of test history np array:", testHistories),
    ("Shape of test tweets np array (should be 201 including classification value):", testTweets),
):
    print(_message, _matrix.shape)