In [1]:
import os
import sys
import string
import pickle
import tarfile
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import ldamodel, Phrases, phrases
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import SnowballStemmer


class classifytweet:
    def __init__(self, model_files='model_files/'):
        """
        tweet_body: the individual tweet
        model_files: the location for the model files. leave the model files folder as is
                     unless you have changed the location of all the model files.
        """
        self.model_files = model_files
        #self.model, self.corpus, self.dictionary, self.phraser, self.valence_mean, self.arousal_mean, self.valence_sd, self.arousal_sd = self.load_model()
        self.load_model()
        self.n = len(self.dictionary.items())
        self.word_map = {v:k for k,v in self.dictionary.items()}
        print("Model items loaded and classifier initialized!")
        
    def load_model(self):
        """
        Loads the model, corpus, and dictionary.
        """
        # extract tarfiles
        for f in os.listdir(self.model_files):
            if f.endswith('.gz'):
                tar = tarfile.open(self.model_files + f, "r:gz")
                tar.extractall(path=self.model_files)
                tar.close()

        # load model, corpus, and dictionary objects
        fnames = [fn for fn in os.listdir(self.model_files) if '.gensim' in fn]
        self.model = ldamodel.LdaModel.load(self.model_files + fnames[0])
        self.corpus = corpora.MmCorpus(self.model_files + 'unigrams_corpus.mm')
        self.dictionary = corpora.Dictionary.load(self.model_files + 'unigrams_dictionary.pkl')
        self.model.id2word = self.dictionary
        self.phraser = phrases.Phrases.load(self.model_files + 'document_phraser.pkl')
        for f in ['unigrams_dictionary.pkl', 'unigrams_corpus.mm', 'unigrams_corpus.mm.index', 'NB_vectorizer.pkl', 'NB_sentiment_model.pkl']:
            fnames.append(f)
        
        # load the valence and arousal arrays
        self.valence_mean = pickle.load(open(self.model_files + 'valence_mean.pkl', 'rb'))
        self.arousal_mean = pickle.load(open(self.model_files + 'arousal_mean.pkl', 'rb'))
        self.valence_sd = pickle.load(open(self.model_files + 'valence_sd.pkl', 'rb'))
        self.arousal_sd = pickle.load(open(self.model_files + 'arousal_sd.pkl', 'rb'))
        
        # load the MinMaxScaler for the transforming the scores
        self.base_outrage_scaler = None
        self.expanded_outrage_scaler = None
        self.valence_scaler = None
        self.arousal_scaler = None
        self.emoji_scaler = None
        self.topic_valence_scaler = pickle.load(open(self.model_files + 'topic_valence_scaled.pkl', 'rb'))
        self.topic_arousal_scaler = pickle.load(open(self.model_files + 'topic_arousal_scaled.pkl', 'rb'))
        
        # load the Naive Bayes sentiment model
        try:
            self.nb_model = pickle.load(open(self.model_files + 'NB_sentiment_model.pkl', 'rb'))
        except:
            self.nb_model = pickle.load(open(self.model_files + 'NB_sentiment_model.pkl', 'rb'), encoding='latin1')
        try:
            self.nb_vectorizer = pickle.load(open(self.model_files + 'NB_vectorizer.pkl', 'rb'))
        except:
            self.nb_vectorizer = pickle.load(open(self.model_files + 'NB_vectorizer.pkl', 'rb'), encoding='latin1')
        
        # load the outrage dictionaries
        self.outrage_list = pd.read_csv(self.model_files + 'outrage_dictionary_stemmed.csv', header=None)
        self.exp_outrage_list = pd.read_csv(self.model_files + 'expanded_outrage_dictionary_stemmed.csv', header=None)

        # cleanup the unzipped files
        for f in fnames:
            os.remove(self.model_files + f)
            
        return "All modeling information loaded"

    def prepare_tweet(self, tweet):
        """
        Turn that unstructured text into sweet, sweet, "cleaned" up tokens!
        """
        self.tweet = tweet
        stemmer = SnowballStemmer("english")
        tokenizer = TweetTokenizer()
        self.tweet_tokenized = tokenizer.tokenize(self.tweet)
        try:
            self.tweet_tokenized = [unicode(y.encode("utf-8"), errors='ignore') for y in self.tweet_tokenized]
            self.stemmed = [stemmer.stem(y) for y in self.tweet_tokenized]
        except:
            #tweet_tokenized = [y.encode("utf-8") for y in tweet_tokenized]
            self.stemmed = [stemmer.stem(y) for y in self.tweet_tokenized]
        #self.stemmed = filter('', self.stemmed)
        #self.processed = ''.join(self.stemmed)
        #self.text_bigrams = [' '.join(self.stemmed[i:]) for i in range(2)]
        self.text_bigrams=list(bigrams(self.stemmed))
        self.text_bigrams=["%s %s" % x for x in self.text_bigrams]
        self.text_bigrams.extend(self.stemmed)

        keep = set(['!','?'])
        stop = set(stopwords.words('english'))
        remove = set([x for x in list(string.punctuation) if x not in keep])
        stop.update(remove)
        stop.update(['',' ','  '])
        stemmed = [d for d in self.stemmed if d not in stop]
        self.phrased = list(self.phraser[[stemmed]])[0]

        print('Phrased representation: "' + ' '.join(self.phrased) + '"')
        #return None
    
    def get_valence_score(self):
        """
        Creates the valence and arousal score for the tweet.
        """
        tweet_arr = np.zeros(self.n)
        for word in set(self.phrased) & set(self.word_map.keys()):
            tweet_arr[self.word_map[word]] = 1.
        mean = tweet_arr * self.valence_mean
        sd = tweet_arr * self.valence_sd
        total_sd = np.sum(sd) * tweet_arr
        with np.errstate(divide='ignore'):
            sd_ratio = total_sd / sd
            sd_ratio[sd == 0] = 0
        sd_weight = sd_ratio / np.sum(sd_ratio)
        
        self.valence_score = np.sum(mean*sd_weight)
        
        return self.valence_score

    def get_arousal_score(self):
        """
        Creates the valence and arousal score for the tweet.
        """
        tweet_arr = np.zeros(self.n)
        for word in set(self.phrased) & set(self.word_map.keys()):
            tweet_arr[self.word_map[word]] = 1.
        mean = tweet_arr * self.arousal_mean
        sd = tweet_arr * self.arousal_sd
        total_sd = np.sum(sd) * tweet_arr
        with np.errstate(divide='ignore'):
            sd_ratio = total_sd / sd
            sd_ratio[sd == 0] = 0
        sd_weight = sd_ratio / np.sum(sd_ratio)
        
        self.arousal_score = np.sum(mean*sd_weight)
        
        return self.arousal_score

    def get_sentiment_score(self):
        """
        Weights the posititive/negative sentiment of the tweet.
        """
        vectorized = self.nb_vectorizer.transform(self.text_bigrams)
        self.sentiment_score = np.average(1 - self.nb_model.predict_proba(vectorized)[:,1])

        return self.sentiment_score

    def get_topics(self):
        """
        Extract the topics from the tweet using the LDA model.
        """
        return self.model.get_document_topics(self.model.id2word.doc2bow(self.phrased), per_word_topics=False)

    def get_emoji_count(self):
        """
        Count the Mad! faces.
        """
        positives = ['<U+0082>', '<U+008D>']
        outrage = ['<U+00A0>', '<U+00A1>', '<U+00A4>', '<U+00A9>']
        positive_score = sum([y in positives for y in self.tweet_tokenized])
        outrage_score = sum([y in outrage for y in self.tweet_tokenized])
        self.emoji_count = outrage_score-positive_score
        return self.emoji_count

    def get_base_outrage_count(self):
        """
        Get the number of outrage words in the tweet.
        """
        self.base_outrage_count = len(set(self.stemmed) & set(self.outrage_list))
        return self.base_outrage_count

    def get_expanded_outrage_count(self):
        """
        Get the number of outrage words in the tweet.
        """
        outrage = set(self.stemmed) & set(self.exp_outrage_list)
        self.expanded_outrage_count = 0
        for i in self.stemmed:
            if i in (self.exp_outrage_list):
                self.expanded_outrage_count += 1

        return self.expanded_outrage_count

    def get_outrage_score(self):
        """
        Uses the results of each of the index measures to create one score.
        .20 outrage dict
        .15 expanded outrage dict
        .15 valence
        .13 arousal
        .11 sentiment
        .10 emoji
        .08 topic valence
        .08 topic arousal
        """
        self.topics = self.get_topics()
        topic_valence_score = 0
        topic_arousal_score = 0
        for tup in self.topics:
            topic_valence_score += self.topic_valence_scaler[tup[0]] * tup[1]
            topic_arousal_score += self.topic_valence_scaler[tup[0]] * tup[1]
            
        scores = np.array([
            self.get_base_outrage_count(),
            self.get_expanded_outrage_count(),
            self.get_valence_score(),
            self.get_arousal_score(),
            self.get_sentiment_score(),
            self.get_emoji_count(),
            topic_valence_score,
            topic_arousal_score
            ])
        weights = np.array([0.2, 0.15, 0.15, 0.13, 0.11, 0.10, 0.08, 0.08])

        self.outrage_meter = np.sum(scores*weights)
        return self.outrage_meter

In [2]:
#import tweetclassifier
from time import time
import datetime

t0 = time()
tweeter = classifytweet()
elapsed = time() - t0
print("Took %s to initialize." % (str(datetime.timedelta(seconds=elapsed))))

Model items loaded and classifier initialized!
Took 0:00:21.018352 to initialize.


In [3]:
tweeter.prepare_tweet('"When scientists say bears are going extinct, I want people to realize what it looks like," says photographer Paul Nicklen')

Phrased representation: "scientist_say bear go extinct want peopl realiz look_like say photograph paul nicklen"


In [4]:
tweeter.get_valence_score()



5.7438875653772072

In [5]:
tweeter.get_arousal_score()



4.3143414353251792

In [6]:
tweeter.get_topics()

[(3, 0.088334416350984671),
 (18, 0.09443194166093366),
 (30, 0.65034827955069929),
 (33, 0.089107584659602848)]

In [7]:
for tup in tweeter.get_topics():
    print("Topic",tup[0],tweeter.model.print_topic(tup[0],topn=15))

Topic 3 0.037*"wed_dress" + 0.033*"climat_chang" + 0.020*"greatest_threat" + 0.018*"ted_cruz" + 0.013*"hot" + 0.009*"..." + 0.008*"level" + 0.008*"ocean" + 0.008*"disast" + 0.008*"messag" + 0.007*"new" + 0.007*"#ourstolos" + 0.006*"greatest" + 0.006*"alert" + 0.006*"manag"
Topic 18 0.094*"climat_chang" + 0.039*"link" + 0.018*"worri" + 0.011*"religion" + 0.010*"terror" + 0.010*"pay" + 0.009*"contribut" + 0.009*"@sensand" + 0.009*"evid" + 0.008*"stop" + 0.008*"..." + 0.007*"scam" + 0.007*"project" + 0.006*"side" + 0.006*"corpor"
Topic 30 0.025*"marriag" + 0.020*"..." + 0.018*"like" + 0.016*"i'm" + 0.014*"one" + 0.013*"get" + 0.013*"don't" + 0.012*"go" + 0.009*"peopl" + 0.009*"know" + 0.009*"think" + 0.008*"thing" + 0.008*"would" + 0.008*"becaus" + 0.007*"guy"
Topic 33 0.025*"women" + 0.023*"climat_chang" + 0.017*"#uniteblu_#tcot" + 0.015*"#scienc" + 0.012*"..." + 0.008*"marriag" + 0.007*"anniversari" + 0.007*"bet" + 0.007*"need" + 0.005*"inspir_action" + 0.005*"rt_song" + 0.005*"vid_#sta

In [8]:
tweeter.get_sentiment_score()

0.49762867844555025

In [9]:
tweeter.get_emoji_count()

0

In [10]:
tweeter.get_base_outrage_count()

0

In [11]:
tweeter.get_expanded_outrage_count()

0

In [12]:
tweeter.get_outrage_score()



1.5889777124511231

In [13]:
[(float(x.split('*')[0]),x.split('*')[1][1:-1]) for x in tweeter.model.print_topics(num_topics=60, num_words=30)[0][1].split(" + ")]

[(0.332, 'global_warm'),
 (0.013, 'new_york'),
 (0.013, '...'),
 (0.009, 'via_@dailycal'),
 (0.009, 'time'),
 (0.008, 'caus'),
 (0.007, 'predict'),
 (0.005, 'hoax'),
 (0.004, 'climat_chang'),
 (0.004, 'theori'),
 (0.004, 'excit'),
 (0.004, '#isi'),
 (0.004, 'obama'),
 (0.004, '#tcot'),
 (0.004, 'antarct_ice'),
 (0.003, '#forecast'),
 (0.003, 'develop'),
 (0.003, 'say'),
 (0.003, 'claim'),
 (0.003, 'crowd'),
 (0.003, 'trend'),
 (0.003, 'summer'),
 (0.003, 'gain_ice'),
 (0.003, 'send_forc'),
 (0.003, 'metorologist_fight'),
 (0.003, 'https://t.co/gprzfbifbm_obama'),
 (0.003, 'new'),
 (0.003, 'denial'),
 (0.003, 'enrich_famili'),
 (0.003, 'speed')]