In [370]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [371]:
df = pd.read_csv("data/train/training_data.csv")

In [372]:
tweets=df.tweet

## Davidson Feature Generation

In [373]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = tweet.split() #[stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
    )

In [338]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [339]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [340]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )

In [341]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [233]:
def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

## Preprocess for slang

In [234]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [235]:
def load_slang_dict():
    slang_dict = {}
    with open("slang_to_words.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('\t')
            #print(tokens[1])
            slang_dict[tokens[1]] = tokens[0]
    return slang_dict
slang_dict_one = load_slang_dict()
#slang_dict

In [236]:
def load_slang_two_dict():
    slang_dict_two = {}
    with open("noslangdotcom.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split(':')
            #print(tokens[1])
            slang_dict_two[tokens[0]] = tokens[1]
    return slang_dict_two
slang_dict_two = load_slang_two_dict()
#slang_dict_two


In [237]:
def load_slang_three_dict():
    slang_dict_three = {}
    with open("internet_slangsDotNet.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('==')
            slang_dict_three[tokens[0]] = tokens[1]
            #print("first ", tokens[0], "second ", tokens[1])
    return slang_dict_three
slang_dict_three = load_slang_three_dict()

In [238]:
def load_slang_four():
    slang_dict_four = {}
    with open("common_twitter_abbreviations.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('=')
            slang_dict_four[tokens[0]] = tokens[1]
    return slang_dict_four
slang_dict_four = load_slang_four()

In [239]:
def merge_dicts(*dict_args):
    """
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    """
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

slang_dict = merge_dicts(slang_dict_one, slang_dict_two, slang_dict_three, slang_dict_four)

## Replace slang with definitions

In [240]:
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer()

In [241]:
def slang_sentiment(text):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return senti

def positive(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def negative(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def objective(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def remove_slang(text):
    s = tknzr.tokenize(text)
    soFar = ''
    
    for word in s: 
        if word.lower() in slang_dict:
            soFar += slang_dict[word.lower()] + ' '
        else:
            soFar += word  + ' ' 
    return soFar.split(' ')

def get_sentiment_text(strList):
    text = ' '.join(strList)
    pos_values = nltk.pos_tag(text)
    pos_senti = []
    for (x, y) in pos_values:
        if len(get_sentiment(x,y)) > 1:
            pos_senti.append(get_sentiment(x,y))
        else: 
            pos_senti.append([0, 0, 0])       
    return pos_senti
        
def get_pos(text, indicator):
    x = 0
    pos = get_sentiment_text(text)
    for v in pos:
        x +=  v[indicator]
    return x

In [242]:
from nltk.corpus import wordnet as wn

In [243]:
def penn_to_wn(tag):
#Convert between the PennTreebank tags to simple Wordnet tags"""
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

In [244]:
def get_sentiment(word,tag):
#""" returns list of pos neg and objective score. But returns empty list if not present in senti wordnet. """
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []
    
    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]

In [245]:
data = pd.read_csv('SentiWordNet_3.0.0.txt', sep='\t', header=None)
data.columns = ["POS","ID","PosScore","NegScore","SynsetTerms","Gloss"]

Check for quotes

In [246]:
def contains_quotes(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word == '"' or word == "'" else 0, s))
    if score > 0: 
        return 1 
    return 0

Check if self-referential

In [247]:
ethnic_groups = []
with open('ethnic_groups_and_common_slurs.txt', 'r') as fileinput:
    for line in fileinput:
        ethnic_groups.append((line.split('\n'))[0].lower())


#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this']
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want', 'wants', 'are']
reclaiming = ['proud', 'reclaim', 'reclaming', 'offensive', 'like']
me = ['i\'m', 'we', 'i', 'me', 'this']

def contains_target_self_referential(text):
    words = tknzr.tokenize(text)
                
    #check word in ethnic_groups comes before word in me   
    #e.g. the beaner in me forgets I like beans
    for word in ethnic_groups:
        if word in words[0:]:
            for key in me:
                if key in words[words.index(word):]:
                    return 1
    
    #check if word in me comes before word in reclaiming
    #e.g. i'm a proud beaner
    for key in me:
        if key in words[0:]:
            for word in reclaiming:
                if word in words[words.index(key):]:
                    return 1
                #check if word in me comes before word in ethnic_groups
            for word in ethnic_groups:
                if word in words[words.index(key):]:
                    return 1
            #check if word in me comes before word in ethnic_groups
            #e.g. We beaners have to stick together
            for word in ethnic_groups:
                if word in words[words.index(key):]:
                    return 1
     #check if word in reclaiming comes after modality 
    #e.g. all beaners should go home is offensive
    for key in modality:
        if key in words[0:]:
            for word in reclaiming:
                if word in words[words.index(key):]:
                    return 1
    return 0

Offensive to women/words that hurt

In [248]:
words_that_hurt = {
    'bitch': 'Targets and dehumanizes women, even if used toward men, including queer and gay men. Devalues women and femininity. Reinforces sexism.',
    'ghetto' :'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'ratchett':'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'illegal alien': 'Reduces undocumented immigrants to something less than human. Fixates on legal status instead of people as individuals. Asserts that some people belong here more than others do. Ignores political, social, and economic factors that impact people of color.',
    'no homo': 'Stresses the speaker\'s heterosexuality, masculinity, and/or other traits to avoid being perceived as LGBTQIA. Goes to great lengths to avoid association with anything queer. Reinforces that to be LGBTQIA is bad.',
    'retarded': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'retard': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'lame': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'crazy':'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'dumb': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'that\'s so gay': 'Stigmatizes gay and queer people. Uses their identities to describe something as undesirable and bad. Replaces negative adjectives with words related to LGBTQIA identities.',
    'whore': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'ho': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'slut': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'Bisexuality doesn\'t really exist. People are just gay or straight.': 'This denies the fluidity of sexuality and dismisses people\'s experiences and definitions of self. People deserve the right to define their own identities any way they wish and have those definitions honored.',
    'i think everyone is bisexual': 'While this is often meant to acknowledge the fluidity of sexuality, it dismisses the reality of people who identify as bisexual and erases their experiences. It also invalidates the self-identifications of non-bisexual people.',
    'You\'re too femme to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'You\'re too butch to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'Bisexual people just want straight privilege':'Bisexual people experience discrimination within straight communities and lesbian/gay communities. They never fully experience straight privilege because they do not identify as straight. Often their identities are made invisible and denied.',
    'Bisexual people are just greedy and want to have sex with everyone.':'This stereotypes bisexual people and assumes they are all promiscuous - and that this is a bad thing. It creates negative attitudes toward sex and works against creating a sex positive climate. It also demonstrates an underlying belief that bisexuality is only about behavior and is not a legitimate identity.',
    'Who do you see yourself ending up with?':'This is another way of implying one has to "end up" gay or straight and ignores bisexuality as an identity versus a relationship status. It also assumes everyone desires to be in a long-term monogamous relationship.',
    'Tranny':'Whether or not someone identifies as trans*, calling anyone "tranny" is extremely offensive. While some folks within the trans* community may choose to reclaim this word for themselves, it is not a word that is okay to use to label another person or use as a joke.',
    'That person doesn\'t really look like a woman':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'That person doesn\'t really look like a man':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'What is your REAL name? I mean the one you were given at birth':'This implies that the person\'s gender identity and chosen name are not "real" and perpetuates the idea of Trans people as deceptive. It removes agency and any right to make decisions for themselves, and is incredibly invalidating. It presumes a right to intimate information, disregards privacy, and places Trans lives on public display.',
    'He-She':'This hyphenated term is demeaning and invalidates an individual\'s identity and the pronouns that they use.',
    'What are you REALLY? Have you had surgery?': 'Asking anyone personal questions about their bodies and/or surgeries is invasive and inappropriate. We don\'t ask cisgender people about what is under their clothes; we shouldn\'t ask Trans* people either.',
    'cunt':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'twat':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'pussy':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'thot':'Word created to express womxn or people who are sexually promiscuous. There are speculations that the word comes from the KKK organization that referred to Black women who were forced into prostitution (i.e. Sarah Baartman: Hottentot).',
    'ugly':'Word used to put down someone for the way they look, can be connected back to white supremacist, ableist, sizeist standards of beauty.',
    'you guys':'Erases the identities of people who are in the room. Generalizing a group of people to be masculine.',
    'I\'m being such a fat-ass':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.',
    'I\'m being so fat right now!':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.'
}

hurtfulWords = list(words_that_hurt.keys())

In [249]:
#Binary Feature #6 1) ID tweets with female pronouns 2) Check if these words are in the tweet 

#these words are used disproportionately often against women
#the behaviour they describe often goes unremarked in men.
#source: http://sacraparental.com/2016/05/14/everyday-misogyny-122-subtly-sexist-words-women/
#EVERYDAY MISOGYNY: 122 SUBTLY SEXIST WORDS ABOUT WOMEN (AND WHAT TO DO ABOUT THEM)
female_and_nongender_Pronouns = set(['you','she','its','their','yours',
                                    'her', 'it', 'they', 'them',
                                    'yourself', 'herself', 'themselves',
                                    'your','hers'])

pronouns = {'I': ('personal', True, 'first'),
 'me': ('personal', True, 'first'),
 'we': ('personal', False, 'first'),
 'us': ('personal', False, 'first'),
 'you': ('personal', False, 'second'),
 'she': ('personal', True, 'third'),
 'he': ('personal', True, 'third'),
 'her': ('possessive', True, 'third'),
 'him': ('personal', True, 'third'),
 'it': ('personal', True, 'third'),
 'they': ('personal', False, 'third'),
 'them': ('personal', False, 'third'),
 'myself': ('reflexive', True, 'first'),
 'ourselves': ('reflexive', False, 'first'),
 'yourself': ('reflexive', True, 'second'),
 'yourselves': ('reflexive', False, 'second'),
 'himself': ('reflexive', True, 'third'),
 'herself': ('reflexive', True, 'third'),
 'itself': ('reflexive', True, 'third'),
 'themselves': ('reflexive', False, 'third'),'my': ('possessive', True, 'first'),
 'your': ('possessive', False, 'second'),
 'his': ('possessive', True, 'third'),
 'hers': ('possessive', True, 'third'),
 'its': ('possessive', True, 'third'),
 'our': ('possessive', False, 'first'),
 'their': ('possessive', False, 'third'),
 'mine': ('possessive', True, 'first'),
 'yours': ('possessive', False, 'second'),
 'ours': ('possessive', False, 'first')}

female_offensive = ['bossy', 'abrasive', 'ball-buster', 'aggressive', 
'shrill', 'bolshy', 'intense', 'stroppy', 'forward', 
'mannish', 'gossipy', 'Dramatic', 'Drama Queen', 'Catty', 
'Bitchy', 'Nag', 'Cold', 'Ice queen', 'Shrew', 'Humourless',
'Man-hater', 'Banshee', 'Fishwife', 'Lippy', 'Ditzy', 'Feminazi', 
'militant feminist', 'Bridezilla', 'Diva', 'Prima donna', 'Blonde moment',
'Feisty', 'Supermum','Working mother', 'Career woman', 'Yummy mummy', 'Little old lady', 
'WAHM', 'Slut', 'Trollop','Frigid','Easy','Tease','Loose','Man-eater','Cougar',
'Asking for it','prude','the town bike', 'Mutton dressed as lamb','Slutty','Curvy','Mumsy',
'Cheap','That dress is flattering','Frumpy','Let herself go','Faded beauty','Mousey',
 'Plus-size','Clotheshorse','Brunette ','Ladylike','Bubbly','Vivacious','Flirty',
'Sassy','Chatty','Demure','Modest','Emotional','Hysterical','Hormonal',
'Menstrual ',' pre-menstrual ','Flaky','Moody','Over-sensitive',
'Clucky','Neurotic','Irrational','Baby brain','Baby weight','Mummy blogger',
'Female engineer','That’s good, for a girl','Like a girl','run like a girl', 
'throw like a girl','Mumpreneur','Spinster','Barren','She wears the pants','Housewife',
'Houseproud','Soccer mom','Mistress','Kept woman','Incompetent cervix',
'Failure to progress','Elderly primagravida','Irritable uterus','Tomboy',
'Girly','a girly girl','Little lady','Jail-bait','Heart-breaker','pretty little thing','Catfight','Mommy wars','Caring','Compassionate','Hard-working',
'Conscientious','Dependable','Diligent','Dedicated','Tactful','Interpersonal','Warm',
'Helpful','Maternal', 'Princess', 'Heart-breaker']
#most tweeted to Megyn Kelly by Trump and trump supporters
#https://www.vox.com/2016/1/27/10852876/donald-trump-supporters-sexist-tweets-megyn-kelly
trump_suppporters_megynKelly = ["ugly", "cheap", 'bitch', 'whore', 'bimbo',
                                'cunt', 'hooker', 'slut', 'skank']
others = ['hoe', 'pussy', 'bitches', 'fatty', 'fatass', 'fat-ass']
offsensive_words_toward_women = female_offensive + trump_suppporters_megynKelly + others + hurtfulWords

In [250]:
female_offensive_words = set()
for word in offsensive_words_toward_women:
    female_offensive_words.add(word.lower())
#female_offensive_words

def check_offensive_to_women(text):
    #split tweet by white space and make lower case
    li = set([word.lower() for word in text.split()]) 
    isFemale = female_and_nongender_Pronouns.intersection(li)
    if len(isFemale) == 0:
        return 0
    isOffensive = female_offensive_words.intersection(li)
    if isOffensive:
        return len(isOffensive)
    return 0

NRC emotions

In [251]:
nrc_emotions_df = pd.read_csv("nrc_emotions.csv")

In [252]:
anger = nrc_emotions_df.loc[nrc_emotions_df['anger']][['term']].values
anticipation = nrc_emotions_df.loc[nrc_emotions_df['anticipation']][['term']].values
disgust = nrc_emotions_df.loc[nrc_emotions_df['disgust']][['term']].values
fear = nrc_emotions_df.loc[nrc_emotions_df['fear']][['term']].values
joy = nrc_emotions_df.loc[nrc_emotions_df['joy']][['term']].values
sadness = nrc_emotions_df.loc[nrc_emotions_df['sadness']][['term']].values
surprise = nrc_emotions_df.loc[nrc_emotions_df['surprise']][['term']].values
trust = nrc_emotions_df.loc[nrc_emotions_df['trust']][['term']].values

In [253]:
def anger_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anger else 0, s))
    return score

In [254]:
def anticipation_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anticipation else 0, s))
    return score

In [255]:
def disgust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in disgust else 0, s))
    return score

In [256]:
def joy_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in joy else 0, s))
    return score

In [257]:
def fear_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in fear else 0, s))
    return score

In [258]:
def sadness_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in sadness else 0, s))
    return score

In [259]:
def surprise_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in surprise else 0, s))
    return score

In [260]:
def trust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in trust else 0, s))
    return score

In [261]:
#groups = open('groups.txt','r').read().split('\n')
ethnic_groups = []
with open('ethnic_groups_and_common_slurs.txt', 'r') as fileinput:
    for line in fileinput:
        ethnic_groups.append((line.split('\n'))[0].lower())
#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this', ]
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want']

In [262]:
#If tweet contains a targeted statement referring to a certain group, i.e. "all you Asians" or "every Mexican"
#also checks if a group word is followed by some sort of modal verb

def contains_target(text):
    s = tknzr.tokenize(text)
    
    for i in range(len(s)):
        if s[i].lower() in targets:
            if i != len(s)-1:
                if s[i+1].lower() in ethnic_groups:
                    return 1
            
        elif s[i].lower() in ethnic_groups:
            if i != len(s)-1:
                if s[i+1].lower() in modality:
                    return 1            
        return 0
    

In [266]:
def other_features_base(tweet):
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features
    
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    #sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    #Our features
    text_only = preprocess(tweet) #Get text only
    words = remove_slang(text_only) #replace slang/abbreviations with full words 
    
    senti = slang_sentiment(text_only)
    pos = positive(tweet, 0)
    neg = negative(tweet, 1)
    obj = objective(tweet, 2)
    
    no_slang_str = ''.join(words)
    trustCount = trust_count(no_slang_str)
    surpriseCount = surprise_count(no_slang_str)
    sadnessCount = sadness_count(no_slang_str)
    fearCount = fear_count(no_slang_str)
    joyCount = joy_count(no_slang_str)
    disgustCount = disgust_count(no_slang_str)
    anticipationCount = anticipation_count(no_slang_str)
    angerCount = anger_count(no_slang_str)
    isSelfReferential = contains_target_self_referential(no_slang_str)
    hasQuotes = contains_quotes(tweet)
    targeted = contains_target(text_only)
    immigrant_ref = 0
    if text_only.find('immigrant') or text_only.find('immigrants'):
        immigrant_ref = 1
    isOffensiveToWomen = check_offensive_to_women(tweet)
    
    #Davidson features
    syllables = textstat.syllable_count(text_only)
    num_chars = sum(len(w) for w in text_only)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(tweet.split())
    #avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(text_only.split()))
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
        
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, targeted, immigrant_ref, isOffensiveToWomen,
                trustCount, surpriseCount, sadnessCount, angerCount, fearCount, 
                joyCount, disgustCount, anticipationCount, isSelfReferential, hasQuotes, pos, neg, obj]
   
    return features

def get_feature_array(tweets, base):
    feats=[]
    for t in tweets:
        if base:
            feats.append(other_features_base(t))
        else:
            feats.append(other_features(t))
    return np.array(feats)

In [264]:
other_features_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet", "targeted", "immigrant_ref", "isOffensiveToWomen",
                "trustCount", "surpriseCount", "sadnessCount", "angerCount", "fearCount", 
              "joyCount", "disgustCount", "anticipationCount", "isSelfReferential", "hasQuotes", "pos", "neg", "obj"]

other_features_base_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet"]

In [268]:
base_feats = get_feature_array(tweets, True)

In [269]:
hand_built_feats = get_feature_array(tweets, False)

## Flair

In [None]:
from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, CharacterEmbeddings, StackedEmbeddings, FlairEmbeddings, BertEmbeddings
import torch

In [None]:
#stack word-level twitter embeddings and forward/backward flair sentence embeddings
news_forward = FlairEmbeddings('news-forward-fast')
news_backward = FlairEmbeddings('news-backward-fast')
twitter = WordEmbeddings('twitter')
bert = BertEmbeddings('bert-base-uncased')
#elmo = ELMoEmbeddings('small')

In [None]:
from flair.data import Sentence

Create embedding for tweets by getting token-level embeddings from stacked embedding

In [None]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
        
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [296]:
def embed_tweets(model, row_len, tweets):
    embeddings = np.empty([len(tweets), row_len])
    num = 0
    total = len(tweets)
    for tweet in tweets:
        s = Sentence(tweet)
        model.embed(s)
        flattened = np.array(s.get_embedding().detach()).flatten()
        #print(flattened.shape[0])
        embeddings[num] = (flattened)
        num+=1
        update_progress(num / total)

    update_progress(1)
    return embeddings

In [21]:
#tweet_embed = DocumentPoolEmbeddings([twitter])

In [270]:
#%%time
#tweet_embeddings = embed_tweets(tweet_embed,100)
#np.savetxt('tweet_embeddings.txt', tweet_embeddings)

tweet_embeddings = np.loadtxt('tweet_embeddings.txt')

In [8]:
#bert_embed = DocumentPoolEmbeddings([bert])

In [368]:
#%%time
#bert_embeddings = embed_tweets(bert_embed, 3072)
#np.savetxt('bert_embeddings.txt', bert_embeddings)

#bert_embeddings = np.loadtxt('bert_embeddings.txt')

In [380]:
bert_news_twitter_embed = DocumentPoolEmbeddings([news_forward, news_backward, bert, twitter])

In [381]:
#bert_news_twitter_embeddings = embed_tweets(bert_news_twitter_embed, 5220)
#np.savetxt('bert_news_twitter_embeddings.txt', bert_news_twitter_embeddings)

bert_news_twitter_embeddings = np.loadtxt('bert_news_twitter_embeddings.txt')

## Train Models with features

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

Flair embeddings

In [407]:
#Now join them all up
M1 = np.concatenate([tfidf,pos,base_feats,bert_news_twitter_embeddings],axis=1)

In [271]:
#M2 = np.concatenate([tfidf,pos,feats,bert_embeddings],axis=1)
M3 = np.concatenate([tfidf,pos,base_feats,tweet_embeddings],axis=1)

Hand-built features

In [272]:
M4 = np.concatenate([tfidf,pos,hand_built_feats],axis=1)

Combine hand-built and twitter embeddings

In [277]:
M5 = np.concatenate([tfidf,pos,hand_built_feats,tweet_embeddings],axis=1)

In [395]:
M6 = np.concatenate([tfidf,pos,hand_built_feats,bert_news_twitter_embeddings],axis=1)

## Train 

In [391]:
def train_model(M):
    X = pd.DataFrame(M)
    y = df['class'].astype(int)
    pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])
    param_grid = [{}] # Optionally add parameters here
    grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv=StratifiedKFold(n_splits=5, 
                                              random_state=42).split(X, y), 
                           verbose=2)
    model = grid_search.fit(X, y)
    return model

In [392]:
tweet_only_LR = train_model(M3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  26.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.6s remaining:    0.0s


[CV] ................................................. , total=  12.4s
[CV]  ................................................................
[CV] ................................................. , total=  11.4s
[CV]  ................................................................
[CV] ................................................. , total=  13.2s
[CV]  ................................................................
[CV] ................................................. , total=  11.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [None]:
# np.savetxt("tweet_only_true.txt", tweet_only_true)
# np.savetxt("tweet_only_pred.txt", tweet_only_pred)

In [None]:
#bert_only_true, bert_only_pred, bert_only_model = train_model(M2)

In [None]:
# np.savetxt("bert_only_true.txt", bert_only_true)
# np.savetxt("bert_only_pred.txt", bert_only_pred)

In [None]:
bert_news_twitter_model = train_model(M1)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=29.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 29.2min remaining:    0.0s


[CV]  ................................................................
[CV] ................................................. , total=46.6min
[CV]  ................................................................
[CV] ................................................. , total=16.8min
[CV]  ................................................................
[CV] ................................................. , total=26.3min
[CV]  ................................................................
[CV] ................................................. , total=28.0min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 147.2min finished


In [None]:
# np.savetxt("bert_news_twitter_true.txt", bert_news_twitter_true)
# np.savetxt("bert_news_twitter_pred.txt", bert_news_twitter_pred)

In [393]:
hand_built_model = train_model(M4)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  16.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.5s remaining:    0.0s


[CV] ................................................. , total=  10.5s
[CV]  ................................................................
[CV] ................................................. , total=   9.3s
[CV]  ................................................................
[CV] ................................................. , total=   9.0s
[CV]  ................................................................
[CV] ................................................. , total=  10.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   56.2s finished


In [None]:
# np.savetxt("hand_built_true.txt", hand_built_true)
# np.savetxt("hand_built_pred.txt", hand_built_pred)

In [394]:
combined_model = train_model(M5)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  41.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.9s remaining:    0.0s


[CV] ................................................. , total=  12.6s
[CV]  ................................................................
[CV] ................................................. , total=  11.7s
[CV]  ................................................................
[CV] ................................................. , total=  10.5s
[CV]  ................................................................
[CV] ................................................. , total=  13.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.5min finished


In [420]:
bnt_hb_combined_model = train_model(M6)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=34.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 34.3min remaining:    0.0s


[CV]  ................................................................
[CV] ................................................. , total=25.8min
[CV]  ................................................................
[CV] ................................................. , total=41.0min
[CV]  ................................................................
[CV] ................................................. , total=20.9min
[CV]  ................................................................
[CV] ................................................. , total=29.7min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 151.8min finished


## Training Evaluation

In [50]:
all_tweets = df[['tweet', 'class']]

In [451]:
def evaluate(y_true, y_preds, tweet):
    report = classification_report( y_true, y_preds )
    print(report)
    
    misses = np.where(np.asarray(y_true) != y_preds)
    missed_preds = []
    for i in range(len(y_true)):
        if np.asarray(y_true)[i] != y_preds[i]:
            missed_preds.append(y_preds[i])


    missed = [list(y_true.index)[i] for i in misses[0]]
    missed_tweets = tweet.iloc[missed]
    missed_tweets.loc[:,'prediction'] = missed_preds
    
    corrects = np.where(np.asarray(y_true) == y_preds)
    correct_preds = []
    for i in range(len(y_true)):
        if np.asarray(y_true)[i] == y_preds[i]:
            correct_preds.append(y_preds[i])


    correct = [list(y_true.index)[i] for i in corrects[0]]
    correct_tweets = tweet.iloc[correct]
    correct_tweets.loc[:,'prediction'] = correct_preds
    return missed_tweets, correct_tweets

Tweet sentence embeddings

In [431]:
tweet_missed = evaluate(tweet_only_true, tweet_only_pred, all_tweets)

              precision    recall  f1-score   support

           0       0.30      0.44      0.36       104
           1       0.94      0.88      0.91      1507
           2       0.75      0.83      0.79       364

   micro avg       0.85      0.85      0.85      1975
   macro avg       0.66      0.72      0.68      1975
weighted avg       0.87      0.85      0.86      1975



BERT embeddings

In [65]:
bert_missed = evaluate(bert_only_true, bert_only_pred, all_tweets)

              precision    recall  f1-score   support

           0       0.31      0.44      0.37       104
           1       0.94      0.89      0.91      1507
           2       0.77      0.84      0.80       364

   micro avg       0.86      0.86      0.86      1975
   macro avg       0.67      0.72      0.69      1975
weighted avg       0.87      0.86      0.87      1975



Combined BERT, news, and tweet embeddings

In [66]:
bert_news_twitter_missed = evaluate(bert_news_twitter_true, bert_news_twitter_pred)

              precision    recall  f1-score   support

           0       0.34      0.45      0.39       104
           1       0.94      0.89      0.92      1507
           2       0.78      0.86      0.82       364

   micro avg       0.86      0.86      0.86      1975
   macro avg       0.69      0.73      0.71      1975
weighted avg       0.88      0.86      0.87      1975



In [215]:
hand_built_missed = evaluate(hand_built_true, hand_built_pred)

              precision    recall  f1-score   support

           0       0.27      0.41      0.33       104
           1       0.92      0.86      0.89      1507
           2       0.68      0.77      0.72       364

   micro avg       0.82      0.82      0.82      1975
   macro avg       0.62      0.68      0.65      1975
weighted avg       0.84      0.82      0.83      1975



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
both_missed = evaluate(combined_true, combined_pred)

## Run on test set

In [309]:
testing = pd.read_csv("data/test/testing_data.csv")
dev = pd.read_csv("data/dev/development_data.csv") #dev wasn't used in training
test = pd.concat([testing, dev], sort=False)

In [310]:
y_test = test['class'].astype(int)

In [311]:
test_tweets = test['tweet']

In [345]:
#use transform instead of fit_transform to get vector in same space as training data
test_tfidf = vectorizer.transform(test_tweets).toarray() 

In [346]:
test_tweet_tags = []
for t in test_tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    test_tweet_tags.append(tag_str)

In [347]:
test_pos = pos_vectorizer.transform(pd.Series(test_tweet_tags)).toarray()

In [319]:
base_test_feats = get_feature_array(test_tweets, True)

In [320]:
hand_built_test_feats = get_feature_array(test_tweets, False)

Test tweet only embeddings

In [295]:
tweet_embed = DocumentPoolEmbeddings([twitter])

In [297]:
test_tweet_embeddings = embed_tweets(tweet_embed,100, test_tweets)

Progress: [####################] 100.0%


In [360]:
test_M3 = np.concatenate([test_tfidf,test_pos,base_test_feats,test_tweet_embeddings],axis=1)

In [362]:
tweet_only_preds = tweet_only_LR.predict(test_M3)

Test hand-built features

In [350]:
test_M4 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats],axis=1)

In [353]:
hand_built_preds = hand_built_model.predict(test_M4)

Test combined hand-built and tweet embeddings

In [354]:
test_M5 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats,test_tweet_embeddings],axis=1)

In [356]:
combined_preds = combined_model.predict(test_M5)

Bert, news, and twitter embeddings

In [None]:
test_bert_news_twitter_embeddings = embed_tweets(bert_news_twitter_embed, 5220, test_tweets)

Progress: [################----] 78.6%


In [409]:
#test_bert_news_twitter_embeddings.shape, bert_news_twitter_embeddings.shape,

In [410]:
#test_tfidf.shape, tfidf.shape,test_pos.shape, pos.shape,base_test_feats.shape, base_feats.shape

In [401]:
test_M1 = np.concatenate([test_tfidf,test_pos,base_test_feats,test_bert_news_twitter_embeddings],axis=1)

In [408]:
test_M1.shape, M1.shape

((4937, 9046), (19746, 9046))

In [414]:
bert_news_twitter_preds = bert_news_twitter_model.predict(test_M1)

Bert, news, and twitter embeddings with hand-built features

In [421]:
test_M6 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats,test_bert_news_twitter_embeddings ],axis=1)

In [423]:
bnt_hb_combined_preds = bnt_hb_combined_model.predict(test_M6)

## Test Evaluation

In [433]:
testing_tweets = test[['tweet','class']]

In [452]:
test_tweet_only_missed, test_tweet_only_correct = evaluate(y_test, tweet_only_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.25      0.35      0.29       265
           1       0.92      0.87      0.90      3875
           2       0.68      0.75      0.71       797

   micro avg       0.83      0.83      0.83      4937
   macro avg       0.62      0.66      0.63      4937
weighted avg       0.84      0.83      0.83      4937



In [454]:
test_hand_built_missed, test_hand_built_correct = evaluate(y_test, hand_built_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.24      0.36      0.29       265
           1       0.91      0.86      0.89      3875
           2       0.65      0.71      0.68       797

   micro avg       0.81      0.81      0.81      4937
   macro avg       0.60      0.64      0.62      4937
weighted avg       0.83      0.81      0.82      4937



In [455]:
test_combined_missed, test_combined_correct = evaluate(y_test, combined_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.27      0.38      0.32       265
           1       0.92      0.87      0.90      3875
           2       0.68      0.75      0.71       797

   micro avg       0.83      0.83      0.83      4937
   macro avg       0.62      0.67      0.64      4937
weighted avg       0.85      0.83      0.84      4937



In [456]:
test_bert_news_twitter_missed, test_bert_news_twitter_correct = evaluate(y_test, bert_news_twitter_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.31      0.47      0.38       265
           1       0.95      0.88      0.91      3875
           2       0.74      0.85      0.79       797

   micro avg       0.86      0.86      0.86      4937
   macro avg       0.67      0.74      0.69      4937
weighted avg       0.88      0.86      0.87      4937



In [457]:
test_bnt_hb_combined_missed, test_bnt_hb_combined_correct = evaluate(y_test, bnt_hb_combined_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.31      0.47      0.38       265
           1       0.95      0.89      0.92      3875
           2       0.74      0.84      0.79       797

   micro avg       0.86      0.86      0.86      4937
   macro avg       0.67      0.73      0.69      4937
weighted avg       0.88      0.86      0.87      4937



In [377]:
combined_missed.to_csv("test_tweet_hand_combined_missed.csv", sep='\t')

In [378]:
test_hand_built_missed.to_csv("test_hand_built_missed.csv", sep='\t')

In [379]:
test_tweet_only_missed.to_csv("test_tweet_only_missed.csv", sep='\t')

In [419]:
test_bert_news_twitter_missed.to_csv("test_bert_news_twitter_missed.csv", sep='\t')

In [425]:
test_bnt_hb_combined_missed.to_csv("test_bnt_hb_combined_missed.csv", sep='\t')

In [460]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 700)

In [510]:
test_bnt_hb_combined_correct_hate = test_bnt_hb_combined_correct.loc[test_bnt_hb_combined_correct['class']==2][['tweet']]


In [548]:
test_bnt_hb_combined_correct_hate.columns

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [511]:
test_bnt_hb_combined_correct_hate

Unnamed: 0,tweet
18,RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO
35,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9
58,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf"
63,Watching #Steelers colts getting blinded by the Steelers uniys look like bunch of bumblebees
65,RT @netflix: Here's your first look at Charlie Cox as Matt Murdock in #Marvel's @Daredevil on #Netflix #NYCC http://t.co/SIYuCgok8J
67,It's a matter of pure weight ratios... A five ounce bird can not carry a one pound coconut!
78,Ravioli stuffed wit lobster nd a nice stuffed clam #eating betterthanamobster http://t.co/PzdoESaR
80,@AB_crispyy_andy I laugh Oreos and lady gaga on a Sunday day
100,#Yankees Damn. Well Joe that move to the bullpen really helped.
103,Next time you leave magic trash in my jacket try not to have your initials written on them. @whatupag http://t.co/tFV1cZtr


In [476]:
test_bnt_hb_combined_missed_hate = test_bnt_hb_combined_missed.loc[test_bnt_hb_combined_missed['class']==2][['tweet']]


In [477]:
#test_bnt_hb_combined_missed.to_csv("test_bnt_hb_combined_missed.csv", sep='\t')

In [478]:
#test_bnt_hb_combined_correct.to_csv("test_bnt_hb_combined_correct.csv", sep='\t')

In [479]:
#test_hand_built_missed.to_csv("test_hand_built_missed.csv", sep='\t')

In [480]:
#test_hand_built_correct.to_csv("test_hand_built_correct.csv", sep='\t')

## Compare with baseline

In [526]:
test_baseline_correct = pd.read_csv('test_baseline_correct.csv', sep='\t')[['tweet','class','prediction']]
test_baseline_correct_hate = test_baseline_correct[test_baseline_correct['class']==2][['tweet']]
test_baseline_correct_hate

Unnamed: 0,tweet
13,RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO
28,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9
45,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf"
50,Watching #Steelers colts getting blinded by the Steelers uniys look like bunch of bumblebees
52,RT @netflix: Here's your first look at Charlie Cox as Matt Murdock in #Marvel's @Daredevil on #Netflix #NYCC http://t.co/SIYuCgok8J
63,Ravioli stuffed wit lobster nd a nice stuffed clam #eating betterthanamobster http://t.co/PzdoESaR
67,I want a four loco right now so I can chug it and go straight to sleep.
81,#Yankees Damn. Well Joe that move to the bullpen really helped.
84,Next time you leave magic trash in my jacket try not to have your initials written on them. @whatupag http://t.co/tFV1cZtr
100,Yankees getting killed


In [516]:
test_baseline_missed = pd.read_csv('test_baseline_missed.csv', sep='\t')[['tweet','class','prediction']]


In [527]:
len(test_bnt_hb_combined_correct_hate), len(test_baseline_correct_hate)

(688, 665)

In [543]:
df_all = pd.concat([test_bnt_hb_combined_correct_hate.set_index('tweet'), test_baseline_correct_hate.set_index('tweet')], 
                   axis='columns', keys=['First', 'Second'])

In [545]:
df_final = df_all.swaplevel(axis='columns')

In [546]:
def highlight_diff(data, color='yellow'):
    attr = 'background-color: {}'.format(color)
    other = data.xs('First', axis='columns', level=-1)
    return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''),
                        index=data.index, columns=data.columns)

df_final.style.apply(highlight_diff, axis=None)

"""@EdgarPixar: Overdosing on heavy drugs doesn't sound bad tonight."" I do that pussy shit every day."
"""@UberFacts: 15 sad TV character deaths we're still bitter over... http://t.co/uLclFUF8nC http://t.co/1RXYQOOsM0""ned stark was the man"
"""@ayyee_ceee_: One mans trash 🚮 is another mans treasure 🏆"""
"""How about we draft all those gung-ho folks and send them to Afghanistan or other war zones so they can show their... http://t.co/sR1zceHx"
"""Is that an albino Mexican?"" ""No dad, he's Asian."""
"""Your teeth are like the stars."" ""Aww thanks!"" ""Yeah... yellow and far away from each other."""
#NowPlaying : BT feat. Tori Amos - Blue Skies (radio edit) on #1069TheArrow - http://t.co/R5wuXA9jB6
#WorldSeriesGame3 Hunter Pence is so annoying he should be a Red Sox player. Shave fool and take your Vyvanse⚾️⚾️👊👊#Yankees 2015!!
#Yankees Damn. Well Joe that move to the bullpen really helped.
#Yankees I ain't complaining about the Royals win or lose game7. They showed heart. AL ride or die! No flex zone. They earned it!
#Yankees Pineda needed that 6'7. Great play!
#Yankees got beaten by a thug
"#hoosier fans, is cody zeller nominating for this years #NBA draft? #iubb"
#mt #goauche #monkey #oldpainting #vinyl #jazz #lounge http://t.co/hueH4HCxx7
#tikiti - rough mock for the base of what will eventually be an ashtray @ Tiki Ti http://t.co/8zE8aqY0
😂😂 maaaaannnn i hope so! RT @ABrown252: @VonshayeB did we get rid of John Legend looking Charlie Batch?
“@AbstractLife: @NigelDixon1 @SeanTHarrington Orton is trash idc” he ain't been playing. Trashlex Smith only sat out Kaep took his spot.
“@CommonBlackGirI: here’s to all the kids who have never found their name on anything in a souvenir store” thanks mom for the ghetto name.
“@HeyKeifer: Oreos are basically dirt circles with cream.” True.
“@HumbleTeej: @elisabethepps where am I on this rating scale???” Depends. You still wearing them colored contacts or naw?
“@MannyBDlopez: When you open your starburst candy and get 2 yellows http://t.co/F0dDbfEzB6” lmao bruh
"“@TWOLVEGIRL: I find myself wanting to google ""the yellow king"" #TrueDetective” No kidding! What is going on?"
“@WORSTRAPLYRlCS: http://t.co/hE19TIfqFX” How is this trash...
“@anallanusa32: 83 you're really cool you're good at basketball. And your a zebra 🏀🐼” is this @WillBall4Life?
“@iCARLEYBBY: “@DJZeeti: twitpic a selfie with light colored eyes” http://t.co/BSOYlL83Iz”😍
“@wassssupsarah: Car hopping is a different world these ghetto guys tried asking for my number LOL” did they have to be ghetto Tho ?
“@wizkhalifa: On the east coast there's colored hair everywhere. This must be the sweetest wave ever.” @LayaFace
“@wwwobert_: B Oreo a stunt”come thru 🔥🔥🔥
★ BEST ASIAN MASSAGE ON THE park slope --TOP RATED SPA ★ 718-622-0221 - 24 http://t.co/Nkvy9nwyzj
".@AlyssSmithh @KeveeeD Yes indeed! Camels, ostriches and zebras! Sept. 5-7 http://t.co/X3Oapfm3e8 #VCCamelRaces"
"5 bags of chips, 3 Rice Krispies, 3 seeds, 3 gummies, a pop tart, some crackers, 3 powerades, and a water yuhhh I'm good lol"
":) ""@PeterDavidsonII: At some point, we need to discuss how delicious the vegan brownie from @busboysandpoets on u St. tastes!"""
@1Corinthians126 #BooksOfGodWidom Proverbs 1:26 I also will laugh at your calamity; I will mock when your fear cometh;
@1Corinthians126 #Jesus Genesis 2:19 Out of the ground the LORD God formed every beast of the field and every #bird of the sky #Initiating
@AB_crispyy_andy I laugh Oreos and lady gaga on a Sunday day
@AJ0427 Just watched the Llewyn Davis doc. Warm fuzzies abound ☺️
@ASourAppleTree Have heard hillbilly grousings that bugs much worse in the KV since MIC shut down.
@AdamSchein @TheMayorMatt I may have just cried a tad #illmissu shiner
@Alludra_AIE just remember to keep the skirt dark colored. It camouflages all that pan dimensional gore
@Anotherpotheadd lucky monkey
@Auroramwj @marylmcaffry @BBAnimals It's a mutation. Genealogy is convoluted. Basically the zebra gene currieries are slaughtered=Extinction
@Boobah_ they trash
@Buckm00se @LEXXX_RUGER @ivanrabago_ the kikes want to charge you 💯 dollaz for the complete version http://t.co/ZJDLwiAjEo
@CapitolEVAN That's where I saw the leprechaun. He told me to burn things.
@ChrisTrondsen @IM5band #askim5 would you rather drink squirrel pee or Johnny Depp's wig in Charlie and the Chocolate Factory?
@DannyMndz93 @Titan21Mtzzz he's still a pogue though
@Dietrich1892 I'll show up and chug a beer or 8
"@EASPORTSFIFA next update, would be AMAZIN if two celebrations were added. 1: Shirt off ( we'll take the yellow) and 2: The RKO"
@GingerDemoness @RiotCast @weirdmedicine I wants to call bouts my vitaligro but I ain't sho if you a doctor fo da colored folk
"@Gladvillian u want me to make u one of my ghetto nunchucks, they come in singles and double_ended"
@GregHillWAAF soo what you putting on those crackers Gregg?
@HBergHattie @snkscoyote I wonder if the progs didn't relegate young black men to the ghettos to keep them away from harry reid's friends...
@HalleyBrenks3 interesting #hick
"@HeatherEliseP and plus, it's trash talk. You should know that especially with ole miss."
@InedibleFood Quickscoping trash. TAKE OF THE TONY HAWK SETUP AND PLAY NORMALLY. THAT'S NOT WHAT SNIPERS ARE FOR!
@Ish_MallyMal @JayJuice22 all trash to me
@JonJ_07 like their players really been talking trash about Bama.
@JonnieSantana He's definitely trash. But if Bill Cohwer or Stanford 's head coach want the job I'd hire them. With a new gm of course
@JuMosq @zachkruse2 Yep. *sobs* I got my hopes up when a couple of the national guys (including that Kiper/McShay mock) had him slipping.
@KeiveeB yellow shirt hat turnt to da side
"@Liveitupjersey BWAHAAAA I just choked on an Oreo cookie! I'M figuring, this may have something to do with it! http://t.co/62S7BgMYPX"
@LoLDyrus It's retarded. Ignore the harlem shake fad and hope it goes away soon.
@Lvl_7_Eevee @UnicornCowboy @hisorraya I just wanted brownies man
@MahNameIsJared be my guest. I want to be a bird I think
@McLaffyTaffy Reason #2 I didn't order a shirt: Taffy will know where I live and he would probably find me and stab me.
@MinuteManX1776 @TheyKnowNotWhat << closet muzzie @MayberryJustin @AllForCountry @LilMissPrepper @ritzy_jewels @glinsan1
@Misters206 You look like a monkey
@Nashgrier purple monkey
@Ncoleycole u still a mutt tho lol
@Notsosweetpea I like brownies 😢
@OldManRo you should check it out... it aint animated and has Charlie Murphy in it
@Pepper_Redbone @Yankees @Mets Oh yeah. And the annoying damn duck calls?? They outta be banned. Duck horns??
@ShojoAIE a bright neon yellow gauntlet
@Stephicans mhmm tweet bird 🐥
@SusanneWhite they're so goooood. I just want like a savoury salty cheesy cracker
"@Swamper60 ""Happy Happy Helloween, Helloween whoa oh whoa ho"" @helloweenorg http://t.co/Tw0jeExqTy"
@TheMayorMatt has to be #live version ! #pulledpork #clams #babyjesus #lilsmokey
@TheREALJB1RD Gotta get my superbowl shirt at the cleaners then we hitting the big bird.
@The_Gambit Ha. He ain't welcome in Washington State or Nashville. Serpas is straight trash.
@WHAScameradan We might have more. Can't really be sure. But some weird little bird said that might be a possibility. Stay tuned. Maybe.
@WakeUpPeeps1 Read w/o slant your orig statement said 'everything' we have...that is the only extreme comment made
@XxminijokerXx I HATE birds! They just piss me off about the fact that I can't fly!
@Yankees isn't that kind of saying the #Braves won't make it to the World Series? LOL
@YoungJeezy got me feelin like #trappin ain't dead. Got me ready to call the #guala! #Salute da truth! #SeenItAll #SeenItAllTheAutobiography
@YungCofGOA @12YearsAHaitian need to start mcbob because Williams only shoot let him play with bird
@_BradleyC skate 2 trash :(:( skate 3 had goat DLC
@_spenceyy @5SOS not scanger
@ashleyypat17 if im trash ur trash since we baes now
@b0ssladyre just ate a wees brownie and is seeing random cats
@boxingscene @RPopBox @BronzeBomber @boxing this Charlie Z really does need mental help check this out http://t.co/CkBS6WcZu1
@brianabrazilian oh... my phone is some trash! but you knew that!
"@britty_jurgens a music suggestion is ""lil spook"""
@c_drew_ I'm guessing so. The players are talking trash.
@caraabastow That's a cute and fuzzy one...hehehe ;)
"@cxslug I never did those games, but I was obsessed with Angry birds. Which is why it took me so long to do CCS. I was scared. Lol"
"@datachick Humanity really needs to get past this whole ""slow down and stare at the scene of an accident"" ape mentality we have :-/"
@erinscafe Bu I do. I do hate the playa. And the Yankees. If I hate the Yankees for their payroll I have to hate the Dodgers.
@fredabrahams @Brown_Moses I saw the actual tweet while scanning Jihadi accounts. They put a screen shot of it with: all aid workers r spies
@goldnsilvercoin the worst terrorists are republican teabaggers...
"@gregorious13 I don't understand it, there's trash cans/dumpsters everywhere and if there's not, just wait till you go somewhere with one."
"@hardball @Milbank Chris stop your angry bird spitting n Dana go hail ur racial tendicies, u didn't infor, carter rt n u never talk abt him"
@hxhsharingan666 did you make an animal cracker taco
"@ilovemytroops @SenatorTimScott I see. So you feel betrayed, your racism comes out. Shocked you didn't call him Uncle Tom you bitter loser"
"@iowahawkblog I also recommend 6pack of shiner Bock and 12 hours of brisket duty at ""the pit"" feeding oak coals 😉"
@karmendanielle_ let the coons be great it's in the dream
"@kayt27 poor coon.......snow stopped here,now if the wind would it might be nice out,well for winter"
"@littlelouie33 eh, take off hoser."
@louiev_ lol just getting this..early bird catches the worm!
"@luchadora41 so was slavery, prohibition and not letting women to vote... segregation jim crow laws.. etc etc i can do this all day lol"
@mandaxll2963 yellow is even better
@mark_mac18 @sheppast @rianscalia groves is trash IMO
@mis_sarahd @basedpapi1017 tranny
@mtmcly he will need monkey bars on his tricycle
"@oliver_______ if you're making 300 clams for betting 5, i hope you do it for a living"
"@paolo Like...Charlie Chan-style? (Never saw that, either.) Hmm...makes me think of the ""Cockneys"" in the Killzone series."
@rochaaxx yellow
@shrekkalove You speak colored too? I thinks I be in love
@teapot Great GIF. Got a source URL for @gifs?
"@tmundal just because some moist bint lobbed a sword at you, you think you're some la-de-da king?"
@urbasicyabish nigor??
@washingtonpost The people who work at those companies tend to be trash.
@wodaeeex3 it didnt open das y i asked cus i thought yu ate wat i had Pork chops yellow rice n beans
"A white lighter is bad luck, not yellow ur stupid"
Absolutely absurd that they even had to think about that call. #joke #zebras
"After seeing Kansas play Duke a couple weeks ago, I could tell you I'm surprised that Charlie Weis has been let go...but I'm not."
All my so whitey bois gone ride
Almost half of North American bird species threatened by climate change http://t.co/dWSYB5BodA http://t.co/F0PVuLNf4c #Climate #Environment
"Apolonia Gardner of Imperial, CA misspells ""phalarope"" - a bird that resembles a sandpiper. The dreaded schwa! #spellingbee"
Apparently im a honky. Pfft. Like that's a bad thing.
"Be fearful of the wrath of God in a country where many mock Christianity, embrace perversion, abortion, and bows to PC/multiculturalism."
Best rider? That goes to a blaxican.
"Bet they want a cookie, Bergbrain? You mean like you want a cracker? Kinda bitter here today aren'tcha? #stribpol mngop #mnleg"
CAIR Publishes a Hit List for Jihadis http://t.co/hvVfXypmHm via @RealJTP If you are a red blooded #American you may be on the list..
"Can someone please explain to me, Why are the birds so angry & destroying pigs' homes?"
Charlie Baltimore look good for 45 😳
"Charlie Rangel: Some Republicans 'Believe That Slavery Isn't Over' | Truth Revolt: http://t.co/NpbdPk0Cqz Bam, Race Card Vote Democrats out"
Charlie is so adorable. http://t.co/TMpBYyjGFL
Conocí tus ojos negros Y ahora si que no puedo vivir sin ellos
Country Bumpkin Word of the Day : yokel http://t.co/aBpFZgJU
Currently being hit with the jig. Word.
Dallas hoe RT @beanreturns: Why http://t.co/D6HAlItFmP
Damn internal alarm clock. Woke up at 6:35am feeling chunky from my 3am Cheerios splurge. Guilt led me to the treadmill. #Gym #nosleep
"Do you have a nickname? What is it? — Tator, Tator tot, Tightey whitey, whitey Jr, Taylor whitey pants, agent ti... http://t.co/xLBhDJfBF4"
Do you really believe people evolved from apes? — Not really. I'm quiet convinced with the evolution of man but ... http://t.co/1RswGNpLtc
Don’t believe AP’s ho-hum tone: latest poll is great news for Republicans - Hot Air http://t.co/5HHrFlFP13
Early bird catches the worm 👊
Early bird gets the worm right? Hopefully I get a job soon after today! http://t.co/lvgfVBqkdz
Early bird gets the worm.... #GoGetIt
Early worm gets the bird #Bslfe #GoodMorning http://t.co/dfE8MF2Q28
Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9
Fairy tale world lmao
Finally got an updated pic with angel wing colored http://t.co/nWN9qRt5Xd
"Finished Yankeeography and still hate the Arizona D'backs for winning in 2001, Yankees shoulda had that"
First jig of the night
Frozen peas and ibuprofen. My new best friends # cripple
Game 126 Preview: Houston #Astros at New York #Yankees http://t.co/sVg7dIIcZG
"Good morning Twitter. Make today a positive one. If you can't do that, beat up as many meanie-faces as possible. Also, I hate flappy bird."
Got chunky marmalade today of a greatful customer for my service over the past 2 yrs #PostieTweet #ServiceMatters http://t.co/pSEnGvyxrw
Got my vans on.. My pockets chunky
Gotta keep the sox down while they're down! #RedSox #Yankees #mlb
"Grandma's Homemade Potato Soup - An old fashioned, simple, chunky potato soup like Grandma made, wit http://t.co/ZV8ZsmHNlm"
HOW “@darrenrovell: Deep Fried Sweet Tea on a graham cracker crust at State Fair of Texas (via @gravweldon) http://t.co/KVzhlmCsls”
Hella just ordered me some beanies
"Hold all your ghetto labeled and endorsed vodka brands, you can rum me silly any day. #ShiverMeTimbers"
Holy monkey balls its warm today
I ate a monkey. #AndNowIHaveEbola
I hate Hashtag activism. No lie. I think 95% of it is trash.
I hate that wee boy wae the squinty eyes in the game of thrones. #Suchaprick #hopehegetskilled
"I hope the Yankees get Soriano, shoulda never gave him away for Alex Rodriguez"
"I mean most ppl confuse when to use ""to"" and ""too"", this nicca used the word ""two"" when he should've used ""to"" 😒"
I see ya big Boi in the Oreo V's
I sho do miss my Uncle Tom
I still can't understand why we don't dump our trash in the ocean. it's pretty big and it'd surely be cheaper than launching it into space.
I wanna try pot brownies.
I want a four loco right now so I can chug it and go straight to sleep.
I want a slope job so I can get away from everything and be by myself for two weeks. I hate being around people unless they are my friends.
"I was flying down the slope and just rammed into this small Asian boy, kid got up like a champ and walked it off"
I would describe @whatupag's work as blaxican.
I'd be the best at owning a bird. I could teach them the funniest things to say.
I'm not really a phone kinda guy.. I actually hate talking on the phone & texting kinda trash to me also.
I'm such a retard sometimes.
I'm the biggest redskins dam right now if they get this stop
I've pretty much eaten an entire box of my nephew's cheese nips. I only feel bad a little.
If only I dated a girl that lived off Mack Road I could just get the holy trinity of Sacramento ghettos on a daily basis
If you a hoe ima treat you like one just like if you're a good girl ima treat you like one. I don't discriminate
"If you've ever been to drunk to fish, you might be a redneck"
Is Yankee in a good mood today or what
Is dare any colored players in hockey?
Is there a looser #college town than #BTown ??? If yes id like to visit #IU #hoosiers
Islamic Jihadis run away to Pakistan @PureMonotheist @BemetOr8 #UniteBlue
It's a matter of pure weight ratios... A five ounce bird can not carry a one pound coconut!
"It's almost Christmas, Charlie Brown..."
Jay Z - Takeover & U Don't Know LIVE @ Home & Home Concert Yankee Stadium: http://t.co/4EuK80uLiS via @YouTube
Jihadi streams: British #ISIS fighter identified as Abu Abdurahman Al Britani killed in action. http://t.co/Ksowd8rRCk
Jihadis betting on public protests in Britain to force UK gov out of the anti-ISIS coalition http://t.co/5gJSmrfzWJ
Jindal seemed to be having an out-of-body experience enhanced by special brownies http://t.co/W0cyEcqKLo Bj Up and At 'Em #stribpol #mngop
Leave it up to me it would've been in the trash.... All of it.
Legion of jig 😭😭😭
Little things can just kill a negros mood man
Lmao RT @iPOSTBADTWEETS: Lowkey Peyton Manning trash
Lmaoooooo @ Supreme being trash. Guess Bape is trash to huh?
Looking back I liked colored better. Let's go back to that
Looking forward to #NCAA and #NBA basketball 🏀 in the states soon #iubb #hoosiers #pacers #PacerNation and of course breadsticks.
"MT: ""@DocKozlowski: Ted Cruz ""We are proud wacko birds"". #StandWithCruz #KeepCruzing"" is that a pun?"
Magellan? You suck at trash talk. @physguy2 @submix8c
Man talk to em RT @StraightCash08: Of course nothing is wrong with applebee’s if you like trash food.
My sister told me that I look like a bird. While my past told me that I look like an alien.
Next time you leave magic trash in my jacket try not to have your initials written on them. @whatupag http://t.co/tFV1cZtr
No more speculation. No more mock drafts. It's finally Draft Day. #BortlesKombat #StormWarning #BuiltByUCF #ChargeOn
Noap. Not taking them. RT @TiffNCompany: Maca root pills making my vagina do the wop
Nothing good on TV and the @Yankees are eating their balls 3 innings in. Great.
Nothing more yokel MSU fan than to constantly reassess Izzo *aka our only consistent winner since D. Daugherty or Biggie flipping Munn*
Now infuriated Jihadis are putting out murder requests on #Saudi fighter pilots who participated in the bombings in #Syria #ISIS
Omer Asik so trash on the offensive side of the floor dude real life murky doo doo water
One mans trash is another mans treasure
"Oops, I just had a Freudian slit."
"Oreo Shake, I want you."
Oriental rugs cause I'm flyer than a dove.
Other Jihadi news #ISIS executed #Kuwait preacher Redha Lari coz he was..“too extreme” & hampered reconciliation with Nusra with his fatwas
"POLLOS A L'AST Dos pollos estaban dando vueltas en una pollería mientras los estaban cocinando a l'ast, cuando... http://t.co/fXeEAJmyfZ"
"Perhaps I'm not a ""diehard"" Yankee fan, but there's no more point in watching this debacle #Yankees #Tigers #MLB So what else is on? LOL"
"Pic circulating on Jihadi feeds claiming an ""ISIS flag"" on top of Mount Arafa in the holiest day of pilgrimage http://t.co/46YfuflyZV"
RT @23Vnds: Check out the homie @renz360 for that sole sauce! That stuff does miracles!!! @renz360 @renz360… http://t.co/EecEzJRx6C
RT @49ers: Oregon State WR @brandincooks is a popular #49ers pick in recent mock drafts. DRAFT TRACKER: http://t.co/AEueKmhkQX http://t.c…
RT @A_single_bear: I accidentally ate a few fireflies last night while chewing some leaves I found. The shameful yellow glow of my mouth wa…
"RT @AmandaMarcotte: “Not all men interrupt women to quibble over irrelevant issues for the sole purpose of derailing a conversation,” he sa…"
RT @AwwAdorable: I want a pet monkey 😩😍 http://t.co/8wy8gDJIEJ
RT @AyMrCarter: Apologies trash.. What's understood ain't gotta be explained
RT @BasedPaco: The NFL is very equal this season everybody showed glimpses of being trash
RT @BeyonceLand: Bey lost her ring at the Met Gala. Jay found it & instead of just giving it back he gave it to her as a mock proposal http…
RT @Bidenshairplugs: I can't believe the cops shot this fine young man. There goes the cure for cancer... #FergusonOctober http://t.co/Leb…
RT @BirdGang316: If you a bird throw it up #birdgang #EaglesNation
RT @CHlLDHOODRUINER: she paid $5 to hold the monkey and look what he did 😂😭😩 this monkey real af http://t.co/NgeCuWehHb
RT @CyFyre: @WesOrrJames love you too my nig #squad
RT @Dalilaaaaaa__: Flappy birds cheat codes http://t.co/nu6GjYQDJJ
RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO
RT @DogeTheDog: Wow  Such banana  So potassium Much yellow http://t.co/xINHDwyTqS
"RT @ESPNNFL: For the 1st time in the Tom Brady era, the Patriots are in sole possession of last place in the AFC East http://t.co/zMkwGNzOCT"
RT @Elisaa_Martinez: Happy birthday cotton picker lol @1NOnlyDirtyMike
"RT @Fact: Fake friends are no different than shadows, they stick around during your brightest moments, but disappear during your darkest ho…"
RT @FeinsandNYDN: Yankees acquire Chase Headley from the Padres for Yangervis Solarte and Rafael DePaula.
RT @HMSguy: Rahal's an idiot. Second time this year he's wrecked someone under yellow.
"RT @HoodJesusYo: Last name, Sinner First name, Imma Like ur pancakes in tha mornin, He got u covered like Aunt Jemima"
RT @I_Be_kOoLz Food be good...except that rice they cook that bitch on Monday for the week + Tootsies?
"RT @JStac825: There's coon classic (R. Kelly, Usher) and then there's classic (Stevie Wonder, Prince). Not that there's anything wrong with…"
RT @JoeyG_145: Downloaded flappy bird 5 minutes ago... It's already deleted.
"RT @JohnnyFootbalI: Yeah Kaepernick might have biceps like a Greek God, but the dude looks like he was conceived by a Proboscis monkey http…"
RT @KarenRFM: No-TV-During-the-Week #parenting rule has thrown quite a monkey wrench into #RooseveltsPBS viewing schedule. #DVRtime @wcve #…
RT @Kevineffinskaff: My idea of a diet is eating regular Oreos instead of double stuft and takin off half the cookie (still dip it in 2% ...
RT @Kiaranicole_x: @_JoAries @Kash_WingateLFC @chanchan_ox babes we buy brazilian and Peruvian not Bengali
RT @Kick_Man: Giants- Pitiful .. Jets-Pitiful .. Mets- Pitiful .. Yankees-Pitiful .. Nets- Pitiful .. Knicks-Pitiful ... Ny sports- Pitiful
RT @LOHANTHONY: i'm so hungry if someone would send me a flying parachute with brownies attached to it like they do in the hunger games tha…
RT @LilyBean1313: And I thought I had a problem! #Oreos http://t.co/jfrBKzE1T5
RT @LuvKittensDaily: Oreo being adorable http://t.co/5ZMZVT0sUl
"RT @MTVUK: #Directioners, 1D's @zaynmalik & @Louis_Tomlinson monkey around on set of #StealMyGirl video: http://t.co/sOEbNlR8Re http://t.co…"
"RT @Macbeth870: #bowebergdahl There hasn't been a trade this lopsided since Babe Ruth was traded to the Yankees for ""No No Nanette"" cash."
RT @MarcACaputo: BREAKING: Charlie Crist files emergency motion for a mulligan.
RT @MaryWCVB: Still waiting for Charlie Baker to take the stage in Swampscott. #wcvb http://t.co/lYb3Thm199
RT @Maxicat: Charlie Rangel Re-Writes History: On GOP “They Think They Won The Civil War” http://t.co/moCeUBRUTf
"RT @MindOnSuccess_: My mother be on some trash man 😂 I ask her EVERYDAY, you need/ want some money? She say ""Naw I'm good!"" But come payday…"
RT @MobJoe: Word. And it don't make u a hoe RT @100granHman: It's okay to have sex on first date long as the feeling is mutual
"RT @NateKlempa: @ROOTSPORTSPIT must not realize how much #WVU games mean to all of us ""hillbillies"". The @Pirates play 162 games a year. Sh…"
RT @NickSwagyPYoung: Happy Halloween from Rick James (aka Swaggy) and Charlie Murphy ( @bigmeat2000 ) haha darkness everybody http://t.co/y…
RT @Perspective_pic: Retweet if you see the bird. http://t.co/o7reXL0teu
"RT @PlMPCESS: A silent protest in Philadelphia, performance artists protest the murder of Mike Brown while passerby's mock it. http://t.co/…"
"RT @PlMPCESS: In the fashion world ""urban"" definitely means ""stolen from black people"" it's ghetto on us, and urban on them haha http://t.c…"
RT @ProBirdRights: A marry between a man and woman people sloppery slop;then what is prevent a bird from marriage my sandwich??? I'm in lo…
"RT @ProBirdRights: i see you again didn't election bird for government, america. that okay. i guess i just take this free cookie for everyo…"
"RT @RayHudson: Art thou the bird whom Man loves bestThe pious bird with t/ scarlet breast, Our Robin;that leaves these summer winds sobbing…"
"RT @RealTimeWWII: 6000 people/day now being deported from Warsaw ghetto, packed into cattle cars for rail trip to Treblinka death camp. htt…"
RT @RudeBoi_Drew: @PAPER_CHAYSIN condom were invent for that sole purpose
RT @SenorSteez: This whole week was trash
RT @Shalewis09: Larry Elder is an Uncle Tom. I cannot believe how he is degrading a child.
RT @ShelleT1986: Great start the morning! My daughter danced herself into the coffee table and gave herself a nice shiner... Yea she's a wh…
"RT @SimpsonsQOTD: “That's a very nice jig, Kearney. Now isn't dancing much more fun than bullying?"" http://t.co/dk7rcqM1FS"
RT @SneakerPics23: Oreo 6's http://t.co/Jx9DD9oahY
RT @SunSentinel: The @SunSentinel Editorial Board endorses Charlie Crist for governor -- Better for Floridians http://t.co/t21z2YBZlQ http:…
"RT @TheDrunkStory: ""My bro made out with an ugly chick at a festival while drunk so now he's known as Larry bird because he hits ""threes"""" …"
RT @TheRisingStar23: Shy Glizzy Decent Af
"RT @TheRoot: Don Lemon gets called an Uncle Tom, doesn't like it: http://t.co/eV4xgI5Pgc http://t.co/i6Xs8HCnrb"
RT @TomBradysEgo: Might have a better chance if we put some trash cans out there.
"RT @TwitchyTeam: Low turnout reported, but Charlie Crist just filed and emergency motion to extend voting hours in Broward Co. http://t.co/…"
RT @USAgov: Our national bird - the American bald eagle - is an endangered species success story. http://t.co/OJscNNMYEm
RT @WashTimes: John Kerry caught appearing to mock Israel’s ‘pinpoint’ operation in Gaza - http://t.co/3yqFR9LZsR #Israel #Gaza http://t.co…
RT @Yankees: #Yankees win! Final score: 7-4.
"RT @Yankees: Congrats to Derek Jeter, who will start at shortstop in his 14th and final All-Star Game! #FarewellCaptain http://t.co/5nhtJOx…"
RT @Yankees: Time to bounce in the Bronx. #LetsGoYankees #Walkoff http://t.co/W5ATfRsLKK
RT @Yankees: You may cry after watching the Derek Jeter @Gatorade ad: http://t.co/hMO9Bxol0x http://t.co/0xdlxToci8
RT @anilkohli54: @mediacrooks so the ball did roll down the slope & found a natural spot @madhutrehan defending brother & the scumbag @sard…
RT @bakedbeansbro: Throw me in trash http://t.co/GvGMJJ6vYi
"RT @bitterarab: Now that Halloween is fast approaching please understand this. We are a culture, not a costume to mock and ridicule! http:/…"
RT @bjs5555: ELECT Democratic Nominee Charlie Brown @Govcharliebrown for #Gov of #TN DEFEAT (R) Gov. Bill Haslam! @UniteBlueTN http://t.co/…
RT @briangaar: Old white people complaining about government tyranny is like the Yankees being upset that players make too much money
RT @chelseaxlaser: I really need to take my rose colored glasses off though. I gotta stop thinking everybody does shit with good intentions…
RT @chilltweetss: flappy bird is deep... hahaha http://t.co/EMuhmR3jzE
RT @chilltweetss: when I make the paper ball into the trash can 💨 http://t.co/Xnx2XEGACm
"RT @cpabry: Curious George, the curious little monkey or deranged serial cat killer? The story the man with the yellow hat doesn't want you…"
RT @cwissi: beanies for life ◡̈⃝ http://t.co/iywPwmPGTu
RT @daqraca: How can anyone hate Charlie Sheen http://t.co/AF4T3MtmxU
"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf"
RT @druggedvibes: Top 5 Thot Tattoos : 1) dream catchers 2) bird feathers 3) inspirational quotes that they dont live by 4) animal p…
"RT @espn: Hot off the press, here’s Todd McShay’s first post-Combine NFL mock draft, including a new No. 1. http://t.co/su5ixWrthq"
RT @foodbibIe: Oreo Cheesecake http://t.co/yW299XjzB2
RT @harrryaf: being a good citizen picking up trash (: http://t.co/PMFR5eopwO
RT @iLLmak3Ufamous: Touch my swag. Wish you could. RT “@VizyIsIgnant: I LOOK FLY. I LOOK GOOD. RT @Wisdom_Stature: On my momma.... On my ho…
"RT @jrsalzman: Buzzfeed males who live in the city and buy meat under plastic, mock girls who use guns and go hunting. Who is more manly?"
RT @kendramarie91: “@vngelinaa_: Tornado sirens.... Atleast I'll be dying with you guys @Xo_t33 @a_krizanovic @kendramarie91 😭” and this ho…
RT @koolestbreeze: Thanks Charlie!! http://t.co/iw06cCEkND
"RT @kristinhersh: my son, Wyatt: ""do rednecks have mullets to keep from getting red necks?"""
RT @lnsaneTweets: when other girls wear beanies they look cute & stylish but when i wear them i look like a member of a drug cartel who sel…
"RT @lowcountry_luv: Attack animals, last house on the left by the lake, and she got colored eyes...we should have known.! Lol"
RT @maycie32: @Whitecholo23 np big nig nig
"RT @michaelpshipley: Martin Short On - CONAN - Oct 3, 2014 ""With your yellow skin and red hair you look like a #2 pencil."" https://t.co/U…"
RT @missmollylo: Charlie wears women's clothing
RT @netflix: Here's your first look at Charlie Cox as Matt Murdock in #Marvel's @Daredevil on #Netflix #NYCC http://t.co/SIYuCgok8J
RT @ochocinco: If your girl didn't camp out overnight to buy you the iPhone 6 your pee pee is trash...
RT @renz360: Got 5 20% off slots for sole sauce for the early birds today Use “SAVE20” at checkout http://t.co/x5bq5vzpPK
"RT @robertbevan77: It's like The Walking Dead, but with less Carl and more talking birds. ZOMBIE ATTACK!!! http://t.co/UOCsPyksSC http://t.…"
"RT @runofplay: I'm confused. Martins Indi just drew a yellow card, but I didn't see him steal a tank from the Brazilian army and open fire …"
"RT @russnelligan: .@KarynPolito declares victory by introducing ""our next governor, Charlie Baker"" @CharlieForGov #wcvb http://t.co/HYkWrVD…"
RT @ryandolan123: when people say coloured http://t.co/2lKUkctPf0
"RT @sportswithjohn: Dani Alves uses his new gray hair to talk himself out of a yellow card, telling the referee, ""I am but a humble senior …"
"RT @ticiaverveer: Royal city of the Kushite kings at Meroe (Sudan),near River Nile.It was the seat of the rulers who occupied Egypt http://…"
RT @walshnyc: Which World Cup Teams Are Jihadis Rooting For? http://t.co/ecU754zHKk @versharma @vocativ http://t.co/8z0B7ocMqP
Ravioli stuffed wit lobster nd a nice stuffed clam #eating betterthanamobster http://t.co/PzdoESaR
Selling ex lax brownies in front of Meijer to pregnant women. #GoodDeed
She trash tho.
Shut up birds it's bed time.
"Sitting on the deck, enjoying a #StoneIPA, watching/smelling the neighbors burn trash. #Missouri http://t.co/m9wYnrLtWz"
So am I the only cracker up playing his guitar (thank you @Tampa_Rick ) on this Saturday night?
So many weird people in the ghetto at this time.
"So my light was off & door was closed , mom walks in turn lights on . Tells me to take trash out in the morning . then walked out.. Light on"
"Special thanks to Tina, Charlie and staff at Waverly Hills... thanks to everyone that dedicated time to make the... http://bit.ly/aKyqk2"
Swap meet bound #beaner
"Thank you Charlie, Happy Thursday @Charlie4927 @StacyDmomof5 @PattiSM74 @Kacado @cantUCIMblonde @RevkahJC"
"Thanks Charlie, have a great evening @Charlie4927 @StacyDmomof5 @RevkahJC @cantUCIMblonde @Kacado @PattiSM74"
That warm fuzzy feeling when you're able to give someone a helping hand. Talking about medical problems with others can really help.
That was trash. I'm just gonna put my phone down. Goodnight
The #teabagger #freemarket really doing wonders in #Bangladesh these last couple of days. #tcot #p2 #teaparty #randpaul
The Bucs has 2 great uniforms and ruined them both for that trash smh
The Walking Dead is trash people... That is all. 😂
"The beautiful Fairy Pools near Glenbrittle on the Isle of Skye, Scotland http://t.co/cyi6xSvn97"
The right for an English person to go about his business is more important than any suspicion of some rouge wog cop.
"This Billy Crystal tribute to Derek Jeter is crazy. I hate the Yankees, but man... Derek Jeter.... #RE2PECT"
This Charlie Murder game is pretty dope #Xbla
"This mint brownies recipe is made with almond flour, coconut palm sugar, coconut oil, and coconut mi http://t.co/XZZfDoLZDn"
This music is trash
"This one night I was tripping off brownies I put my running shoes on and I was still in pjs, it was 2am ! http://t.co/UwmBpCOMNE"
"This picture on Weibo made China's censors add ""big yellow duck"" to its list of banned terms → http://t.co/87NSk91rJR http://t.co/j3aAKxwIfe"
This waiting thing is for the birds...
"Thw White Iron Band plays this weekend in Fargo,ND at the Aquarium(21+) ,Friday(10-29-10) with Charlie Parr. The next night,Saturday..."
Tommy trash
Tomorrow's agenda: bake brownies & visit a couple open house.
"Tuna is good the rest trash RT @AyeoFool: Cans of spam RT @MsBTxo: Most struggle food HAS TO BE any meat In a can, tuna included"
U serious bro?? lol RT @CheezMoeJenkinz 2-3:10am early bird special
"Un pollito cruzaba la frontera, un zorro se tropieza con el pollo, el zorro le dice, im sorry el pollo le dice im... http://t.co/oXcOYauJi1"
Watching #Steelers colts getting blinded by the Steelers uniys look like bunch of bumblebees
We over here struggling with college and work and ice jj fish making money for being trash lol. What has this world come to.
Western Jihadi fanboys use of language: peppering sentences with Arabic words to sound knowledgeable - #Comical http://t.co/G2rca1XGh7
What's a trash party ?
Whipped http://t.co/EFSVjPqrwq
Whipped out some french in front of some babes at the post office. #winning
Who needs an alarm when you have the worlds most annoying bird outside your window?!??
Whoever mocks the poor insults his Maker; he who is glad at calamity will not go unpunished. http://t.co/P7zwAoXi4l
Why are people saying don't let guys treat you like a yellow starburst? That's my favorite.
Wicked Oreo
Wow. No yellow yet? = Pagenaud and Marco screwed. Maybe no yellow coming
Yankees getting killed
Yeah they do. RT @Yankees: Six complete here at The Stadium. #Yankees lead the Red Sox 3-1. #LetsGoYankees
"Yep, sources confirmed it. That Iggy concert last night was trash"
"You know I'm not big on the NFL, but I'm so sick of hearing all of this ""Black and yellow"" shit. LOL @ bandwagon fans and hell, GO PACKERS!"
"Your A Good Gravedigger, Charlie Brown #SpookyBroadway @midnight"
bae won his game 😍 #redskins
being up at 5am cuz your cat meowed at a nocturnal bird is the ultimate #toughbreak
can my cat not kill birds and make a huge mes of feathers
cousin: why is there a bird on your screen me: it's twitter cousin: i poke you me: that's facebook
"cut up porkchop, canned pinto beans, a small can of carrots, can of corn, celery, a potato. cayenne powder + crushed red pepper"
i made sweet & SPICY drunken noodles with handmade and handcut ho fun noodles with marinated chicken/shrimp. my back is hurting.
i thought garbage men were supposed to bring trash to the dump but they wouldn't take me ha ha ha. self depreciation ftw.
i was such a teacher's pet in elementary school. like i brought them cookies and brownies and gave them hugs. im ashamed
"if ice jj fish gets all these looks , who really trash me or him"
if u eat fried eggs and leave the yolk entirely yellow ur a disgrace and also nasty
if ur girl watch Snapped in front of u she trying to spook u!!!
it's so weird outside. it's like yellow.
jacob u look like a monkey in dis http://t.co/rkaqn0GRQe
"last thought before bed: in '92 we had a ""mock election"" in 3rd grade and i voted for Perot. maybe i related to his really big ears, idk."
oops: 'constitutionalists' #teabaggers in #mississippi forgot ratify a centerpiece of the constitution: 13th amendment banning slavery.
pancakes trash
the train station Harry's at is trash I've been to the same Eurostar too they refunded my whole trip to Paris bc train accident
third party keyboards on iOS 8 trash
this movie is actually good cuz its so retarded
versace bird feeder
who's downie like a brownie 4 brunch? Need mimosas
wish I had a brownie or a dab to knock me out rn
yaya ho.. cute avi tho RT @ViVaLa_Ari I had no idea she was sleep 😩
yea so about @N_tel 's new friend.. all my friends kno they're only allowed one cute darkie friend n that's me! lol


In [None]:
df_final

In [507]:
m = test_bnt_hb_combined_correct_hate.merge(test_baseline_correct.drop_duplicates(),how='outer',indicator=True)


In [512]:
m

Unnamed: 0,tweet,class,prediction,_merge
0,RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO,2.0,2.0,both
1,RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO,2.0,2.0,both
2,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9,2.0,2.0,both
3,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9,2.0,1.0,both
4,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9,2.0,2.0,both
5,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9,2.0,1.0,both
6,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf",2.0,2.0,both
7,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf",2.0,1.0,both
8,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf",2.0,2.0,both
9,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf",2.0,1.0,both
