In [370]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [371]:
df = pd.read_csv("data/train/training_data.csv")

In [372]:
tweets=df.tweet

## Davidson Feature Generation

In [373]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = tweet.split() #[stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
    )

In [338]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [339]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [340]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )

In [341]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [233]:
def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

## Preprocess for slang

In [234]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [235]:
def load_slang_dict():
    slang_dict = {}
    with open("slang_to_words.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('\t')
            #print(tokens[1])
            slang_dict[tokens[1]] = tokens[0]
    return slang_dict
slang_dict_one = load_slang_dict()
#slang_dict

In [236]:
def load_slang_two_dict():
    slang_dict_two = {}
    with open("noslangdotcom.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split(':')
            #print(tokens[1])
            slang_dict_two[tokens[0]] = tokens[1]
    return slang_dict_two
slang_dict_two = load_slang_two_dict()
#slang_dict_two


In [237]:
def load_slang_three_dict():
    slang_dict_three = {}
    with open("internet_slangsDotNet.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('==')
            slang_dict_three[tokens[0]] = tokens[1]
            #print("first ", tokens[0], "second ", tokens[1])
    return slang_dict_three
slang_dict_three = load_slang_three_dict()

In [238]:
def load_slang_four():
    slang_dict_four = {}
    with open("common_twitter_abbreviations.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('=')
            slang_dict_four[tokens[0]] = tokens[1]
    return slang_dict_four
slang_dict_four = load_slang_four()

In [239]:
def merge_dicts(*dict_args):
    """
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    """
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

slang_dict = merge_dicts(slang_dict_one, slang_dict_two, slang_dict_three, slang_dict_four)

## Replace slang with definitions

In [240]:
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer()

In [241]:
def slang_sentiment(text):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return senti

def positive(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def negative(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def objective(text, indicator):
    text = remove_slang(text)
    senti = get_sentiment_text(text)
    return get_pos(text, indicator)

def remove_slang(text):
    s = tknzr.tokenize(text)
    soFar = ''
    
    for word in s: 
        if word.lower() in slang_dict:
            soFar += slang_dict[word.lower()] + ' '
        else:
            soFar += word  + ' ' 
    return soFar.split(' ')

def get_sentiment_text(strList):
    text = ' '.join(strList)
    pos_values = nltk.pos_tag(text)
    pos_senti = []
    for (x, y) in pos_values:
        if len(get_sentiment(x,y)) > 1:
            pos_senti.append(get_sentiment(x,y))
        else: 
            pos_senti.append([0, 0, 0])       
    return pos_senti
        
def get_pos(text, indicator):
    x = 0
    pos = get_sentiment_text(text)
    for v in pos:
        x +=  v[indicator]
    return x

In [242]:
from nltk.corpus import wordnet as wn

In [243]:
def penn_to_wn(tag):
#Convert between the PennTreebank tags to simple Wordnet tags"""
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

In [244]:
def get_sentiment(word,tag):
#""" returns list of pos neg and objective score. But returns empty list if not present in senti wordnet. """
    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []
    
    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]

In [245]:
data = pd.read_csv('SentiWordNet_3.0.0.txt', sep='\t', header=None)
data.columns = ["POS","ID","PosScore","NegScore","SynsetTerms","Gloss"]

Check for quotes

In [246]:
def contains_quotes(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word == '"' or word == "'" else 0, s))
    if score > 0: 
        return 1 
    return 0

Check if self-referential

In [247]:
ethnic_groups = []
with open('ethnic_groups_and_common_slurs.txt', 'r') as fileinput:
    for line in fileinput:
        ethnic_groups.append((line.split('\n'))[0].lower())


#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this']
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want', 'wants', 'are']
reclaiming = ['proud', 'reclaim', 'reclaming', 'offensive', 'like']
me = ['i\'m', 'we', 'i', 'me', 'this']

def contains_target_self_referential(text):
    words = tknzr.tokenize(text)
                
    #check word in ethnic_groups comes before word in me   
    #e.g. the beaner in me forgets I like beans
    for word in ethnic_groups:
        if word in words[0:]:
            for key in me:
                if key in words[words.index(word):]:
                    return 1
    
    #check if word in me comes before word in reclaiming
    #e.g. i'm a proud beaner
    for key in me:
        if key in words[0:]:
            for word in reclaiming:
                if word in words[words.index(key):]:
                    return 1
                #check if word in me comes before word in ethnic_groups
            for word in ethnic_groups:
                if word in words[words.index(key):]:
                    return 1
            #check if word in me comes before word in ethnic_groups
            #e.g. We beaners have to stick together
            for word in ethnic_groups:
                if word in words[words.index(key):]:
                    return 1
     #check if word in reclaiming comes after modality 
    #e.g. all beaners should go home is offensive
    for key in modality:
        if key in words[0:]:
            for word in reclaiming:
                if word in words[words.index(key):]:
                    return 1
    return 0

Offensive to women/words that hurt

In [248]:
words_that_hurt = {
    'bitch': 'Targets and dehumanizes women, even if used toward men, including queer and gay men. Devalues women and femininity. Reinforces sexism.',
    'ghetto' :'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'ratchett':'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'illegal alien': 'Reduces undocumented immigrants to something less than human. Fixates on legal status instead of people as individuals. Asserts that some people belong here more than others do. Ignores political, social, and economic factors that impact people of color.',
    'no homo': 'Stresses the speaker\'s heterosexuality, masculinity, and/or other traits to avoid being perceived as LGBTQIA. Goes to great lengths to avoid association with anything queer. Reinforces that to be LGBTQIA is bad.',
    'retarded': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'retard': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'lame': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'crazy':'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'dumb': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'that\'s so gay': 'Stigmatizes gay and queer people. Uses their identities to describe something as undesirable and bad. Replaces negative adjectives with words related to LGBTQIA identities.',
    'whore': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'ho': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'slut': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'Bisexuality doesn\'t really exist. People are just gay or straight.': 'This denies the fluidity of sexuality and dismisses people\'s experiences and definitions of self. People deserve the right to define their own identities any way they wish and have those definitions honored.',
    'i think everyone is bisexual': 'While this is often meant to acknowledge the fluidity of sexuality, it dismisses the reality of people who identify as bisexual and erases their experiences. It also invalidates the self-identifications of non-bisexual people.',
    'You\'re too femme to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'You\'re too butch to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'Bisexual people just want straight privilege':'Bisexual people experience discrimination within straight communities and lesbian/gay communities. They never fully experience straight privilege because they do not identify as straight. Often their identities are made invisible and denied.',
    'Bisexual people are just greedy and want to have sex with everyone.':'This stereotypes bisexual people and assumes they are all promiscuous - and that this is a bad thing. It creates negative attitudes toward sex and works against creating a sex positive climate. It also demonstrates an underlying belief that bisexuality is only about behavior and is not a legitimate identity.',
    'Who do you see yourself ending up with?':'This is another way of implying one has to "end up" gay or straight and ignores bisexuality as an identity versus a relationship status. It also assumes everyone desires to be in a long-term monogamous relationship.',
    'Tranny':'Whether or not someone identifies as trans*, calling anyone "tranny" is extremely offensive. While some folks within the trans* community may choose to reclaim this word for themselves, it is not a word that is okay to use to label another person or use as a joke.',
    'That person doesn\'t really look like a woman':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'That person doesn\'t really look like a man':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'What is your REAL name? I mean the one you were given at birth':'This implies that the person\'s gender identity and chosen name are not "real" and perpetuates the idea of Trans people as deceptive. It removes agency and any right to make decisions for themselves, and is incredibly invalidating. It presumes a right to intimate information, disregards privacy, and places Trans lives on public display.',
    'He-She':'This hyphenated term is demeaning and invalidates an individual\'s identity and the pronouns that they use.',
    'What are you REALLY? Have you had surgery?': 'Asking anyone personal questions about their bodies and/or surgeries is invasive and inappropriate. We don\'t ask cisgender people about what is under their clothes; we shouldn\'t ask Trans* people either.',
    'cunt':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'twat':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'pussy':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'thot':'Word created to express womxn or people who are sexually promiscuous. There are speculations that the word comes from the KKK organization that referred to Black women who were forced into prostitution (i.e. Sarah Baartman: Hottentot).',
    'ugly':'Word used to put down someone for the way they look, can be connected back to white supremacist, ableist, sizeist standards of beauty.',
    'you guys':'Erases the identities of people who are in the room. Generalizing a group of people to be masculine.',
    'I\'m being such a fat-ass':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.',
    'I\'m being so fat right now!':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.'
}

hurtfulWords = list(words_that_hurt.keys())

In [249]:
#Binary Feature #6 1) ID tweets with female pronouns 2) Check if these words are in the tweet 

#these words are used disproportionately often against women
#the behaviour they describe often goes unremarked in men.
#source: http://sacraparental.com/2016/05/14/everyday-misogyny-122-subtly-sexist-words-women/
#EVERYDAY MISOGYNY: 122 SUBTLY SEXIST WORDS ABOUT WOMEN (AND WHAT TO DO ABOUT THEM)
female_and_nongender_Pronouns = set(['you','she','its','their','yours',
                                    'her', 'it', 'they', 'them',
                                    'yourself', 'herself', 'themselves',
                                    'your','hers'])

pronouns = {'I': ('personal', True, 'first'),
 'me': ('personal', True, 'first'),
 'we': ('personal', False, 'first'),
 'us': ('personal', False, 'first'),
 'you': ('personal', False, 'second'),
 'she': ('personal', True, 'third'),
 'he': ('personal', True, 'third'),
 'her': ('possessive', True, 'third'),
 'him': ('personal', True, 'third'),
 'it': ('personal', True, 'third'),
 'they': ('personal', False, 'third'),
 'them': ('personal', False, 'third'),
 'myself': ('reflexive', True, 'first'),
 'ourselves': ('reflexive', False, 'first'),
 'yourself': ('reflexive', True, 'second'),
 'yourselves': ('reflexive', False, 'second'),
 'himself': ('reflexive', True, 'third'),
 'herself': ('reflexive', True, 'third'),
 'itself': ('reflexive', True, 'third'),
 'themselves': ('reflexive', False, 'third'),'my': ('possessive', True, 'first'),
 'your': ('possessive', False, 'second'),
 'his': ('possessive', True, 'third'),
 'hers': ('possessive', True, 'third'),
 'its': ('possessive', True, 'third'),
 'our': ('possessive', False, 'first'),
 'their': ('possessive', False, 'third'),
 'mine': ('possessive', True, 'first'),
 'yours': ('possessive', False, 'second'),
 'ours': ('possessive', False, 'first')}

female_offensive = ['bossy', 'abrasive', 'ball-buster', 'aggressive', 
'shrill', 'bolshy', 'intense', 'stroppy', 'forward', 
'mannish', 'gossipy', 'Dramatic', 'Drama Queen', 'Catty', 
'Bitchy', 'Nag', 'Cold', 'Ice queen', 'Shrew', 'Humourless',
'Man-hater', 'Banshee', 'Fishwife', 'Lippy', 'Ditzy', 'Feminazi', 
'militant feminist', 'Bridezilla', 'Diva', 'Prima donna', 'Blonde moment',
'Feisty', 'Supermum','Working mother', 'Career woman', 'Yummy mummy', 'Little old lady', 
'WAHM', 'Slut', 'Trollop','Frigid','Easy','Tease','Loose','Man-eater','Cougar',
'Asking for it','prude','the town bike', 'Mutton dressed as lamb','Slutty','Curvy','Mumsy',
'Cheap','That dress is flattering','Frumpy','Let herself go','Faded beauty','Mousey',
 'Plus-size','Clotheshorse','Brunette ','Ladylike','Bubbly','Vivacious','Flirty',
'Sassy','Chatty','Demure','Modest','Emotional','Hysterical','Hormonal',
'Menstrual ',' pre-menstrual ','Flaky','Moody','Over-sensitive',
'Clucky','Neurotic','Irrational','Baby brain','Baby weight','Mummy blogger',
'Female engineer','That’s good, for a girl','Like a girl','run like a girl', 
'throw like a girl','Mumpreneur','Spinster','Barren','She wears the pants','Housewife',
'Houseproud','Soccer mom','Mistress','Kept woman','Incompetent cervix',
'Failure to progress','Elderly primagravida','Irritable uterus','Tomboy',
'Girly','a girly girl','Little lady','Jail-bait','Heart-breaker','pretty little thing','Catfight','Mommy wars','Caring','Compassionate','Hard-working',
'Conscientious','Dependable','Diligent','Dedicated','Tactful','Interpersonal','Warm',
'Helpful','Maternal', 'Princess', 'Heart-breaker']
#most tweeted to Megyn Kelly by Trump and trump supporters
#https://www.vox.com/2016/1/27/10852876/donald-trump-supporters-sexist-tweets-megyn-kelly
trump_suppporters_megynKelly = ["ugly", "cheap", 'bitch', 'whore', 'bimbo',
                                'cunt', 'hooker', 'slut', 'skank']
others = ['hoe', 'pussy', 'bitches', 'fatty', 'fatass', 'fat-ass']
offsensive_words_toward_women = female_offensive + trump_suppporters_megynKelly + others + hurtfulWords

In [250]:
female_offensive_words = set()
for word in offsensive_words_toward_women:
    female_offensive_words.add(word.lower())
#female_offensive_words

def check_offensive_to_women(text):
    #split tweet by white space and make lower case
    li = set([word.lower() for word in text.split()]) 
    isFemale = female_and_nongender_Pronouns.intersection(li)
    if len(isFemale) == 0:
        return 0
    isOffensive = female_offensive_words.intersection(li)
    if isOffensive:
        return len(isOffensive)
    return 0

NRC emotions

In [251]:
nrc_emotions_df = pd.read_csv("nrc_emotions.csv")

In [252]:
anger = nrc_emotions_df.loc[nrc_emotions_df['anger']][['term']].values
anticipation = nrc_emotions_df.loc[nrc_emotions_df['anticipation']][['term']].values
disgust = nrc_emotions_df.loc[nrc_emotions_df['disgust']][['term']].values
fear = nrc_emotions_df.loc[nrc_emotions_df['fear']][['term']].values
joy = nrc_emotions_df.loc[nrc_emotions_df['joy']][['term']].values
sadness = nrc_emotions_df.loc[nrc_emotions_df['sadness']][['term']].values
surprise = nrc_emotions_df.loc[nrc_emotions_df['surprise']][['term']].values
trust = nrc_emotions_df.loc[nrc_emotions_df['trust']][['term']].values

In [253]:
def anger_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anger else 0, s))
    return score

In [254]:
def anticipation_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anticipation else 0, s))
    return score

In [255]:
def disgust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in disgust else 0, s))
    return score

In [256]:
def joy_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in joy else 0, s))
    return score

In [257]:
def fear_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in fear else 0, s))
    return score

In [258]:
def sadness_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in sadness else 0, s))
    return score

In [259]:
def surprise_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in surprise else 0, s))
    return score

In [260]:
def trust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in trust else 0, s))
    return score

In [261]:
#groups = open('groups.txt','r').read().split('\n')
ethnic_groups = []
with open('ethnic_groups_and_common_slurs.txt', 'r') as fileinput:
    for line in fileinput:
        ethnic_groups.append((line.split('\n'))[0].lower())
#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this', ]
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want']

In [262]:
#If tweet contains a targeted statement referring to a certain group, i.e. "all you Asians" or "every Mexican"
#also checks if a group word is followed by some sort of modal verb

def contains_target(text):
    s = tknzr.tokenize(text)
    
    for i in range(len(s)):
        if s[i].lower() in targets:
            if i != len(s)-1:
                if s[i+1].lower() in ethnic_groups:
                    return 1
            
        elif s[i].lower() in ethnic_groups:
            if i != len(s)-1:
                if s[i+1].lower() in modality:
                    return 1            
        return 0
    

In [266]:
def other_features_base(tweet):
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features
    
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    #sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    #Our features
    text_only = preprocess(tweet) #Get text only
    words = remove_slang(text_only) #replace slang/abbreviations with full words 
    
    senti = slang_sentiment(text_only)
    pos = positive(tweet, 0)
    neg = negative(tweet, 1)
    obj = objective(tweet, 2)
    
    no_slang_str = ''.join(words)
    trustCount = trust_count(no_slang_str)
    surpriseCount = surprise_count(no_slang_str)
    sadnessCount = sadness_count(no_slang_str)
    fearCount = fear_count(no_slang_str)
    joyCount = joy_count(no_slang_str)
    disgustCount = disgust_count(no_slang_str)
    anticipationCount = anticipation_count(no_slang_str)
    angerCount = anger_count(no_slang_str)
    isSelfReferential = contains_target_self_referential(no_slang_str)
    hasQuotes = contains_quotes(tweet)
    targeted = contains_target(text_only)
    immigrant_ref = 0
    if text_only.find('immigrant') or text_only.find('immigrants'):
        immigrant_ref = 1
    isOffensiveToWomen = check_offensive_to_women(tweet)
    
    #Davidson features
    syllables = textstat.syllable_count(text_only)
    num_chars = sum(len(w) for w in text_only)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(tweet.split())
    #avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(text_only.split()))
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
        
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, targeted, immigrant_ref, isOffensiveToWomen,
                trustCount, surpriseCount, sadnessCount, angerCount, fearCount, 
                joyCount, disgustCount, anticipationCount, isSelfReferential, hasQuotes, pos, neg, obj]
   
    return features

def get_feature_array(tweets, base):
    feats=[]
    for t in tweets:
        if base:
            feats.append(other_features_base(t))
        else:
            feats.append(other_features(t))
    return np.array(feats)

In [264]:
other_features_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet", "targeted", "immigrant_ref", "isOffensiveToWomen",
                "trustCount", "surpriseCount", "sadnessCount", "angerCount", "fearCount", 
              "joyCount", "disgustCount", "anticipationCount", "isSelfReferential", "hasQuotes", "pos", "neg", "obj"]

other_features_base_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet"]

In [268]:
base_feats = get_feature_array(tweets, True)

In [269]:
hand_built_feats = get_feature_array(tweets, False)

## Flair

In [None]:
from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, CharacterEmbeddings, StackedEmbeddings, FlairEmbeddings, BertEmbeddings
import torch

In [None]:
#stack word-level twitter embeddings and forward/backward flair sentence embeddings
news_forward = FlairEmbeddings('news-forward-fast')
news_backward = FlairEmbeddings('news-backward-fast')
twitter = WordEmbeddings('twitter')
bert = BertEmbeddings('bert-base-uncased')
#elmo = ELMoEmbeddings('small')

In [None]:
from flair.data import Sentence

Create embedding for tweets by getting token-level embeddings from stacked embedding

In [None]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
        
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [296]:
def embed_tweets(model, row_len, tweets):
    embeddings = np.empty([len(tweets), row_len])
    num = 0
    total = len(tweets)
    for tweet in tweets:
        s = Sentence(tweet)
        model.embed(s)
        flattened = np.array(s.get_embedding().detach()).flatten()
        #print(flattened.shape[0])
        embeddings[num] = (flattened)
        num+=1
        update_progress(num / total)

    update_progress(1)
    return embeddings

In [21]:
#tweet_embed = DocumentPoolEmbeddings([twitter])

In [270]:
#%%time
#tweet_embeddings = embed_tweets(tweet_embed,100)
#np.savetxt('tweet_embeddings.txt', tweet_embeddings)

tweet_embeddings = np.loadtxt('tweet_embeddings.txt')

In [8]:
#bert_embed = DocumentPoolEmbeddings([bert])

In [368]:
#%%time
#bert_embeddings = embed_tweets(bert_embed, 3072)
#np.savetxt('bert_embeddings.txt', bert_embeddings)

#bert_embeddings = np.loadtxt('bert_embeddings.txt')

In [380]:
bert_news_twitter_embed = DocumentPoolEmbeddings([news_forward, news_backward, bert, twitter])

In [381]:
#bert_news_twitter_embeddings = embed_tweets(bert_news_twitter_embed, 5220)
#np.savetxt('bert_news_twitter_embeddings.txt', bert_news_twitter_embeddings)

bert_news_twitter_embeddings = np.loadtxt('bert_news_twitter_embeddings.txt')

## Train Models with features

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

Flair embeddings

In [407]:
#Now join them all up
M1 = np.concatenate([tfidf,pos,base_feats,bert_news_twitter_embeddings],axis=1)

In [271]:
#M2 = np.concatenate([tfidf,pos,feats,bert_embeddings],axis=1)
M3 = np.concatenate([tfidf,pos,base_feats,tweet_embeddings],axis=1)

Hand-built features

In [272]:
M4 = np.concatenate([tfidf,pos,hand_built_feats],axis=1)

Combine hand-built and twitter embeddings

In [277]:
M5 = np.concatenate([tfidf,pos,hand_built_feats,tweet_embeddings],axis=1)

In [395]:
M6 = np.concatenate([tfidf,pos,hand_built_feats,bert_news_twitter_embeddings],axis=1)

## Train 

In [391]:
def train_model(M):
    X = pd.DataFrame(M)
    y = df['class'].astype(int)
    pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])
    param_grid = [{}] # Optionally add parameters here
    grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv=StratifiedKFold(n_splits=5, 
                                              random_state=42).split(X, y), 
                           verbose=2)
    model = grid_search.fit(X, y)
    return model

In [392]:
tweet_only_LR = train_model(M3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  26.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.6s remaining:    0.0s


[CV] ................................................. , total=  12.4s
[CV]  ................................................................
[CV] ................................................. , total=  11.4s
[CV]  ................................................................
[CV] ................................................. , total=  13.2s
[CV]  ................................................................
[CV] ................................................. , total=  11.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


In [None]:
# np.savetxt("tweet_only_true.txt", tweet_only_true)
# np.savetxt("tweet_only_pred.txt", tweet_only_pred)

In [None]:
#bert_only_true, bert_only_pred, bert_only_model = train_model(M2)

In [None]:
# np.savetxt("bert_only_true.txt", bert_only_true)
# np.savetxt("bert_only_pred.txt", bert_only_pred)

In [None]:
bert_news_twitter_model = train_model(M1)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=29.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 29.2min remaining:    0.0s


[CV]  ................................................................
[CV] ................................................. , total=46.6min
[CV]  ................................................................
[CV] ................................................. , total=16.8min
[CV]  ................................................................
[CV] ................................................. , total=26.3min
[CV]  ................................................................
[CV] ................................................. , total=28.0min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 147.2min finished


In [None]:
# np.savetxt("bert_news_twitter_true.txt", bert_news_twitter_true)
# np.savetxt("bert_news_twitter_pred.txt", bert_news_twitter_pred)

In [393]:
hand_built_model = train_model(M4)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  16.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.5s remaining:    0.0s


[CV] ................................................. , total=  10.5s
[CV]  ................................................................
[CV] ................................................. , total=   9.3s
[CV]  ................................................................
[CV] ................................................. , total=   9.0s
[CV]  ................................................................
[CV] ................................................. , total=  10.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   56.2s finished


In [None]:
# np.savetxt("hand_built_true.txt", hand_built_true)
# np.savetxt("hand_built_pred.txt", hand_built_pred)

In [394]:
combined_model = train_model(M5)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  41.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.9s remaining:    0.0s


[CV] ................................................. , total=  12.6s
[CV]  ................................................................
[CV] ................................................. , total=  11.7s
[CV]  ................................................................
[CV] ................................................. , total=  10.5s
[CV]  ................................................................
[CV] ................................................. , total=  13.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.5min finished


In [420]:
bnt_hb_combined_model = train_model(M6)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=34.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 34.3min remaining:    0.0s


[CV]  ................................................................
[CV] ................................................. , total=25.8min
[CV]  ................................................................
[CV] ................................................. , total=41.0min
[CV]  ................................................................
[CV] ................................................. , total=20.9min
[CV]  ................................................................
[CV] ................................................. , total=29.7min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 151.8min finished


## Training Evaluation

In [50]:
all_tweets = df[['tweet', 'class']]

In [451]:
def evaluate(y_true, y_preds, tweet):
    report = classification_report( y_true, y_preds )
    print(report)
    
    misses = np.where(np.asarray(y_true) != y_preds)
    missed_preds = []
    for i in range(len(y_true)):
        if np.asarray(y_true)[i] != y_preds[i]:
            missed_preds.append(y_preds[i])


    missed = [list(y_true.index)[i] for i in misses[0]]
    missed_tweets = tweet.iloc[missed]
    missed_tweets.loc[:,'prediction'] = missed_preds
    
    corrects = np.where(np.asarray(y_true) == y_preds)
    correct_preds = []
    for i in range(len(y_true)):
        if np.asarray(y_true)[i] == y_preds[i]:
            correct_preds.append(y_preds[i])


    correct = [list(y_true.index)[i] for i in corrects[0]]
    correct_tweets = tweet.iloc[correct]
    correct_tweets.loc[:,'prediction'] = correct_preds
    return missed_tweets, correct_tweets

Tweet sentence embeddings

In [431]:
tweet_missed = evaluate(tweet_only_true, tweet_only_pred, all_tweets)

              precision    recall  f1-score   support

           0       0.30      0.44      0.36       104
           1       0.94      0.88      0.91      1507
           2       0.75      0.83      0.79       364

   micro avg       0.85      0.85      0.85      1975
   macro avg       0.66      0.72      0.68      1975
weighted avg       0.87      0.85      0.86      1975



BERT embeddings

In [65]:
bert_missed = evaluate(bert_only_true, bert_only_pred, all_tweets)

              precision    recall  f1-score   support

           0       0.31      0.44      0.37       104
           1       0.94      0.89      0.91      1507
           2       0.77      0.84      0.80       364

   micro avg       0.86      0.86      0.86      1975
   macro avg       0.67      0.72      0.69      1975
weighted avg       0.87      0.86      0.87      1975



Combined BERT, news, and tweet embeddings

In [66]:
bert_news_twitter_missed = evaluate(bert_news_twitter_true, bert_news_twitter_pred)

              precision    recall  f1-score   support

           0       0.34      0.45      0.39       104
           1       0.94      0.89      0.92      1507
           2       0.78      0.86      0.82       364

   micro avg       0.86      0.86      0.86      1975
   macro avg       0.69      0.73      0.71      1975
weighted avg       0.88      0.86      0.87      1975



In [215]:
hand_built_missed = evaluate(hand_built_true, hand_built_pred)

              precision    recall  f1-score   support

           0       0.27      0.41      0.33       104
           1       0.92      0.86      0.89      1507
           2       0.68      0.77      0.72       364

   micro avg       0.82      0.82      0.82      1975
   macro avg       0.62      0.68      0.65      1975
weighted avg       0.84      0.82      0.83      1975



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [None]:
both_missed = evaluate(combined_true, combined_pred)

## Run on test set

In [309]:
testing = pd.read_csv("data/test/testing_data.csv")
dev = pd.read_csv("data/dev/development_data.csv") #dev wasn't used in training
test = pd.concat([testing, dev], sort=False)

In [310]:
y_test = test['class'].astype(int)

In [311]:
test_tweets = test['tweet']

In [345]:
#use transform instead of fit_transform to get vector in same space as training data
test_tfidf = vectorizer.transform(test_tweets).toarray() 

In [346]:
test_tweet_tags = []
for t in test_tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    test_tweet_tags.append(tag_str)

In [347]:
test_pos = pos_vectorizer.transform(pd.Series(test_tweet_tags)).toarray()

In [319]:
base_test_feats = get_feature_array(test_tweets, True)

In [320]:
hand_built_test_feats = get_feature_array(test_tweets, False)

Test tweet only embeddings

In [295]:
tweet_embed = DocumentPoolEmbeddings([twitter])

In [297]:
test_tweet_embeddings = embed_tweets(tweet_embed,100, test_tweets)

Progress: [####################] 100.0%


In [360]:
test_M3 = np.concatenate([test_tfidf,test_pos,base_test_feats,test_tweet_embeddings],axis=1)

In [362]:
tweet_only_preds = tweet_only_LR.predict(test_M3)

Test hand-built features

In [350]:
test_M4 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats],axis=1)

In [353]:
hand_built_preds = hand_built_model.predict(test_M4)

Test combined hand-built and tweet embeddings

In [354]:
test_M5 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats,test_tweet_embeddings],axis=1)

In [356]:
combined_preds = combined_model.predict(test_M5)

Bert, news, and twitter embeddings

In [None]:
test_bert_news_twitter_embeddings = embed_tweets(bert_news_twitter_embed, 5220, test_tweets)

Progress: [################----] 78.6%


In [409]:
#test_bert_news_twitter_embeddings.shape, bert_news_twitter_embeddings.shape,

In [410]:
#test_tfidf.shape, tfidf.shape,test_pos.shape, pos.shape,base_test_feats.shape, base_feats.shape

In [401]:
test_M1 = np.concatenate([test_tfidf,test_pos,base_test_feats,test_bert_news_twitter_embeddings],axis=1)

In [408]:
test_M1.shape, M1.shape

((4937, 9046), (19746, 9046))

In [414]:
bert_news_twitter_preds = bert_news_twitter_model.predict(test_M1)

Bert, news, and twitter embeddings with hand-built features

In [421]:
test_M6 = np.concatenate([test_tfidf,test_pos,hand_built_test_feats,test_bert_news_twitter_embeddings ],axis=1)

In [423]:
bnt_hb_combined_preds = bnt_hb_combined_model.predict(test_M6)

## Test Evaluation

In [433]:
testing_tweets = test[['tweet','class']]

In [452]:
test_tweet_only_missed, test_tweet_only_correct = evaluate(y_test, tweet_only_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.25      0.35      0.29       265
           1       0.92      0.87      0.90      3875
           2       0.68      0.75      0.71       797

   micro avg       0.83      0.83      0.83      4937
   macro avg       0.62      0.66      0.63      4937
weighted avg       0.84      0.83      0.83      4937



In [454]:
test_hand_built_missed, test_hand_built_correct = evaluate(y_test, hand_built_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.24      0.36      0.29       265
           1       0.91      0.86      0.89      3875
           2       0.65      0.71      0.68       797

   micro avg       0.81      0.81      0.81      4937
   macro avg       0.60      0.64      0.62      4937
weighted avg       0.83      0.81      0.82      4937



In [455]:
test_combined_missed, test_combined_correct = evaluate(y_test, combined_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.27      0.38      0.32       265
           1       0.92      0.87      0.90      3875
           2       0.68      0.75      0.71       797

   micro avg       0.83      0.83      0.83      4937
   macro avg       0.62      0.67      0.64      4937
weighted avg       0.85      0.83      0.84      4937



In [456]:
test_bert_news_twitter_missed, test_bert_news_twitter_correct = evaluate(y_test, bert_news_twitter_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.31      0.47      0.38       265
           1       0.95      0.88      0.91      3875
           2       0.74      0.85      0.79       797

   micro avg       0.86      0.86      0.86      4937
   macro avg       0.67      0.74      0.69      4937
weighted avg       0.88      0.86      0.87      4937



In [457]:
test_bnt_hb_combined_missed, test_bnt_hb_combined_correct = evaluate(y_test, bnt_hb_combined_preds, testing_tweets)

              precision    recall  f1-score   support

           0       0.31      0.47      0.38       265
           1       0.95      0.89      0.92      3875
           2       0.74      0.84      0.79       797

   micro avg       0.86      0.86      0.86      4937
   macro avg       0.67      0.73      0.69      4937
weighted avg       0.88      0.86      0.87      4937



In [377]:
combined_missed.to_csv("test_tweet_hand_combined_missed.csv", sep='\t')

In [378]:
test_hand_built_missed.to_csv("test_hand_built_missed.csv", sep='\t')

In [379]:
test_tweet_only_missed.to_csv("test_tweet_only_missed.csv", sep='\t')

In [419]:
test_bert_news_twitter_missed.to_csv("test_bert_news_twitter_missed.csv", sep='\t')

In [425]:
test_bnt_hb_combined_missed.to_csv("test_bnt_hb_combined_missed.csv", sep='\t')

In [460]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 700)

In [510]:
test_bnt_hb_combined_correct_hate = test_bnt_hb_combined_correct.loc[test_bnt_hb_combined_correct['class']==2][['tweet']]


In [548]:
test_bnt_hb_combined_correct_hate.columns

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [511]:
test_bnt_hb_combined_correct_hate

Unnamed: 0,tweet
18,RT @DamnFoodPorn: Oreo Cookie Pancakes #FoodPorn http://t.co/q6EvCvM3sO
35,Fairy Tale Hat - free crochet pattern for child and adult at http://t.co/0lRaBLMHB2 http://t.co/99zVbmYVe9
58,"RT @daraobriain: Tommy Voeckler keeps the yellow, Andy Schleck does the break of the tour, and Contador is broken. Incredible day for #tdf"
63,Watching #Steelers colts getting blinded by the Steelers uniys look like bunch of bumblebees
65,RT @netflix: Here's your first look at Charlie Cox as Matt Murdock in #Marvel's @Daredevil on #Netflix #NYCC http://t.co/SIYuCgok8J
67,It's a matter of pure weight ratios... A five ounce bird can not carry a one pound coconut!
78,Ravioli stuffed wit lobster nd a nice stuffed clam #eating betterthanamobster http://t.co/PzdoESaR
80,@AB_crispyy_andy I laugh Oreos and lady gaga on a Sunday day
100,#Yankees Damn. Well Joe that move to the bullpen really helped.
103,Next time you leave magic trash in my jacket try not to have your initials written on them. @whatupag http://t.co/tFV1cZtr


In [476]:
test_bnt_hb_combined_missed_hate = test_bnt_hb_combined_missed.loc[test_bnt_hb_combined_missed['class']==2][['tweet']]


In [477]:
#test_bnt_hb_combined_missed.to_csv("test_bnt_hb_combined_missed.csv", sep='\t')

In [478]:
#test_bnt_hb_combined_correct.to_csv("test_bnt_hb_combined_correct.csv", sep='\t')

In [479]:
#test_hand_built_missed.to_csv("test_hand_built_missed.csv", sep='\t')

In [480]:
#test_hand_built_correct.to_csv("test_hand_built_correct.csv", sep='\t')

## Compare with baseline

In [549]:
test_baseline_correct = pd.read_csv('test_baseline_correct.csv', sep='\t')[['tweet','class','prediction']]
test_baseline_correct_hate = test_baseline_correct[test_baseline_correct['class']==2][['tweet']]


In [516]:
test_baseline_missed = pd.read_csv('test_baseline_missed.csv', sep='\t')[['tweet','class','prediction']]


In [527]:
len(test_bnt_hb_combined_correct_hate), len(test_baseline_correct_hate)

(688, 665)

In [None]:
df_final

In [507]:
m = test_bnt_hb_combined_correct_hate.merge(test_baseline_correct, on=cols, how='outer', suffixes=['', '_'], indicator=True)

In [552]:
m

Unnamed: 0,tweet
228,@XxminijokerXx I HATE birds! They just piss me off about the fact that I can't fly!
435,Nothing good on TV and the @Yankees are eating their balls 3 innings in. Great.
1547,@TheMayorMatt has to be #live version ! #pulledpork #clams #babyjesus #lilsmokey
2288,i made sweet &amp; SPICY drunken noodles with handmade and handcut ho fun noodles with marinated chicken/shrimp. my back is hurting.
474,RT @iLLmak3Ufamous: Touch my swag. Wish you could. RT &#8220;@VizyIsIgnant: I LOOK FLY. I LOOK GOOD. RT @Wisdom_Stature: On my momma.... On my ho&#8230;
1134,@goldnsilvercoin the worst terrorists are republican teabaggers...
2252,I hate Hashtag activism. No lie. I think 95% of it is trash.


In [553]:
test_bnt_hb_combined_missed

Unnamed: 0,tweet,class,prediction
5,I take full pride in the ass my momma has so blessed me with. Now hers is just ridiculous on all levels. Her yellow ass got back for years,1,2
9,@ZaneIsClasker the fuck r u talking about I told u once and am going to tell u against nigga I love white bitch gtf bro,0,1
12,Damn rednecks outside shooting guns keep waking my kids up. No sleep for us tonight. Thanks #2A! Work at 8am is going to be so great...,1,2
16,RT @jaylynnkoliba: @TonyJRodriguez imma get my crew on dis fuckin fag #3hunnad real talk nigga #turn up,1,0
19,Blood this nigga Dion retarded,1,0
33,Fucking trash @carmeloanthony,1,0
36,RT @CHlLDHOODRUINER: who else hated this bitch nigga ?? http://t.co/8jwGmV4Yet,0,1
42,I'm not really a phone kinda guy.. I actually hate talking on the phone &amp; texting kinda trash to me also.,2,1
46,"RT @vodkapapixo: ""@Weed_Cloudz: ""6 God"" by Drake is trash"" thank you",1,2
47,&#8220;@rihanna: We raided Nicki's wig closet for the summer! Bad gals just wanna have fun! http://t.co/0paNscImEw&#8221; this bitch has the a nerve smh,2,1
