In [103]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [104]:
df = pd.read_csv("data/train/training_data.csv")

In [105]:
tweets=df.tweet

## Davidson Feature Generation

In [106]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = tweet.split() #[stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
    )

In [107]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [108]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [21]:
#tweet_tags

## Preprocess for slang


In [19]:
# loading the emoji dataset
def load_slang_dict():
    slang_dict = {}
    with open("slang_to_words.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('\t')
            #print(tokens[1])
            slang_dict[tokens[1]] = tokens[0]
    return slang_dict
slang_dict = load_slang_dict()
slang_dict

{'hundo p': '100% certain',
 'trill': 'real true',
 'otp': 'One True Pairing',
 'distractivated': 'Distracted in a way that motivates/inspires',
 'JOMO': 'Joy of Missing Out',
 'tache': 'mustache',
 'ngl': 'Not gonna lie',
 'sus': 'Suspect',
 'AFAIK': 'As Far As I Know',
 'AF': '...as fuck',
 'AFK': 'Away From Keyboard',
 'AIR': 'Am I Right',
 '<3': 'Love',
 'CFATH': 'Chuckling For All to Hear',
 'THNX': 'Thanks',
 'HAND': 'Have A Nice Day',
 'C U': 'See You',
 'C U L8R': 'See You Later',
 'SWYP': "So What's Your Problem?",
 'TIME': 'Tears In My Eyes',
 'SWAK': 'Sealed With a Kiss',
 'hh': 'Haha',
 'CHX': 'Chicks',
 'KISS': 'Keep It Simple, Stupid',
 'SAL': 'Such A Laugh',
 'IDK': "I Don't Know",
 'NP': 'No problem',
 'IHNI': 'I have no idea',
 'JSYK': 'Just so you know',
 'IDC': "I don't care",
 'KYS': 'Kill yourself',
 'ATM': 'At The Moment',
 'WYD': 'What Are You Doing',
 'WYA': 'Where Are You At',
 'IRL': 'In Real Life',
 'SWYD': "Stop What You're Doing",
 'SNOL': 'Said Nice Out Lo

In [17]:
# loading the emoji dataset
def load_slang_two_dict():
    slang_dict_two = {}
    with open("noslangdotcom.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split(':')
            slang_dict[tokens[0]] = tokens[1]
    return slang_dict
slang_dict_two = load_slang_two_dict()
slang_dict_two

['a$$ ', 'ass']
['a&f ', 'always and forever']
["a'ight ", 'alright']
['a.i.m. ', 'aol instant messanger']
['a/l ', 'age and location']
['a/m ', 'away message']
['a/s/l ', 'age,sex,location']
['a/s/l/p ', 'age/sex/location/picture']
['a/s/l/r ', 'age, sex, location, race']
['a1t ', 'anyone there']
['a3 ', 'anyplace, anywhere, anytime']
['a4u ', 'all for you']
['aaaaa ', 'American Assosciation Against Acronym Abuse']
['aabf ', 'as a best friend']
['aaf ', 'as a friend']
['aak ', 'Alive and Kicking']
['aamof ', 'as a matter of fact']
['aatf ', 'always and totally forever']
['aatw ', 'all around the world']
['abd ', 'Already Been Done']
['abend ', 'absent by enforced net deprivation']
['abft ', 'About fucking Time']
['aboot ', 'about']
['abreev ', 'abbreviation']
['absnt ', 'absent']
['abt ', 'about']
['abwt ', 'about']
['acc ', 'account']
['acct ', 'account']
['acgaf ', "Absolutely couldn't give a fuck"]
['ack ', 'acknowledged']
['addy ', 'address']
['adhd ', 'Attention Deficit Hyperacti

['gui ', 'graphical user interface']
['gurl ', 'girl']
['gurlz ', 'girls']
['guru ', 'expert']
['gw ', 'good work']
['gwijd ', 'guess what i just did']
['gwm ', 'gay white male']
['gwork ', 'good work']
['gwrk ', 'good work']
['gws ', 'get well soon']
['gwytose ', 'go waste your time on someone else']
['gy ', 'gay']
['gyal ', 'girl']
['gypo ', 'Get Your Penis Out']
['h&k ', 'hugs and kisses']
['h*r ', 'homestar runner']
['h+k ', 'hugs and kisses']
['h.o ', 'hold on']
['h/e ', 'However']
['h/mo ', 'homo']
['h/o ', 'hold on']
['h/u ', 'hold up']
['h/w ', 'homework']
['h2 ', 'Halo 2']
['h2gtb ', 'have to go to the bathroom']
['h2o ', 'water']
['h2sys ', 'hope to see you soon']
['h3y ', 'hey']
['h4kz0r5 ', 'hackers']
['h4x ', 'Hacks']
['h4x0r ', 'hacker']
['h4xor ', 'hacker']
['h4xr ', 'hacker']
['h4xrz ', 'hackers']
['h4xx0rz ', 'hacker']
['h4xxor ', 'hacker']
['h8 ', 'hate']
['h80r ', 'hater']
['h82sit ', 'hate to say it']
['h83r ', 'hater']
['h8ed ', 'hated']
['h8er ', 'hater']
['h8r ',

['newais ', 'Anyways']
['neway ', 'anyway']
['neways ', 'anyways']
['newayz ', 'anyways']
['newb ', 'someone who is new']
['newbie ', 'new player']
['newez ', 'anyways']
['nf ', 'not funny']
['nfbsk ', 'not for british school kids']
['nfc ', 'no fucking clue']
['nfd ', 'no fucking deal']
['nff ', 'not fucking fair']
['nfg ', 'No fucking Good']
['nfi ', 'no fucking idea']
['nfr ', 'not for real']
['nfs ', 'not for sale']
['nft ', 'no further text']
['nfw ', 'no fucking way']
['ng ', 'nice game']
['ngaf ', 'nobody gives a fuck']
['ngl ', 'Not Gonna Lie']
['nh ', 'nice hand']
['nhatm ', 'not here at the moment']
['ni ', 'no idea']
['ni994 ', 'nigga']
['nib ', 'new in box']
['nic ', 'Network Interface Card']
['nif ', 'non internet friend']
['nifoc ', 'naked in front of computer']
['nifok ', 'naked in front of keyboard']
['nigysob ', "now I've got you son of a bitch"]
['nimby ', 'not in my backyard']
['nin ', 'no its not']
['nip ', 'nothing in particular']
['nips ', 'nipples']
['nite ', 'ni

['ttyw ', 'talk to you whenever']
['ttywl ', 'Talk to you way later']
['tu ', 'thank you']
['tuff ', 'tough']
['tuh ', 'to']
['tul ', 'text you later']
['tut ', 'take your time']
['tuvm ', 'thank you very much']
['tv ', 'television']
['tvm ', 'thanks very much']
['tw ', 'Teacher Watching']
['twajs ', 'That was a joke, son.']
['twat ', 'vagina']
['twbc ', 'that would be cool']
['twdah ', 'that was dumb as hell']
['twf ', 'That was funny']
['twfaf ', 'thats what friends are for']
['twg ', 'That was great']
['twi ', 'Texting While Intoxicated']
['twis ', "that's what I said"]
['twoh ', 'typing with one hand']
['tws2wa ', 'That was so 2 weeks ago']
['twss ', "That's what she said"]
['twsy ', 'That was so yeterday']
['twttr ', 'twitter']
['twvsoy ', 'that was very stupid of you']
['twyl ', 'Talk With You Later']
['twys ', 'Talk With You Soon']
['tx ', 'thanks']
['txs ', 'thanks']
['txt ', 'text']
['txting ', 'texting']
['txtyl ', 'text you later']
['ty ', 'thank you']
['tyclos ', 'turn your

IndexError: list index out of range

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def preprocess_slang(text_string):
    
    




In [153]:
words_that_hurt = {
    'bitch': 'Targets and dehumanizes women, even if used toward men, including queer and gay men. Devalues women and femininity. Reinforces sexism.',
    'ghetto' :'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'ratchett':'Describes something or someone as cheap, worn out, poor, dangerous, etc. Reference to housing communities that are impoverished and disproportionately impact people of color. Associates people of color with these negative characteristics.',
    'illegal alien': 'Reduces undocumented immigrants to something less than human. Fixates on legal status instead of people as individuals. Asserts that some people belong here more than others do. Ignores political, social, and economic factors that impact people of color.',
    'no homo': 'Stresses the speaker\'s heterosexuality, masculinity, and/or other traits to avoid being perceived as LGBTQIA. Goes to great lengths to avoid association with anything queer. Reinforces that to be LGBTQIA is bad.',
    'retarded': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'lame': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'crazy':'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'dumb': 'Targets mental, emotional and physical disabilities as objects for ridicule. Used as synonyms for "worthless," "bad," "unintelligent," "incapable," etc.',
    'that\'s so gay': 'Stigmatizes gay and queer people. Uses their identities to describe something as undesirable and bad. Replaces negative adjectives with words related to LGBTQIA identities.',
    'whore': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'ho': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'slut': 'Dismisses anyone seen as being "too" sexual, particularly sex workers, women, LGBTQI people and people of color. Perpetuates negativity toward sex itself. Regulates who is allowed to have it.',
    'Bisexuality doesn\'t really exist. People are just gay or straight.': 'This denies the fluidity of sexuality and dismisses people\'s experiences and definitions of self. People deserve the right to define their own identities any way they wish and have those definitions honored.',
    'i think everyone is bisexual': 'While this is often meant to acknowledge the fluidity of sexuality, it dismisses the reality of people who identify as bisexual and erases their experiences. It also invalidates the self-identifications of non-bisexual people.',
    'You\'re too femme to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'You\'re too butch to be bisexual':'Gender presentation does not indicate sexual orientation. Bisexual people have a wide range of gender presentations.',
    'Bisexual people just want straight privilege':'Bisexual people experience discrimination within straight communities and lesbian/gay communities. They never fully experience straight privilege because they do not identify as straight. Often their identities are made invisible and denied.',
    'Bisexual people are just greedy and want to have sex with everyone.':'This stereotypes bisexual people and assumes they are all promiscuous - and that this is a bad thing. It creates negative attitudes toward sex and works against creating a sex positive climate. It also demonstrates an underlying belief that bisexuality is only about behavior and is not a legitimate identity.',
    'Who do you see yourself ending up with?':'This is another way of implying one has to "end up" gay or straight and ignores bisexuality as an identity versus a relationship status. It also assumes everyone desires to be in a long-term monogamous relationship.',
    'Tranny':'Whether or not someone identifies as trans*, calling anyone "tranny" is extremely offensive. While some folks within the trans* community may choose to reclaim this word for themselves, it is not a word that is okay to use to label another person or use as a joke.',
    'That person doesn\'t really look like a woman':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'That person doesn\'t really look like a man':'What does it mean to look like a man or woman? There are no set criteria. It also should not be assumed that all Trans Men strive to fit within dominant ideas of masculinity or all Trans Women strive to fit within dominant ideas of femininity, or that all Trans* people want to look like men or women. Gender presentation is fluid and distinct from gender identity, and all forms of gender expression deserve affirmation.',
    'What is your REAL name? I mean the one you were given at birth':'This implies that the person\'s gender identity and chosen name are not "real" and perpetuates the idea of Trans people as deceptive. It removes agency and any right to make decisions for themselves, and is incredibly invalidating. It presumes a right to intimate information, disregards privacy, and places Trans lives on public display.',
    'He-She':'This hyphenated term is demeaning and invalidates an individual\'s identity and the pronouns that they use.',
    'What are you REALLY? Have you had surgery?': 'Asking anyone personal questions about their bodies and/or surgeries is invasive and inappropriate. We don\'t ask cisgender people about what is under their clothes; we shouldn\'t ask Trans* people either.',
    'cunt':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'twat':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'pussy':'Using words that refer to people with vaginas to express that someone is weak or emotional. Dehumanizes womxn and perpetuates misogyny and sexism.',
    'thot':'Word created to express womxn or people who are sexually promiscuous. There are speculations that the word comes from the KKK organization that referred to Black women who were forced into prostitution (i.e. Sarah Baartman: Hottentot).',
    'ugly':'Word used to put down someone for the way they look, can be connected back to white supremacist, ableist, sizeist standards of beauty.',
    'you guys':'Erases the identities of people who are in the room. Generalizing a group of people to be masculine.',
    'I\'m being such a fat-ass':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.',
    'I\'m being so fat right now!':'Demeans and devalues fatness/fat bodies, reinforces harmful assumptions that fat people are gluttonous and are fat because they have no restraint around food. Also implies that there is an acceptable amount of food to eat and anything more is disgusting, or that enjoying food too much is disgusting.'
}

hurtfulWords = list(words_that_hurt.keys())


In [70]:
#Binary Feature #6 1) ID tweets with female pronouns 2) Check if these words are in the tweet 

#these words are used disproportionately often against women
#the behaviour they describe often goes unremarked in men.
#source: http://sacraparental.com/2016/05/14/everyday-misogyny-122-subtly-sexist-words-women/
#EVERYDAY MISOGYNY: 122 SUBTLY SEXIST WORDS ABOUT WOMEN (AND WHAT TO DO ABOUT THEM)
female_and_nongender_Pronouns = set(['you','she','its','their','yours',
                                    'her', 'it', 'they', 'them',
                                    'yourself', 'herself', 'themselves',
                                    'your','hers'])

pronouns = {'I': ('personal', True, 'first'),
 'me': ('personal', True, 'first'),
 'we': ('personal', False, 'first'),
 'us': ('personal', False, 'first'),
 'you': ('personal', False, 'second'),
 'she': ('personal', True, 'third'),
 'he': ('personal', True, 'third'),
 'her': ('possessive', True, 'third'),
 'him': ('personal', True, 'third'),
 'it': ('personal', True, 'third'),
 'they': ('personal', False, 'third'),
 'them': ('personal', False, 'third'),
 'myself': ('reflexive', True, 'first'),
 'ourselves': ('reflexive', False, 'first'),
 'yourself': ('reflexive', True, 'second'),
 'yourselves': ('reflexive', False, 'second'),
 'himself': ('reflexive', True, 'third'),
 'herself': ('reflexive', True, 'third'),
 'itself': ('reflexive', True, 'third'),
 'themselves': ('reflexive', False, 'third'),
 'my': ('possessive', True, 'first'),
 'your': ('possessive', False, 'second'),
 'his': ('possessive', True, 'third'),
 'hers': ('possessive', True, 'third'),
 'its': ('possessive', True, 'third'),
 'our': ('possessive', False, 'first'),
 'their': ('possessive', False, 'third'),
 'mine': ('possessive', True, 'first'),
 'yours': ('possessive', False, 'second'),
 'ours': ('possessive', False, 'first')}

female_offensive = ['bossy', 'abrasive', 'ball-buster', 'aggressive', 
'shrill', 'bolshy', 'intense', 'stroppy', 'forward', 
'mannish', 'gossipy', 'Dramatic', 'Drama Queen', 'Catty', 
'Bitchy', 'Nag', 'Cold', 'Ice queen', 'Shrew', 'Humourless',
'Man-hater', 'Banshee', 'Fishwife', 'Lippy', 'Ditzy', 'Feminazi', 
'militant feminist', 'Bridezilla', 'Diva', 'Prima donna', 'Blonde moment',
'Feisty', 'Supermum','Working mother', 'Career woman', 'Yummy mummy', 'Little old lady', 
'WAHM', 'Slut', 'Trollop','Frigid','Easy','Tease','Loose','Man-eater','Cougar',
'Asking for it','prude','the town bike', 'Mutton dressed as lamb','Slutty','Curvy','Mumsy',
'Cheap','That dress is flattering','Frumpy','Let herself go','Faded beauty','Mousey',
 'Plus-size','Clotheshorse','Brunette ','Ladylike','Bubbly','Vivacious','Flirty',
'Sassy','Chatty','Demure','Modest','Emotional','Hysterical','Hormonal',
'Menstrual ',' pre-menstrual ','Flaky','Moody','Over-sensitive',
'Clucky','Neurotic','Irrational','Baby brain','Baby weight','Mummy blogger',
'Female engineer','That’s good, for a girl','Like a girl','run like a girl', 
'throw like a girl','Mumpreneur','Spinster','Barren','She wears the pants','Housewife',
'Houseproud','Soccer mom','Mistress','Kept woman','Incompetent cervix',
'Failure to progress','Elderly primagravida','Irritable uterus','Tomboy',
'Girly','a girly girl','Little lady','Jail-bait','Heart-breaker',
'pretty little thing','Catfight','Mommy wars','Caring','Compassionate','Hard-working',
'Conscientious','Dependable','Diligent','Dedicated','Tactful','Interpersonal','Warm',
'Helpful','Maternal', 'Princess', 'Heart-breaker']
#most tweeted to Megyn Kelly by Trump and trump supperters
#https://www.vox.com/2016/1/27/10852876/donald-trump-supporters-sexist-tweets-megyn-kelly
trump_suppporters_megynKelly = ["ugly", "cheap", 'bitch', 'whore', 'bimbo',
                                'cunt', 'hooker', 'slut', 'skank']
others = ['hoe', 'pussy', 'bitches', 'fatty', 'fatass', 'fat-ass']
offsensive_words_toward_women = female_offensive + trump_suppporters_megynKelly + others + hurtfulWords

In [134]:
female_offensive_words = set()
for word in offsensive_words_toward_women:
    female_offensive_words.add(word.lower())
#female_offensive_words

def check_offensive_to_women(text):
    #split tweet by white space and make lower case
    li = set([word.lower() for word in text.split()]) 
    isFemale = female_and_nongender_Pronouns.intersection(li)
    if len(isFemale) == 0:
        return 0
    isOffensive = female_offensive_words.intersection(li)
    if isOffensive:
        return len(isOffensive)
    return 0
    
#checkOffensive = check_offensive_to_women("She is so hoe bossy")
#checkOffensive

2

In [2]:
import pandas as pd
nrc_emotions_df = pd.read_csv("nrc_emotions.csv") 

## NRC emotions 
Read in nrc_emotions_df as list values

In [4]:
anger = nrc_emotions_df.loc[nrc_emotions_df['anger']][['term']].values
anticipation = nrc_emotions_df.loc[nrc_emotions_df['anticipation']][['term']].values
disgust = nrc_emotions_df.loc[nrc_emotions_df['disgust']][['term']].values
fear = nrc_emotions_df.loc[nrc_emotions_df['fear']][['term']].values
joy = nrc_emotions_df.loc[nrc_emotions_df['joy']][['term']].values
sadness = nrc_emotions_df.loc[nrc_emotions_df['sadness']][['term']].values
surprise = nrc_emotions_df.loc[nrc_emotions_df['surprise']][['term']].values
trust = nrc_emotions_df.loc[nrc_emotions_df['trust']][['term']].values


In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def anger_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anger else 0, s))
    return score
    

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def anticipation_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in anticipation else 0, s))
    return score
    

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def disgust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in disgust else 0, s))
    return score
    

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def joy_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in joy else 0, s))
    return score
    

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def fear_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in fear else 0, s))
    return score


In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def sadness_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in sadness else 0, s))
    return score


In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def surprise_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in surprise else 0, s))
    return score


In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

def trust_count(text):
    s = tknzr.tokenize(text)
    score = sum(map(lambda word : 1 if word in trust else 0, s))
    return score


In [109]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )

In [110]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [27]:
#pos_vocab

In [111]:
def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

In [135]:

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    #sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    #avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    #Our features
    targeted = contains_target(words)
    immigrant_ref = 0
    if words.find('immigrant') or words.find('immigrants'):
        immigrant_ref = 1
    isOffensiveToWomen = check_offensive_to_women(tweet)
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    #FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    #FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
        
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, targeted, immigrant_ref, isOffensiveToWomen]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [112]:
groups = open('ethnic_groups.txt','r').read().split('\n')

#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this', ]
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want']

In [113]:
#If tweet contains a targeted statement referring to a certain group, i.e. "all you Asians" or "every Mexican"
#also checks if a group word is followed by some sort of modal verb

def contains_target(words):
    for i in range(len(words)):
        if words[i].lower() in targets:
            if words[i+1].lower() in groups:
                return 1
        if words[i].lower() in groups:
            if words[i+1].lower() in modality:
                return 1
            
    return 0
    

In [136]:
other_features_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet", "targeted", "immigrant_ref", "isOffensiveToWomen"]

In [137]:
feats = get_feature_array(tweets)

In [20]:
len(tweets)

19746

## BERT

In [None]:
#RUN FROM HERE WITHOUT ELMO AND BERT

In [117]:
all_X = tweets
all_y = df['class'].astype(int)

In [138]:
#Now join them all up
#M = np.concatenate([tfidf,pos,feats,X_elmo_train_layers],axis=1)

M = np.concatenate([tfidf,pos,feats],axis=1)

In [139]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

In [140]:
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [142]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

In [143]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])

In [144]:
param_grid = [{}] # Optionally add parameters here

In [145]:
grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv=StratifiedKFold(n_splits=5, 
                                              random_state=42).split(X_train, y_train), 
                           verbose=2)

In [146]:
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   5.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV] ................................................. , total=   5.4s
[CV]  ................................................................
[CV] ................................................. , total=   4.2s
[CV]  ................................................................
[CV] ................................................. , total=   5.7s
[CV]  ................................................................
[CV] ................................................. , total=   5.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   27.6s finished


In [147]:
y_preds = model.predict(X_test)

## Evaluation

In [148]:
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.28      0.42      0.34       104
           1       0.92      0.86      0.89      1507
           2       0.68      0.76      0.72       364

   micro avg       0.82      0.82      0.82      1975
   macro avg       0.63      0.68      0.65      1975
weighted avg       0.84      0.82      0.83      1975



In [51]:
#without isOffensiveToWomen
#              precision    recall  f1-score   support

#           0       0.29      0.43      0.35       104
#           1       0.92      0.86      0.89      1507
#           2       0.68      0.75      0.71       364

#   micro avg       0.82      0.82      0.82      1975
#   macro avg       0.63      0.68      0.65      1975
#weighted avg       0.84      0.82      0.83      1975

In [52]:
#With binary isOffensive
#with is offensiveToWomen
#             precision    recall  f1-score   support

#           0       0.28      0.42      0.34       104
#           1       0.92      0.86      0.89      1507
#           2       0.67      0.76      0.71       364

#   micro avg       0.82      0.82      0.82      1975
#   macro avg       0.62      0.68      0.64      1975
#weighted avg       0.84      0.82      0.83      1975

In [149]:
all_tweets = df[['tweet', 'class']]
misses = np.where(np.asarray(y_test) != y_preds)
missed_preds = []
for i in range(len(y_test)):
    if np.asarray(y_test)[i] != y_preds[i]:
        missed_preds.append(y_preds[i])
    

missed = [list(y_test.index)[i] for i in misses[0]]

In [150]:
missed_tweets = all_tweets.iloc[missed]

In [151]:
missed_tweets

Unnamed: 0,tweet,class
8500,@TonyO97 fuck i look like shopping at that tra...,1
2626,RT @DessantiGina: @TonyJRodriguez @WolfVanHale...,0
15306,Damn some Oreos would be so fucking clutch rig...,1
1739,Dis nicca lame,2
6956,"@DivaMonRoe2uHoE @CheefPolo hoe hoe hoe, merry...",2
7172,Gonna straight hip check the next hillbilly wh...,1
19415,They're calling it #Sandy because the wind is ...,1
5116,I really just want to kill some towel head ter...,0
10399,@operationSAFE @GaltsGirl lived there and can ...,2
7983,do they even make dresses any more that have s...,1


In [152]:
missed_tweets.loc[:,'prediction'] = missed_preds
len(missed_tweets[(missed_tweets['class'] == 2)]), len(missed_tweets[(missed_tweets['class'] == 1)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(88, 205)

In [132]:
#with isOffensiveToWomen
#(89, 214)

#without isOffensiveToWomen
#(90, 206)

#(88, 205)