In [None]:
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv("data/train/training_data.csv")

In [3]:
tweets=df.tweet

## Davidson Feature Generation

In [4]:
stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = tweet.split() #[stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords,
    use_idf=True,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.75
    )

In [5]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [6]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [34]:
#Binary Feature #6 1) ID tweets with female pronouns 2) Check if these words are in the tweet 

#these words are used disproportionately often against women
#the behaviour they describe often goes unremarked in men.
#source: http://sacraparental.com/2016/05/14/everyday-misogyny-122-subtly-sexist-words-women/
#EVERYDAY MISOGYNY: 122 SUBTLY SEXIST WORDS ABOUT WOMEN (AND WHAT TO DO ABOUT THEM)
female_and_nongender_Pronouns = set(['you','she','its','their','yours',
                                    'her', 'it', 'they', 'them',
                                    'yourself', 'herself', 'themselves',
                                    'your','hers'])

pronouns = {'I': ('personal', True, 'first'),
 'me': ('personal', True, 'first'),
 'we': ('personal', False, 'first'),
 'us': ('personal', False, 'first'),
 'you': ('personal', False, 'second'),
 'she': ('personal', True, 'third'),
 'he': ('personal', True, 'third'),
 'her': ('possessive', True, 'third'),
 'him': ('personal', True, 'third'),
 'it': ('personal', True, 'third'),
 'they': ('personal', False, 'third'),
 'them': ('personal', False, 'third'),
 'myself': ('reflexive', True, 'first'),
 'ourselves': ('reflexive', False, 'first'),
 'yourself': ('reflexive', True, 'second'),
 'yourselves': ('reflexive', False, 'second'),
 'himself': ('reflexive', True, 'third'),
 'herself': ('reflexive', True, 'third'),
 'itself': ('reflexive', True, 'third'),
 'themselves': ('reflexive', False, 'third'),
 'my': ('possessive', True, 'first'),
 'your': ('possessive', False, 'second'),
 'his': ('possessive', True, 'third'),
 'hers': ('possessive', True, 'third'),
 'its': ('possessive', True, 'third'),
 'our': ('possessive', False, 'first'),
 'their': ('possessive', False, 'third'),
 'mine': ('possessive', True, 'first'),
 'yours': ('possessive', False, 'second'),
 'ours': ('possessive', False, 'first')}

female_offensive = ['bossy', 'abrasive', 'ball-buster', 'aggressive', 
'shrill', 'bolshy', 'intense', 'stroppy', 'forward', 
'mannish', 'gossipy', 'Dramatic', 'Drama Queen', 'Catty', 
'Bitchy', 'Nag', 'Cold', 'Ice queen', 'Shrew', 'Humourless',
'Man-hater', 'Banshee', 'Fishwife', 'Lippy', 'Ditzy', 'Feminazi', 
'militant feminist', 'Bridezilla', 'Diva', 'Prima donna', 'Blonde moment',
'Feisty', 'Supermum','Working mother', 'Career woman', 'Yummy mummy', 'Little old lady', 
'WAHM', 'Slut', 'Trollop','Frigid','Easy','Tease','Loose','Man-eater','Cougar',
'Asking for it','prude','the town bike', 'Mutton dressed as lamb','Slutty','Curvy','Mumsy',
'Cheap','That dress is flattering','Frumpy','Let herself go','Faded beauty','Mousey',
 'Plus-size','Clotheshorse','Brunette ','Ladylike','Bubbly','Vivacious','Flirty',
'Sassy','Chatty','Demure','Modest','Emotional','Hysterical','Hormonal',
'Menstrual ',' pre-menstrual ','Flaky','Moody','Over-sensitive',
'Clucky','Neurotic','Irrational','Baby brain','Baby weight','Mummy blogger',
'Female engineer','That’s good, for a girl','Like a girl','run like a girl', 
'throw like a girl','Mumpreneur','Spinster','Barren','She wears the pants','Housewife',
'Houseproud','Soccer mom','Mistress','Kept woman','Incompetent cervix',
'Failure to progress','Elderly primagravida','Irritable uterus','Tomboy',
'Girly','a girly girl','Little lady','Jail-bait','Heart-breaker',
'pretty little thing','Catfight','Mommy wars','Caring','Compassionate','Hard-working',
'Conscientious','Dependable','Diligent','Dedicated','Tactful','Interpersonal','Warm',
'Helpful','Maternal', 'Princess', 'Heart-breaker']
#most tweeted to Megyn Kelly by Trump and trump supperters
#https://www.vox.com/2016/1/27/10852876/donald-trump-supporters-sexist-tweets-megyn-kelly
trump_suppporters_megynKelly = ["ugly", "cheap", 'bitch', 'whore', 'bimbo',
                                'cunt', 'hooker', 'slut', 'skank']
others = ['hoe', 'pussy']
offsensive_words_toward_women = female_offensive + trump_suppporters_megynKelly + hoe

In [35]:
female_offensive_words = set()
for word in offsensive_words_toward_women:
    female_offensive_words.add(word.lower())
#female_offensive_words

def check_offensive_to_women(text):
    #split tweet by white space and make lower case
    li = set([word.lower() for word in text.split()]) 
    print(li)
    print(female_and_nongender_Pronouns)
    isFemale = female_and_nongender_Pronouns.intersection(li)
    if len(isFemale):
        isOffensive = female_offensive_words.intersection(li)
    if isOffensive:
        return True
    return False
    
#checkOffensive = check_offensive_to_women("She is so bossy")
#checkOffensive

{'she', 'so', 'bossy', 'is'}
{'she', 'they', 'their', 'them', 'its', 'themselves', 'yours', 'it', 'your', 'you', 'yourself', 'herself', 'her', 'hers'}


True

In [7]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )

In [8]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [9]:
def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

In [10]:
groups = open('groups.txt','r').read().split('\n')

#demonstrative adjectives and other words that can inidicate targeting of a specific group
targets = ['all', 'every', 'you', 'those', 'these', 'any', 'each', 'no', 'that', 'this', ]
modality = ['should', 'can', 'can\'t', 'cannot', 'won\'t', 'will', 'want']

In [11]:
#If tweet contains a targeted statement referring to a certain group, i.e. "all you Asians" or "every Mexican"
#also checks if a group word is followed by some sort of modal verb

def contains_target(words):
    for i in range(len(words)):
        if words[i].lower() in targets:
            if words[i+1].lower() in groups:
                return 1
        if words[i].lower() in groups:
            if words[i+1].lower() in modality:
                return 1
            
    return 0
    

In [12]:

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    #sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    #avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    #Our features
    targeted = contains_target(words)
    immigrant_ref = 0
    if words.find('immigrant') or words.find('immigrants'):
        immigrant_ref = 1
    isOffensiveToWomen = check_offensive_to_women(tweet)
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    #FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    #FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
        
    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms,
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet, targeted, immigrant_ref, isOffensiveToWomen]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [13]:
other_features_names = ["num_chars", "num_chars_total", "num_terms", "num_words", "num_unique_words", "num_hashtags", \
                    "num_mentions", "num_urls", "is_retweet", "targeted", "immigrant_ref", "isOffensiveToWomen"]

In [14]:
feats = get_feature_array(tweets)

## Try ELMo

In [15]:
#ELMo
from allennlp.commands.elmo import ElmoEmbedder
from nltk.tokenize.treebank import TreebankWordTokenizer

In [18]:
elmo = ElmoEmbedder()

In [17]:
tokenizer = TreebankWordTokenizer()
small_X = tweets#.head(100)
elmo_train_toks = [tokenizer.tokenize(ex) for ex in small_X]

In [19]:
X_elmo_train_layers = list(elmo.embed_sentences(elmo_train_toks))

In [20]:
len(tweets)

19746

## BERT

In [21]:
from bert_serving.client import BertClient 

In [261]:
bc = BertClient(check_length=False)

In [262]:
all_X = tweets
all_y = df['class'].astype(int)

In [None]:
# X_bert_train, bert_train_toks = bc.encode(
#     list(all_X), show_tokens=True)

In [None]:
# def bert_reduce_mean(X):
#     return X.mean(axis=1) 

In [None]:
#X_bert_train_mean = bert_reduce_mean(X_bert_train)

In [None]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats,X_elmo_train_layers],axis=1)

In [245]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

In [246]:
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [247]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [248]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

In [249]:
pipe = Pipeline(
        [('select', SelectFromModel(LogisticRegression(class_weight='balanced',
                                                  penalty="l1", C=0.01))),
        ('model', LogisticRegression(class_weight='balanced',penalty='l2'))])

In [250]:
param_grid = [{}] # Optionally add parameters here

In [251]:
grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv=StratifiedKFold(n_splits=5, 
                                              random_state=42).split(X_train, y_train), 
                           verbose=2)

In [252]:
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   9.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.2s remaining:    0.0s


[CV] ................................................. , total=   9.1s
[CV]  ................................................................
[CV] ................................................. , total=   5.5s
[CV]  ................................................................
[CV] ................................................. , total=   8.3s
[CV]  ................................................................
[CV] ................................................. , total=   8.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.0s finished


In [253]:
y_preds = model.predict(X_test)

## Evaluation

In [254]:
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.29      0.43      0.35       104
           1       0.92      0.86      0.89      1507
           2       0.68      0.75      0.71       364

   micro avg       0.82      0.82      0.82      1975
   macro avg       0.63      0.68      0.65      1975
weighted avg       0.84      0.82      0.83      1975



In [255]:
all_tweets = df[['tweet', 'class']]
misses = np.where(np.asarray(y_test) != y_preds)
missed_preds = []
for i in range(len(y_test)):
    if np.asarray(y_test)[i] != y_preds[i]:
        missed_preds.append(y_preds[i])
    

missed = [list(y_test.index)[i] for i in misses[0]]

In [256]:
missed_tweets = all_tweets.iloc[missed]

In [257]:
missed_tweets.loc[:,'prediction'] = missed_preds
len(missed_tweets[(missed_tweets['class'] == 2)]), len(missed_tweets[(missed_tweets['class'] == 1)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(91, 206)