## Investigating swings in classification produced by removing #sarcasm and keeping/removing the blank space it creates

In [1]:
import ml
import nlp
import pickle
import re
from re import sub
from nltk.tokenize.casual import _replace_html_entities
from nltk import word_tokenize



In [2]:
# Directory prefix, presumably where the pickled models live.
# NOTE(review): no cell visible in this notebook reads pickle_path -- the
# pickle loads below open bare filenames. Confirm whether it should prefix them.
pickle_path = 'pickled/'

### Importing best classifier (logistic regression cross-validated on 80% of data)

In [2]:
# Load the pickled training results and the object later passed to ml.predict
# as `dvp` (presumably the fitted vectorizer -- confirm). Using `with` closes
# each file handle as soon as it has been read instead of leaking it.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('-twitter-trained-log87.pickle', 'rb') as results_file:
    results = pickle.load(results_file)
# First result entry -> its classifier list ([2]) -> second candidate ([1])
# -> the classifier object itself ([0]).
classifier = results[0][2][1][0] # best logistic
with open('-twitter-dvp87.pickle', 'rb') as dvp_file:
    dvp = pickle.load(dvp_file)

### Helper functions

In [132]:
def _predict_and_print(tweets, classifier, dvp, clean_tokens):
    """Classify ``tweets`` and print each one with its label and confidence.

    Shared implementation behind ``test_and_print`` and ``alt_test_and_print``,
    which differ only in the cleaning function handed to ``ml.predict``.

    Args:
        tweets: sequence of raw tweet strings; it is iterated again after
            prediction, so it must be re-iterable (e.g. a list).
        classifier: passed through to ``ml.predict`` unchanged.
        dvp: passed through to ``ml.predict`` unchanged.
        clean_tokens: cleaning callable forwarded to ``ml.predict`` as its
            fourth positional argument.
    """
    pre = ml.predict(tweets, classifier, dvp, clean_tokens)

    for t, p, pp in zip(tweets, pre['prediction'], pre['prediction_probabilities']):
        print(t)
        print('\tSarcastic' if p else '\tNon-sarcastic')
        # Report the probability of whichever class was predicted:
        # index 1 when the tweet was labelled sarcastic, index 0 otherwise.
        confidence = pp[1] if p else pp[0]
        print('\t' + str(confidence*100) + '%')
        print()


def test_and_print(tweets, classifier, dvp):
    """Print predictions for ``tweets`` cleaned with ``nlp.cleanTokensTwitter``."""
    _predict_and_print(tweets, classifier, dvp, nlp.cleanTokensTwitter)


def alt_test_and_print(tweets, classifier, dvp):
    """Print predictions for ``tweets`` cleaned with ``alternative_clean``."""
    _predict_and_print(tweets, classifier, dvp, alternative_clean)
        
def alternative_clean(tweet):
    """Clean a tweet, dropping trailing whitespace before tokenizing.

    Matches of ``nlp.TWEET_HANDLE_RE`` become ``NameTOK``, matches of
    ``nlp.TWEET_LINK_RE`` become ``LinkTOK`` and matches of ``nlp.HASHTAG_RE``
    are deleted. HTML entities are then decoded and trailing whitespace
    (such as the gap left by a removed final hashtag) is stripped.

    Returns:
        A ``(tokens, text)`` tuple: the ``word_tokenize`` tokens of the
        cleaned text, and the cleaned text itself.
    """
    # Applied in order: handles, then links, then hashtags.
    substitutions = (
        (nlp.TWEET_HANDLE_RE, "NameTOK"),
        (nlp.TWEET_LINK_RE, "LinkTOK"),
        (nlp.HASHTAG_RE, ""),
    )
    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)

    cleaned = _replace_html_entities(cleaned).rstrip()
    return word_tokenize(cleaned), cleaned

def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the ``n`` features that pull hardest towards ``classlabel``.

    Each printed line has the form ``<classlabel> <feature> <coefficient>``,
    in ascending coefficient order.

    For a binary model (a single coefficient row) the row describes
    ``classifier.classes_[1]``: the largest coefficients favour that class and
    the smallest (most negative) favour ``classes_[0]``. The side is chosen
    from the label's position in ``classes_`` rather than from the truthiness
    of the label, so string labels or labels such as 1/2 work as well as
    False/True. For a multiclass model the label's own coefficient row is used.

    Args:
        vectorizer: object exposing ``get_feature_names_out()`` or the older
            ``get_feature_names()``.
        classifier: fitted linear model exposing ``classes_`` and ``coef_``.
        classlabel: one of the values in ``classifier.classes_``.
        n: number of features to print; ``n=0`` prints nothing.

    Raises:
        ValueError: if ``classlabel`` is not in ``classifier.classes_``.
    """
    labelid = list(classifier.classes_).index(classlabel)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    coefficients = classifier.coef_
    if len(coefficients) == 1:
        # Binary model: one row, signed with respect to classes_[1].
        ranked = sorted(zip(coefficients[0], feature_names))
        if labelid == 1:
            topn = ranked[max(len(ranked) - n, 0):]
        else:
            topn = ranked[:n]
    else:
        # Multiclass model: one row per class, largest weights favour it.
        ranked = sorted(zip(coefficients[labelid], feature_names))
        topn = ranked[max(len(ranked) - n, 0):]

    for coef, feat in topn:
        print(classlabel, feat, coef)
        
def find_best_classifier(results):
    """Return the entry of ``results`` holding the highest-scoring candidate.

    Each entry is indexed at ``[2]`` for its list of candidates, and each
    candidate is indexed at ``[2]`` for its score. A candidate only wins if
    its score is strictly greater than the best seen so far (starting from
    0.0), so ties keep the earliest entry and ``None`` is returned when no
    score exceeds 0.0.

    NOTE(review): despite the name, the whole result entry is returned, not
    the winning candidate -- confirm this is what callers expect.
    """
    top_score = 0.0
    top_entry = None
    for entry in results:
        for candidate in entry[2]:
            score = candidate[2]
            if not score > top_score:
                continue
            top_score, top_entry = score, entry
    return top_entry

# Example of issue

In [133]:
# Probe set built from sarcastic tweets. Each tweet appears in several
# hand-made variants: with its original hashtag(s), with the hashtag text
# deleted but the preceding space kept, with that trailing space removed as
# well, and (for some) with the final period dropped or added.
test_sarcastic_text = [
             'Having MS is really easy and fun. #sarcasm',
             'Having MS is really easy and fun. ',
             'Having MS is really easy and fun.',
             'Having MS is really easy and fun',
             'The classiest event a bar can throw is a foam party. #Sarcasm',
             'The classiest event a bar can throw is a foam party. ',
             'The classiest event a bar can throw is a foam party.',
             'The classiest event a bar can throw is a foam party',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources ',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #sarcasm #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse  #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse ',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse.',
    
]

In [7]:
test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	87.3060709339%

Having MS is really easy and fun. 
	Sarcastic
	87.3060709339%

Having MS is really easy and fun.
	Non-sarcastic
	92.9603340041%

Having MS is really easy and fun
	Sarcastic
	56.6225091866%

The classiest event a bar can throw is a foam party. #Sarcasm
	Non-sarcastic
	52.9740675825%

The classiest event a bar can throw is a foam party. 
	Non-sarcastic
	52.9740675825%

The classiest event a bar can throw is a foam party.
	Non-sarcastic
	98.5267101183%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	90.8922815287%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Sarcastic
	80.58434523%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Sarcastic
	80.58434523%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	97.335875044%

@mi

The examples above show that, when classifying sarcastic tweets, this model erroneously assigns a huge weight to the presence of a dangling space at the end of the text. Usually this trailing space is left behind by removing "#sarcasm" -- which overwhelmingly appears at the end of sarcastic tweets -- but that is not exclusively the case. In the last example, "#sarcasm" is followed by "#idiots". Because the underlying hashtag removal in our cleaning methods removes every hashtag, both "#sarcasm" and "#idiots" are removed and two dangling spaces remain. The tweet remains classified as sarcastic until the trailing spaces are removed, at which point it is classified as serious. 

In every case, either the classification itself or the prediction probability swings heavily towards non-sarcastic once the dangling trailing spaces are removed. This indicates that the model is relying on a programming artifact rather than on the actual structure and features of the tweet in order to classify it correctly.    

In [8]:
# Probe set built from serious (non-sarcastic) tweets. Each tweet appears three
# times: unmodified, with " #sarcasm" appended, and with only a trailing space
# appended.
test_serious_text = [
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY.',
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY. #sarcasm',
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY. ',
    'How many retweets to give the Lakers the number 1 pick? @NBA',
    'How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm',
    'How many retweets to give the Lakers the number 1 pick? @NBA ',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting! #sarcasm',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting! ',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll.',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll. #sarcasm',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll. ',
]

In [9]:
test_and_print(test_serious_text, classifier, dvp)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Non-sarcastic
	99.6887600279%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Non-sarcastic
	99.6867570202%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Non-sarcastic
	99.6867570202%

How many retweets to give the Lakers the number 1 pick? @NBA
	Non-sarcastic
	89.4377033169%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Non-sarcastic
	89.3761907222%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Non-sarcastic
	89.3761907222%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Non-sarcastic
	86.022159641%

Never thought that clunkiness/cost of microscope might be holding back public health https

Interestingly, this effect is not observed in serious tweets. Appending "#sarcasm" or " " to the end of a serious tweet results in no change in classification and only a negligible change in prediction probability. 

In [10]:
import matplotlib
import matplotlib.pyplot as plt
from json_io import list_from_json, list_to_json, TWEET_LINK_RE, tweet_iterate

In [11]:
# JSON inputs for this analysis. SARCASTIC_PATH and NON_SARCASTIC_PATH are the
# two files loaded with list_from_json in this notebook.
# NOTE(review): the two *_HASH_PATH constants are not referenced by any cell
# visible here -- confirm they are still needed.
SARCASTIC_PATH = "../json/Twitter hash+old/sarcastic/unique.json"
SARCASTIC_HASH_PATH = "../json/sarcastic/hash_dict.json"
NON_SARCASTIC_PATH = "../json/Twitter hash+old/serious/unique.json"
NON_SARCASTIC_HASH_PATH = "../json/non_sarcastic/hash_dict.json"

In [12]:
sarcastic_tweets = list_from_json(SARCASTIC_PATH)
serious_tweets = list_from_json(NON_SARCASTIC_PATH)

In [13]:
print(len(sarcastic_tweets))
print(len(serious_tweets))

52679
163298


In [14]:
import pandas as pd

In [18]:
# Classify the first 50 sarcastic tweets twice so the two cleaning functions
# can be compared on identical input.

# Baseline: the project's standard cleaner.
pre = ml.predict(
    [tweet["text"] for tweet in sarcastic_tweets[:50]],
    classifier,
    dvp,
    nlp.cleanTokensTwitter,
)

# Variant: the cleaner that strips trailing whitespace.
pre_space_strip = ml.predict(
    [tweet["text"] for tweet in sarcastic_tweets[:50]],
    classifier,
    dvp,
    alternative_clean,
)

In [19]:
pre

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'prediction': array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        False,  True,  True,  True,  True,  True,  True,  True, False,
         True,  True,  True,  True,  True, False,  True,  True,  True,
         True,  True, False,  True, False,  True,  True,  True,  True,
         True,  True,  True,  True, False,  True,  True,  True, False,
         True,  True,  True,  True,  True], dtype=bool),
 'prediction_probabilities': array([[  3.21369254e-02,   9.67863075e-01],
        [  9.44699169e-01,   5.53008311e-02],
        [  3.80935953e-01,   6.19064047e-01],
        [  2.47414337e-02,   9.75258566e-01],
        [  3.06351862e-01,   6.93648138e-01],
        [  2.16269211e-01,   7.

In [20]:
# Side-by-side comparison of the two prediction runs. Columns prefixed "o_"
# come from `pre` and columns prefixed "ns_" from `pre_space_strip`. Each
# *_prob column holds the probability (in percent) of whichever class was
# predicted for that tweet.
sarcastic_df = pd.DataFrame({
    "o_text": [tweet["text"] for tweet in sarcastic_tweets[:50]],
    "o_sarcastic": pre["prediction"],
    "o_prob": [
        probs[1]*100 if is_sarcastic else probs[0]*100
        for is_sarcastic, probs in zip(pre["prediction"], pre["prediction_probabilities"])
    ],
    "ns_sarcastic": pre_space_strip["prediction"],
    "ns_prob": [
        probs[1]*100 if is_sarcastic else probs[0]*100
        for is_sarcastic, probs in zip(pre_space_strip["prediction"], pre_space_strip["prediction_probabilities"])
    ],
})
sarcastic_df

Unnamed: 0,ns_prob,ns_sarcastic,o_prob,o_sarcastic,o_text
0,96.786307,True,96.786307,True,@MetalBlonde Come on Karina! What about the #b...
1,94.469917,False,94.469917,False,Take that Black History month.Make America Whi...
2,61.906405,True,61.906405,True,Officially in love with M$ Azure #sarcasm #pow...
3,97.525857,True,97.525857,True,"Classy. 🙄🙄🙄 #sarcasm \nHow's that whole ""uniti..."
4,69.364814,True,69.364814,True,#draintheswamp riiiigggghhhttt. #sarcasm https...
5,78.373079,True,78.373079,True,@HanselCreative dude totally!!!!\n\n.\n.\n.\n....
6,88.482814,True,88.482814,True,@amritabhinder n one more thing @TarekFatah te...
7,83.607981,True,83.607981,True,@AJ I look forward to hearing about people goi...
8,94.902071,True,94.902071,True,@SharonLeavy @labourlewis Where's Corbyn? That...
9,94.593912,False,94.593912,False,This is my 'I ❤️ Stansted' face #sarcasm #lond...


### Alternative clean works as expected

In [4]:
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
True


### Predict does not work as expected

In [5]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.12693929,  0.87306071]]), 'prediction': array([ True], dtype=bool)}
{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.12693929,  0.87306071]]), 'prediction': array([ True], dtype=bool)}


In [6]:
print(ml.predict(['Having MS is really easy and fun.'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.92960334,  0.07039666]]), 'prediction': array([False], dtype=bool)}
{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.12693929,  0.87306071]]), 'prediction': array([ True], dtype=bool)}


### Feature does not work as expected

In [7]:
print(nlp.feature('Having MS is really easy and fun. #sarcasm', nlp.cleanTokensTwitter) == nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean))
print(nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean) == nlp.feature('Having MS is really easy and fun.', nlp.cleanTokensTwitter))

True
False


### Investigating feature

In [77]:
# nlp.py

# def feature(text, cleanTokens):
#     text = repr(text)
#     (tokens, text) = cleanTokens(text)

#     (tokens, postags) = tokPosTagsNoNE(tokens)
#     (capFreq, allCapsFreq) = casesFeat(tokens)

repr1 = repr('Having MS is really easy and fun. #sarcasm')
repr1

"'Having MS is really easy and fun. #sarcasm'"

In [76]:
repr2 = repr('Having MS is really easy and fun.')
repr2

"'Having MS is really easy and fun.'"

#### `repr` wraps the text in quotes, so the closing quote (not the space) becomes the last character. As a result, alternative clean does not find any trailing spaces to strip and performs identically to nlp.cleanTokensTwitter

In [84]:
print(alternative_clean(repr1) == nlp.cleanTokensTwitter(repr1))
print(alternative_clean(repr2) == nlp.cleanTokensTwitter(repr2))

True
True


In [86]:
print(nlp.cleanTokensTwitter(repr1) == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(nlp.cleanTokensTwitter(repr2) == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
False


### After commenting out text = repr(text)

In [4]:
nlp.feature??

#### Alternative clean works as expected

In [5]:
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
True


#### Predict seems to work as expected in first case. Probabilities differ slightly.

In [6]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction': array([ True], dtype=bool), 'prediction_probabilities': array([[ 0.04226943,  0.95773057]])}
{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction': array([ True], dtype=bool), 'prediction_probabilities': array([[ 0.04249474,  0.95750526]])}


#### Works as expected

In [9]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))
print(ml.predict(['Having MS is really easy and fun.'], classifier, dvp, nlp.cleanTokensTwitter))

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction': array([ True], dtype=bool), 'prediction_probabilities': array([[ 0.04249474,  0.95750526]])}
{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction': array([ True], dtype=bool), 'prediction_probabilities': array([[ 0.04249474,  0.95750526]])}


#### Feature works as expected

In [4]:
print(nlp.feature('Having MS is really easy and fun. #sarcasm', nlp.cleanTokensTwitter) == nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean))
print(nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean) == nlp.feature('Having MS is really easy and fun.', nlp.cleanTokensTwitter))

False
True


### Revised attempt at dataframe

In [12]:
import matplotlib
import matplotlib.pyplot as plt
from json_io import list_from_json, list_to_json, TWEET_LINK_RE, tweet_iterate
import pandas as pd

In [13]:
# JSON inputs for the revised run (same values as declared earlier in this
# notebook). SARCASTIC_PATH and NON_SARCASTIC_PATH are the two files loaded
# with list_from_json.
# NOTE(review): the two *_HASH_PATH constants are not referenced by any cell
# visible here -- confirm they are still needed.
SARCASTIC_PATH = "../json/Twitter hash+old/sarcastic/unique.json"
SARCASTIC_HASH_PATH = "../json/sarcastic/hash_dict.json"
NON_SARCASTIC_PATH = "../json/Twitter hash+old/serious/unique.json"
NON_SARCASTIC_HASH_PATH = "../json/non_sarcastic/hash_dict.json"

In [14]:
sarcastic_tweets = list_from_json(SARCASTIC_PATH)
serious_tweets = list_from_json(NON_SARCASTIC_PATH)

In [15]:
print(len(sarcastic_tweets))
print(len(serious_tweets))

52679
163298


In [16]:
# Re-run both predictions on the first 50 sarcastic tweets so the two
# cleaning functions can be compared on identical input.

# Baseline: the project's standard cleaner.
pre = ml.predict(
    [tweet["text"] for tweet in sarcastic_tweets[:50]],
    classifier,
    dvp,
    nlp.cleanTokensTwitter,
)

# Variant: the cleaner that strips trailing whitespace.
pre_space_strip = ml.predict(
    [tweet["text"] for tweet in sarcastic_tweets[:50]],
    classifier,
    dvp,
    alternative_clean,
)

In [17]:
# Side-by-side comparison of the two prediction runs. Columns prefixed "o_"
# come from `pre` and columns prefixed "ns_" from `pre_space_strip`. Each
# *_prob column holds the probability (in percent) of whichever class was
# predicted for that tweet.
sarcastic_df = pd.DataFrame({
    "o_text": [tweet["text"] for tweet in sarcastic_tweets[:50]],
    "o_sarcastic": pre["prediction"],
    "o_prob": [
        probs[1]*100 if is_sarcastic else probs[0]*100
        for is_sarcastic, probs in zip(pre["prediction"], pre["prediction_probabilities"])
    ],
    "ns_sarcastic": pre_space_strip["prediction"],
    "ns_prob": [
        probs[1]*100 if is_sarcastic else probs[0]*100
        for is_sarcastic, probs in zip(pre_space_strip["prediction"], pre_space_strip["prediction_probabilities"])
    ],
})
sarcastic_df

Unnamed: 0,ns_prob,ns_sarcastic,o_prob,o_sarcastic,o_text
0,91.218247,True,91.37344,True,@MetalBlonde Come on Karina! What about the #b...
1,90.887446,False,90.887446,False,Take that Black History month.Make America Whi...
2,67.632509,False,67.392576,False,Officially in love with M$ Azure #sarcasm #pow...
3,85.478773,True,85.478773,True,"Classy. 🙄🙄🙄 #sarcasm \nHow's that whole ""uniti..."
4,84.399083,True,84.399083,True,#draintheswamp riiiigggghhhttt. #sarcasm https...
5,86.303618,True,86.827167,True,@HanselCreative dude totally!!!!\n\n.\n.\n.\n....
6,95.53184,True,95.53184,True,@amritabhinder n one more thing @TarekFatah te...
7,87.376017,True,87.376017,True,@AJ I look forward to hearing about people goi...
8,92.272002,False,92.138653,False,@SharonLeavy @labourlewis Where's Corbyn? That...
9,96.054645,False,96.054645,False,This is my 'I ❤️ Stansted' face #sarcasm #lond...


### Re-examining issue

In [18]:
# Same sarcastic probe set as declared earlier in this notebook, re-declared
# for the re-examination. Each tweet appears in several hand-made variants:
# with its original hashtag(s), with the hashtag text deleted but the preceding
# space kept, with that trailing space removed as well, and (for some) with the
# final period dropped or added.
test_sarcastic_text = [
             'Having MS is really easy and fun. #sarcasm',
             'Having MS is really easy and fun. ',
             'Having MS is really easy and fun.',
             'Having MS is really easy and fun',
             'The classiest event a bar can throw is a foam party. #Sarcasm',
             'The classiest event a bar can throw is a foam party. ',
             'The classiest event a bar can throw is a foam party.',
             'The classiest event a bar can throw is a foam party',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources ',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #sarcasm #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse  #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse ',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse.',
    
]

In [19]:
test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	95.7730568124%

Having MS is really easy and fun. 
	Sarcastic
	95.7730568124%

Having MS is really easy and fun.
	Sarcastic
	95.750525886%

Having MS is really easy and fun
	Sarcastic
	73.1986837306%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	64.0899195131%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	64.0899195131%

The classiest event a bar can throw is a foam party.
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	91.0680734982%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	89.8828590754%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	89.8828590754%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	89.9032788999%

@mikefre

In [20]:
alt_test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	95.750525886%

Having MS is really easy and fun. 
	Sarcastic
	95.750525886%

Having MS is really easy and fun.
	Sarcastic
	95.750525886%

Having MS is really easy and fun
	Sarcastic
	73.1986837306%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party.
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	91.0680734982%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	89.9032788999%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	89.9032788999%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	89.9032788999%

@mikefreeman

In [28]:
# Serious tweets, each in three variants: unchanged, with " #sarcasm" appended,
# and with only the dangling space that removing the hashtag leaves behind.
test_serious_text = [
    tweet + ending
    for tweet in (
        'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY.',
        'How many retweets to give the Lakers the number 1 pick? @NBA',
        'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!',
        '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll.',
    )
    for ending in ('', ' #sarcasm', ' ')
]

In [29]:
# Classify the serious tweets with the standard nlp.cleanTokensTwitter cleaning.
test_and_print(
    tweets=test_serious_text,
    classifier=classifier,
    dvp=dvp,
)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Non-sarcastic
	97.3463403536%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Non-sarcastic
	97.3323452094%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Non-sarcastic
	97.3323452094%

How many retweets to give the Lakers the number 1 pick? @NBA
	Non-sarcastic
	89.8057540459%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Non-sarcastic
	89.7880724483%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Non-sarcastic
	89.7880724483%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Non-sarcastic
	83.1653529571%

Never thought that clunkiness/cost of microscope might be holding back public health http

In [30]:
# Same serious tweets, tokenised with alternative_clean instead.
alt_test_and_print(
    tweets=test_serious_text,
    classifier=classifier,
    dvp=dvp,
)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Non-sarcastic
	97.3463403536%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Non-sarcastic
	97.3463403536%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Non-sarcastic
	97.3463403536%

How many retweets to give the Lakers the number 1 pick? @NBA
	Non-sarcastic
	89.8057540459%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Non-sarcastic
	89.8057540459%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Non-sarcastic
	89.8057540459%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Non-sarcastic
	83.1653529571%

Never thought that clunkiness/cost of microscope might be holding back public health http

### All work as expected

# Findings

Using `repr(text)` as the first step of the feature-processing pipeline wrapped every tweet in quotation marks, adding a quote character at the beginning and at the end of each tweet.

In [32]:
# repr() returns the text wrapped in quote characters (see the cell output).
repr("Example tweet. #sarcasm")

"'Example tweet. #sarcasm'"

In [33]:
# Same check without the hashtag: the surrounding quotes are still added.
repr("Example tweet.")

"'Example tweet.'"

In [4]:
# repr() form first, then the raw string: compare the (tokens, cleaned text) pairs.
print(
    nlp.cleanTokensTwitter(repr("Example tweet.")),
    nlp.cleanTokensTwitter("Example tweet."),
    sep='\n',
)

(["'Example", 'tweet', '.', "'"], "'Example tweet.'")
(['Example', 'tweet', '.'], 'Example tweet.')


In [5]:
# Same comparison with a trailing #sarcasm: repr() form first, raw string second.
print(
    nlp.cleanTokensTwitter(repr("Example tweet. #sarcasm")),
    nlp.cleanTokensTwitter("Example tweet. #sarcasm"),
    sep='\n',
)

(["'Example", 'tweet.', "'"], "'Example tweet. '")
(['Example', 'tweet', '.'], 'Example tweet. ')


### How did this drastically affect the classifier?

#### All features EXCEPT punctuation are computed from the tweet's tokens. In every case, `repr(text)` had replaced `text` as the first step of the feature-processing pipeline.

In [14]:
# Previously recorded predictions for the four variants of this tweet
# (presumably from the repr()-based pipeline -- TODO confirm):
#   'Having MS is really easy and fun. #sarcasm', 87.306% sarcastic
#   'Having MS is really easy and fun. ', 87.306% sarcastic
#   'Having MS is really easy and fun.', 92.960% non-sarcastic
#   'Having MS is really easy and fun', 56.623% sarcastic

# Raw string first, then its repr() form.
print(
    nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'),
    nlp.cleanTokensTwitter(repr('Having MS is really easy and fun. #sarcasm')),
    sep='\n',
)

(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun. ')
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun.', "'"], "'Having MS is really easy and fun. '")


In [16]:
# Variant with only the dangling space: raw string first, then its repr() form.
print(
    nlp.cleanTokensTwitter('Having MS is really easy and fun. '),
    nlp.cleanTokensTwitter(repr('Having MS is really easy and fun. ')),
    sep='\n',
)

(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun. ')
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun.', "'"], "'Having MS is really easy and fun. '")


In [17]:
# Variant ending in a period with no dangling space: raw string, then repr() form.
print(
    nlp.cleanTokensTwitter('Having MS is really easy and fun.'),
    nlp.cleanTokensTwitter(repr('Having MS is really easy and fun.')),
    sep='\n',
)

(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun.')
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', '.', "'"], "'Having MS is really easy and fun.'")


In [18]:
# Variant without the final period: raw string, then repr() form.
print(
    nlp.cleanTokensTwitter('Having MS is really easy and fun'),
    nlp.cleanTokensTwitter(repr('Having MS is really easy and fun')),
    sep='\n',
)

(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun'], 'Having MS is really easy and fun')
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', "'"], "'Having MS is really easy and fun'")


#### N-grams are affected. The first word in a tweet is matched to the n-gram `'word` instead of `word`.

#### When a dangling space occurs at the end of the tweet, such as when the tweet ends with a hashtag, the last word is matched to the n-gram `word.` instead of `word`.

#### When no dangling space exists, the period character appears as its own n-gram. It is likely that the classifier relied heavily on the period n-gram being present in non-sarcastic tweets and not appearing in sarcastic tweets.

#### Looking at an example without periods

In [19]:
# Previously recorded predictions for the three variants of this tweet
# (presumably from the repr()-based pipeline -- TODO confirm):
#   "@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm" 80.584% sarcastic
#   "@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources " 80.584% sarcastic
#   "@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources" 97.336% non-sarcastic

# Raw string first, then its repr() form.
print(
    nlp.cleanTokensTwitter("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm"),
    nlp.cleanTokensTwitter(repr("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm")),
    sep='\n',
)

(['NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources'], "NameTOK Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources ")
(['``', 'NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources', '``'], '"NameTOK Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources "')


In [20]:
# Variant with only the dangling space: raw string first, then its repr() form.
print(
    nlp.cleanTokensTwitter("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources "),
    nlp.cleanTokensTwitter(repr("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources ")),
    sep='\n',
)

(['NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources'], "NameTOK Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources ")
(['``', 'NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources', '``'], '"NameTOK Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources "')


In [21]:
# Variant with no hashtag and no dangling space: raw string, then repr() form.
print(
    nlp.cleanTokensTwitter("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources"),
    nlp.cleanTokensTwitter(repr("@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources")),
    sep='\n',
)

(['NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources'], "NameTOK Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources")
(['``', 'NameTOK', 'Sure', 'I', "'ll", 'watch', 'North', 'Korean', '-', 'China', '-', 'Russian', 'and', 'Iran', 'news', 'all', 'very', 'creditable', 'sources', "''"], '"NameTOK Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources"')


#### A new artifact appears when the tweet contains a character that must be escaped in its Python string representation (here the apostrophe in `I'll`). This is the ``` `` ``` token. When there is a dangling space, it appears as a token twice (once at the beginning, once at the end). When there is no dangling space, it appears only once, at the beginning, and a new token `''` appears at the end.

In [110]:
# Features with the largest coefficients for the True (sarcastic) label.
most_informative_feature_for_class(
    vectorizer=dvp,
    classifier=classifier,
    classlabel=True,
)

True grm2 ' i 2.06082484964
True grm1 hashtag 2.21268505752
True grm1 shocked 2.23309305865
True grm1 sarcastic 2.31216836499
True grm1 'Shocking 2.37029510357
True grm1 'Yay 2.40244926147
True grm1 ' 2.60837610358
True grm1 sarcasm 2.80359433667
True grm2 ' RT 3.02907186299
True ._RAW/TOTAL_PUNCT_FOUND 3.53482770131


In [111]:
# Features listed for the False (non-sarcastic) label.
most_informative_feature_for_class(
    vectorizer=dvp,
    classifier=classifier,
    classlabel=False,
)

False TOTAL/LENGTH -6.05327004015
False '_RAW/LEN -4.3633856018
False grm2 . ' -1.85792898937
False "_RAW/TOTAL_PUNCT_FOUND -1.7348314888
False grm1 nude -1.55488148816
False grm1 nigga -1.53288527986
False '_RAW/TOTAL_PUNCT_FOUND -1.50268372428
False grm1 'i -1.49072647577
False grm1 porn -1.46982345929
False grm1 sex -1.45558813682


## Comparing new classifier to old

In [5]:
# Load the original and the "-v2" artifacts side by side for the comparison.
# NOTE: `dvp` and `classifier` are rebound here, replacing the logistic-regression
# objects loaded at the top of the notebook; every cell run after this one sees
# the new values.
# Only unpickle files you produced yourself: pickle can execute arbitrary code
# while loading.
def _load_pickle(filename):
    """Return the object stored in `pickle_path + filename`.

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read, instead of being left to the garbage collector.
    """
    with open(pickle_path + filename, 'rb') as handle:
        return pickle.load(handle)


X = _load_pickle('-twitter-X.pickle')
y = _load_pickle('-twitter-y.pickle')

X_v2 = _load_pickle('-twitter-X-v2.pickle')
y_v2 = _load_pickle('-twitter-y-v2.pickle')

dvp = _load_pickle('-twitter-dvp.pickle')
dvp_v2 = _load_pickle('-twitter-dvp-v2.pickle')

classifier = _load_pickle('-twitter-trained.pickle')
classifier_v2 = _load_pickle('-twitter-trained-v2.pickle')

In [108]:
# Run find_best_classifier on the loaded `classifier` results and display what
# it returns; later cells index the result as best_classifier[2][0][0] to get
# the model object.
best_classifier = find_best_classifier(classifier)
best_classifier

(0,
 0.8,
 [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   1.382722,
   0.82637281229743498),
  (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   0.784222,
   0.82521529771275115),
  (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   1.083756,
   0.82509954625428283)])

In [109]:
# Same selection for the "-v2" results; later cells use
# best_classifier_v2[2][0][0] as the model object.
best_classifier_v2 = find_best_classifier(classifier_v2)
best_classifier_v2

(0,
 0.8,
 [(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   0.738366,
   0.80681081581627934),
  (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   0.667845,
   0.80442633577183076),
  (MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
   0.678312,
   0.8062783591073247)])

In [131]:
# Most informative features of the first model in `best_classifier`, for the
# True (sarcastic) label and then the False (non-sarcastic) label.
# NOTE(review): most_informative_feature_for_class prints its rows itself and
# returns None, so each print() also emits a trailing "None" line.
print(most_informative_feature_for_class(
    vectorizer=dvp,
    classifier=best_classifier[2][0][0],
    classlabel=True,
))
print(most_informative_feature_for_class(
    vectorizer=dvp,
    classifier=best_classifier[2][0][0],
    classlabel=False,
))

True vow2 1 0 -4.67103712541
True syl3 1 1 1 -4.64666336094
True vow1 0 -4.30103173968
True syl2 1 1 -4.22686706271
True TOTAL -4.12326270071
True vow4 1 1 1 1 -3.97697172696
True syl1 1 -3.82194554257
True vow3 1 1 1 -3.72575415861
True vow2 1 1 -3.48113540597
True vow1 1 -3.21550830719
None
False \_RAW -16.3595783804
False \_RAW/LEN -16.3595783804
False \_RAW/TOTAL_PUNCT_FOUND -16.3595783804
False grm1 ''A -16.3595783804
False grm1 ''Come -16.3595783804
False grm1 ''Disaster -16.3595783804
False grm1 ''Do -16.3595783804
False grm1 ''IT -16.3595783804
False grm1 ''Mona -16.3595783804
False grm1 ''Not -16.3595783804
None


In [129]:
# Same listing for the first model in `best_classifier_v2` with its own vectorizer.
# NOTE(review): most_informative_feature_for_class prints its rows itself and
# returns None, so each print() also emits a trailing "None" line.
print(most_informative_feature_for_class(
    vectorizer=dvp_v2,
    classifier=best_classifier_v2[2][0][0],
    classlabel=True,
))
print(most_informative_feature_for_class(
    vectorizer=dvp_v2,
    classifier=best_classifier_v2[2][0][0],
    classlabel=False,
))

True vow2 1 0 -4.78157844314
True vow1 0 -4.58288805128
True TOTAL -4.56200077533
True syl3 1 1 1 -4.53490867122
True syl2 1 1 -4.11821302026
True vow4 1 1 1 1 -3.89865460923
True syl1 1 -3.71596691971
True vow3 1 1 1 -3.64635005398
True vow2 1 1 -3.39909197263
True vow1 1 -3.13091109146
None
False \_RAW -16.2730555273
False \_RAW/LEN -16.2730555273
False \_RAW/TOTAL_PUNCT_FOUND -16.2730555273
False grm1 ''A -16.2730555273
False grm1 ''Come -16.2730555273
False grm1 ''Disaster -16.2730555273
False grm1 ''Do -16.2730555273
False grm1 ''IT -16.2730555273
False grm1 ''Mona -16.2730555273
False grm1 ''Not -16.2730555273
None


In [122]:
# Load the logistic-regression results and the matching vectorizer under their
# own names. Each file is opened in a `with` block so its handle is closed right
# after loading.
# Only unpickle files you produced yourself: pickle can execute arbitrary code
# while loading.
with open('-twitter-trained-log87.pickle', 'rb') as trained_file:
    log_classifiers = pickle.load(trained_file)
with open('-twitter-dvp87.pickle', 'rb') as dvp_file:
    dvp_log = pickle.load(dvp_file)

In [124]:
# Run find_best_classifier on the logistic-regression results and display what
# it returns; later cells use best_classifier_log[2][1][0], i.e. the second
# model in the displayed list.
best_classifier_log = find_best_classifier(log_classifiers)
best_classifier_log

(0,
 0.8,
 [(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
             penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False), 415.732046, 0.86950180572275215),
  (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
             penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False), 417.592044, 0.8732290026854338)])

In [128]:
# Most informative features of the second logistic-regression model, for the
# True (sarcastic) label and then the False (non-sarcastic) label.
# NOTE(review): most_informative_feature_for_class prints its rows itself and
# returns None, so each print() also emits a trailing "None" line.
print(most_informative_feature_for_class(
    vectorizer=dvp_log,
    classifier=best_classifier_log[2][1][0],
    classlabel=True,
))
print(most_informative_feature_for_class(
    vectorizer=dvp_log,
    classifier=best_classifier_log[2][1][0],
    classlabel=False,
))

True grm2 ' i 2.06082484964
True grm1 hashtag 2.21268505752
True grm1 shocked 2.23309305865
True grm1 sarcastic 2.31216836499
True grm1 'Shocking 2.37029510357
True grm1 'Yay 2.40244926147
True grm1 ' 2.60837610358
True grm1 sarcasm 2.80359433667
True grm2 ' RT 3.02907186299
True ._RAW/TOTAL_PUNCT_FOUND 3.53482770131
None
False TOTAL/LENGTH -6.05327004015
False '_RAW/LEN -4.3633856018
False grm2 . ' -1.85792898937
False "_RAW/TOTAL_PUNCT_FOUND -1.7348314888
False grm1 nude -1.55488148816
False grm1 nigga -1.53288527986
False '_RAW/TOTAL_PUNCT_FOUND -1.50268372428
False grm1 'i -1.49072647577
False grm1 porn -1.46982345929
False grm1 sex -1.45558813682
None


In [134]:
# Sarcastic test tweets through the second logistic-regression model.
test_and_print(
    tweets=test_sarcastic_text,
    classifier=best_classifier_log[2][1][0],
    dvp=dvp_log,
)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	95.750525886%

Having MS is really easy and fun. 
	Sarcastic
	95.750525886%

Having MS is really easy and fun.
	Sarcastic
	95.750525886%

Having MS is really easy and fun
	Sarcastic
	73.1986837306%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party.
	Sarcastic
	64.047319944%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	91.0680734982%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	89.9032788999%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	89.9032788999%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	89.9032788999%

@mikefreeman

In [135]:
# Sarcastic test tweets through the first model in `best_classifier`, with `dvp`.
test_and_print(
    tweets=test_sarcastic_text,
    classifier=best_classifier[2][0][0],
    dvp=dvp,
)

Having MS is really easy and fun. #sarcasm
	Non-sarcastic
	99.9900484085%

Having MS is really easy and fun. 
	Non-sarcastic
	99.9900484085%

Having MS is really easy and fun.
	Non-sarcastic
	99.9900484085%

Having MS is really easy and fun
	Non-sarcastic
	99.9914871042%

The classiest event a bar can throw is a foam party. #Sarcasm
	Non-sarcastic
	99.9999852666%

The classiest event a bar can throw is a foam party. 
	Non-sarcastic
	99.9999852666%

The classiest event a bar can throw is a foam party.
	Non-sarcastic
	99.9999852666%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	99.9999871479%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	100.0%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	100.0%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	100.0%

@m

In [136]:
# Sarcastic test tweets through the first model in `best_classifier_v2`, with `dvp_v2`.
test_and_print(
    tweets=test_sarcastic_text,
    classifier=best_classifier_v2[2][0][0],
    dvp=dvp_v2,
)

Having MS is really easy and fun. #sarcasm
	Non-sarcastic
	98.6062428586%

Having MS is really easy and fun. 
	Non-sarcastic
	98.6062428586%

Having MS is really easy and fun.
	Non-sarcastic
	98.6062428586%

Having MS is really easy and fun
	Non-sarcastic
	99.9039874851%

The classiest event a bar can throw is a foam party. #Sarcasm
	Non-sarcastic
	99.999998879%

The classiest event a bar can throw is a foam party. 
	Non-sarcastic
	99.999998879%

The classiest event a bar can throw is a foam party.
	Non-sarcastic
	99.999998879%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	99.9999999396%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	100.0%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	100.0%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	100.0%

@mike