## Investigating swings in classification produced by removing #sarcasm and keeping/removing the blank space it creates

In [2]:
import ml
import nlp
import pickle
import re
from re import sub
from nltk.tokenize.casual import _replace_html_entities
from nltk import word_tokenize



In [3]:
pickle_path = 'pickled/'

### Importing best classifier (logistic regression cross-validated on 80% of data)

In [3]:
# Load the pickled training results and the fitted vectorizer.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that were produced by this project.
# Context managers make sure both file handles are closed after loading.
with open(pickle_path + '-twitter-trained-log-unbalanced.pickle', 'rb') as results_file:
    results = pickle.load(results_file)
with open(pickle_path + '-twitter-dvp.pickle', 'rb') as dvp_file:
    dvp = pickle.load(dvp_file)

In [4]:
best = ml.best_classifiers([results], ["LOG_unbalanced"], dvp)
best

defaultdict(dict,
            {'LOG_unbalanced': {'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                        intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                        penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                        verbose=0, warm_start=False),
              'dvp': DictVectorizerPartial(dtype=<class 'numpy.float32'>, feature_names=None,
                         separator='=', sparse=True, vocab=None),
              'reduction': 0,
              'score': 0.87133067876655246,
              'size': 0.8,
              'train_time': 376.016126}})

In [5]:
classifier = best["LOG_unbalanced"]["classifier"]
dvp = best["LOG_unbalanced"]["dvp"]

### Helper functions

In [4]:
def test_and_print(tweets, classifier, dvp):
    pre = ml.predict(tweets,
           classifier,
           dvp,
           nlp.cleanTokensTwitter)

    for t,p,pp in zip(tweets, pre['prediction'], pre['prediction_probabilities']):
        print(t)
        print('\tSarcastic' if p else '\tSerious')
        print('\t'+str(pp[1]*100)+'%' if p else '\t'+str(pp[0]*100)+'%')
        print()
        
def alt_test_and_print(tweets, classifier, dvp):
    pre = ml.predict(tweets,
           classifier,
           dvp,
           alternative_clean)

    for t,p,pp in zip(tweets, pre['prediction'], pre['prediction_probabilities']):
        print(t)
        print('\tSarcastic' if p else '\tSerious')
        print('\t'+str(pp[1]*100)+'%' if p else '\t'+str(pp[0]*100)+'%')
        print()
        
def alternative_clean(tweet):
    """Clean a tweet, strip trailing whitespace, and tokenize it.

    Matches of `nlp.TWEET_HANDLE_RE` become "NameTOK", matches of
    `nlp.TWEET_LINK_RE` become "LinkTOK", and matches of `nlp.HASHTAG_RE`
    are deleted. HTML entities are then replaced and trailing whitespace
    (such as the gap left behind by a removed hashtag) is stripped.

    Returns
    -------
    tuple
        `(tokens, cleaned_text)` where `tokens` is the `word_tokenize`
        output for the cleaned text.
    """
    substitutions = (
        (nlp.TWEET_HANDLE_RE, "NameTOK"),
        (nlp.TWEET_LINK_RE, "LinkTOK"),
        (nlp.HASHTAG_RE, ""),
    )
    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = sub(pattern, replacement, cleaned)
    # rstrip() runs last so whitespace exposed by the removals above is dropped.
    cleaned = _replace_html_entities(cleaned).rstrip()
    return word_tokenize(cleaned), cleaned

def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    """Print the `n` features whose coefficients most favour `classlabel`.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing `get_feature_names_out()` or the older
        `get_feature_names()`.
    classifier
        Fitted linear classifier exposing `classes_` and `coef_`.
    classlabel
        One of the labels in `classifier.classes_`.
    n : int, optional
        Number of features to print (default 10). Nothing is printed when
        `n` is zero or negative.

    Raises
    ------
    ValueError
        If `classlabel` is not in `classifier.classes_`.

    Each printed line is `classlabel feature coefficient`, ordered by
    ascending coefficient.
    """
    labelid = list(classifier.classes_).index(classlabel)
    # Newer scikit-learn releases only provide get_feature_names_out();
    # fall back to the legacy accessor when it is absent.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = names_getter()

    if n <= 0:
        # Guard: a slice of [-0:] would otherwise select every feature.
        return

    if len(classifier.coef_) == 1:
        # Binary model: a single coefficient row. Large positive weights favour
        # classes_[1], large negative weights favour classes_[0]. The position
        # of the label (not its truthiness) decides which end to report.
        ranked = sorted(zip(classifier.coef_[0], feature_names))
        topn = ranked[-n:] if labelid == 1 else ranked[:n]
    else:
        # Multi-class model: one coefficient row per class, in classes_ order.
        topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print(classlabel, feat, coef)
        
def find_best_classifier(results):
    """Return the entry of `results` that holds the highest-scoring classifier.

    Each entry's element at index 2 is iterated as a collection of classifier
    records, and each record's element at index 2 is used as its score
    (presumably an accuracy; verify against the producer of `results`).

    Note that the whole result entry is returned, not the individual
    classifier record. `None` is returned when no score exceeds 0.0. On ties
    the earliest entry wins.
    """
    top_score = 0.0
    winner = None
    scored_entries = ((entry, record[2]) for entry in results for record in entry[2])
    for entry, score in scored_entries:
        if score > top_score:
            top_score, winner = score, entry
    return winner

# Example of issue

In [13]:
test_sarcastic_text = [
             'Having MS is really easy and fun. #sarcasm',
             'Having MS is really easy and fun. ',
             'Having MS is really easy and fun.',
             'Having MS is really easy and fun',
             'The classiest event a bar can throw is a foam party. #Sarcasm',
             'The classiest event a bar can throw is a foam party. ',
             'The classiest event a bar can throw is a foam party.',
             'The classiest event a bar can throw is a foam party',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources ',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #sarcasm #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse  #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse ',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse.',
    
]

In [11]:
test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	95.5382460923%

Having MS is really easy and fun. 
	Sarcastic
	95.5382460923%

Having MS is really easy and fun.
	Serious
	92.9694146335%

Having MS is really easy and fun
	Sarcastic
	68.4250281747%

The classiest event a bar can throw is a foam party. #Sarcasm
	Serious
	56.8570085424%

The classiest event a bar can throw is a foam party. 
	Serious
	56.8570085424%

The classiest event a bar can throw is a foam party.
	Serious
	98.9521429322%

The classiest event a bar can throw is a foam party
	Serious
	93.2898160203%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Sarcastic
	82.7001799788%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Sarcastic
	82.7001799788%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Serious
	99.0031407564%

@mikefreemanNFL Because voicing yo

The examples above show that this model erroneously assigns a huge weight to the presence of a dangling space at the end of the text when classifying sarcastic tweets. Usually this dangling trailing space is left behind by removing "#sarcasm" -- which overwhelmingly appears at the end of sarcastic tweets -- but that is not exclusively the case. Looking at the last example, "#sarcasm" is followed by "#idiots". Because our cleaning step removes every hashtag, both "#sarcasm" and "#idiots" are removed and two dangling spaces remain. The tweet remains classified as sarcastic until the dangling trailing spaces are removed, at which point it is classified as serious.

In every case either the classification itself or the prediction probability swings heavily towards non-sarcastic once the trailing dangling spaces are removed. This indicates that the model is relying on the presence of a programming artifact rather than on the actual structure and features of the tweet in order to classify it correctly.

In [14]:
test_serious_text = [
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY.',
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY. #sarcasm',
    'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY. ',
    'How many retweets to give the Lakers the number 1 pick? @NBA',
    'How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm',
    'How many retweets to give the Lakers the number 1 pick? @NBA ',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting! #sarcasm',
    'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting! ',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll.',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll. #sarcasm',
    '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll. ',
]

In [13]:
test_and_print(test_serious_text, classifier, dvp)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Serious
	99.8915777064%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Serious
	99.8907313844%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Serious
	99.8907313844%

How many retweets to give the Lakers the number 1 pick? @NBA
	Serious
	87.2958283871%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Serious
	87.2096300967%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Serious
	87.2096300967%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Serious
	93.2313394438%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope excit

Interestingly, it seems the effect is not as severe in the serious tweet samples. Adding "#sarcasm" or " " to the end of the sampled serious tweets results in no change in classification and only small changes in prediction probability.

## Attempt at solution: Using an alternative tweet cleaning method (alternative_clean) that performs a rstrip() of the text of the tweet, removing the dangling spaces

In [9]:
import matplotlib
import matplotlib.pyplot as plt
from json_io import list_from_json, list_to_json, TWEET_LINK_RE, tweet_iterate

In [10]:
# Relative paths to the sarcastic and serious tweet JSON files and their hash dictionaries.
# NOTE(review): SARCASTIC_HASH_PATH lacks the "twitter/" segment that the other
# three paths contain -- confirm this is intentional and not a typo.
SARCASTIC_PATH = "../json/twitter/sarcastic/unique.json"
SARCASTIC_HASH_PATH = "../json/sarcastic/hash_dict.json"
SERIOUS_PATH = "../json/twitter/serious/unique.json"
SERIOUS_HASH_PATH = "../json/twitter/serious/hash_dict.json"

In [11]:
sarcastic_tweets = list_from_json(SARCASTIC_PATH, old_format=False)
serious_tweets = list_from_json(SERIOUS_PATH, old_format=False)

In [12]:
print(len(sarcastic_tweets))
print(len(serious_tweets))

58164
173868


In [13]:
import pandas as pd

In [25]:
# Classify the first 50 sarcastic tweets using the original cleaning function.
pre = ml.predict([x["text"] for x in sarcastic_tweets[:50]],
           classifier,
           dvp,
           nlp.cleanTokensTwitter)

# Classify the same 50 tweets with the same classifier, but cleaned with
# alternative_clean (which strips trailing whitespace), for comparison.
pre_space_strip = ml.predict([x["text"] for x in sarcastic_tweets[:50]],
           classifier,
           dvp,
           alternative_clean)

In [30]:
# Side-by-side comparison of the two prediction runs on the same 50 tweets.
# "o_*" columns come from `pre` (original cleaner), "noSpace_*" columns from
# `pre_space_strip` (alternative_clean). Each *_prob column is the probability,
# in percent, of whichever label was predicted for that row.
sarcastic_df = pd.DataFrame({
    "o_text": [tweet["text"] for tweet in sarcastic_tweets[:50]],
    "o_sarcastic": pre["prediction"],
    "o_prob": [
        probs[1]*100 if predicted else probs[0]*100
        for predicted, probs in zip(pre["prediction"], pre["prediction_probabilities"])
    ],
    "noSpace_sarcastic": pre_space_strip["prediction"],
    "noSpace_prob": [
        probs[1]*100 if predicted else probs[0]*100
        for predicted, probs in zip(pre_space_strip["prediction"], pre_space_strip["prediction_probabilities"])
    ],
})
sarcastic_df

Unnamed: 0,noSpace_prob,noSpace_sarcastic,o_prob,o_sarcastic,o_text
0,96.595865,True,96.595865,True,@MetalBlonde Come on Karina! What about the #b...
1,87.346849,False,87.346849,False,Take that Black History month.Make America Whi...
2,89.52647,False,89.52647,False,Officially in love with M$ Azure #sarcasm #pow...
3,99.254244,True,99.254244,True,"Classy. 🙄🙄🙄 #sarcasm \nHow's that whole ""uniti..."
4,79.498309,True,79.498309,True,#draintheswamp riiiigggghhhttt. #sarcasm https...
5,54.430044,True,54.430044,True,@HanselCreative dude totally!!!!\n\n.\n.\n.\n....
6,91.580739,True,91.580739,True,@amritabhinder n one more thing @TarekFatah te...
7,92.591917,True,92.591917,True,@AJ I look forward to hearing about people goi...
8,95.680305,True,95.680305,True,@SharonLeavy @labourlewis Where's Corbyn? That...
9,77.631688,True,77.631688,True,This is my 'I ❤️ Stansted' face #sarcasm #lond...


### Something is wrong. Using alternative_clean should produce a different string (and different prediction/prediction probability), but in every case the prediction on the string that was cleaned using nlp.cleanTokensTwitter is exactly the same as the prediction on the string cleaned using alternative_clean

# Investigate

### Alternative clean works as expected

In [32]:
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
True


### Predict does not

### These should NOT be equal

In [34]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.04461754,  0.95538246]])}
{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.04461754,  0.95538246]])}


### These SHOULD be equal

In [6]:
print(ml.predict(['Having MS is really easy and fun.'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.92960334,  0.07039666]]), 'prediction': array([False], dtype=bool)}
{'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.12693929,  0.87306071]]), 'prediction': array([ True], dtype=bool)}


### Feature does not

In [35]:
print(nlp.feature('Having MS is really easy and fun. #sarcasm', nlp.cleanTokensTwitter) == nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean))
print(nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean) == nlp.feature('Having MS is really easy and fun.', nlp.cleanTokensTwitter))

True
False


### Investigating feature

In [40]:
nlp.feature??

### First step in feature pipeline is calling repr() on text of tweet

In [37]:
repr1 = repr('Having MS is really easy and fun. #sarcasm')
repr1

"'Having MS is really easy and fun. #sarcasm'"

In [38]:
repr2 = repr('Having MS is really easy and fun.')
repr2

"'Having MS is really easy and fun.'"

#### Calling repr() on a string wraps the text in quote characters, so the text now ends with a quote instead of whitespace. As a result alternative_clean finds no trailing spaces to strip and performs identically to nlp.cleanTokensTwitter

In [84]:
print(alternative_clean(repr1) == nlp.cleanTokensTwitter(repr1))
print(alternative_clean(repr2) == nlp.cleanTokensTwitter(repr2))

True
True


### Repr is also affecting the tokenizer

In [39]:
print(nlp.cleanTokensTwitter(repr1) == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(nlp.cleanTokensTwitter(repr2) == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
False


### After commenting out text = repr(text)

In [14]:
nlp.feature??

#### Alternative clean works as expected

In [15]:
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun. #sarcasm'))
print(alternative_clean('Having MS is really easy and fun. #sarcasm') == nlp.cleanTokensTwitter('Having MS is really easy and fun.'))

False
True


#### Predict seems to work as expected. Probabilities differ slightly.

In [16]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, nlp.cleanTokensTwitter))
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))

{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.02236605,  0.97763395]])}
{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.02251699,  0.97748301]])}


#### Predict works as expected

In [17]:
print(ml.predict(['Having MS is really easy and fun. #sarcasm'], classifier, dvp, alternative_clean))
print(ml.predict(['Having MS is really easy and fun.'], classifier, dvp, nlp.cleanTokensTwitter))

{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.02251699,  0.97748301]])}
{'prediction': array([ True], dtype=bool), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'prediction_probabilities': array([[ 0.02251699,  0.97748301]])}


#### Feature works as expected

In [18]:
print(nlp.feature('Having MS is really easy and fun. #sarcasm', nlp.cleanTokensTwitter) == nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean))
print(nlp.feature('Having MS is really easy and fun. #sarcasm', alternative_clean) == nlp.feature('Having MS is really easy and fun.', nlp.cleanTokensTwitter))

False
True


### Revised attempt at dataframe

In [19]:
# Re-run both predictions on the first 50 sarcastic tweets:
# first with the original cleaning function ...
pre = ml.predict([x["text"] for x in sarcastic_tweets[:50]],
           classifier,
           dvp,
           nlp.cleanTokensTwitter)

# ... then with alternative_clean, which strips trailing whitespace.
pre_space_strip = ml.predict([x["text"] for x in sarcastic_tweets[:50]],
           classifier,
           dvp,
           alternative_clean)

In [20]:
# Side-by-side comparison of the two prediction runs on the same 50 tweets.
# "o_*" columns come from `pre` (original cleaner), "ns_*" columns from
# `pre_space_strip` (alternative_clean). Each *_prob column is the probability,
# in percent, of whichever label was predicted for that row.
sarcastic_df = pd.DataFrame({
    "o_text": [tweet["text"] for tweet in sarcastic_tweets[:50]],
    "o_sarcastic": pre["prediction"],
    "o_prob": [
        probs[1]*100 if predicted else probs[0]*100
        for predicted, probs in zip(pre["prediction"], pre["prediction_probabilities"])
    ],
    "ns_sarcastic": pre_space_strip["prediction"],
    "ns_prob": [
        probs[1]*100 if predicted else probs[0]*100
        for predicted, probs in zip(pre_space_strip["prediction"], pre_space_strip["prediction_probabilities"])
    ],
})
sarcastic_df

Unnamed: 0,ns_prob,ns_sarcastic,o_prob,o_sarcastic,o_text
0,94.153289,True,94.280688,True,@MetalBlonde Come on Karina! What about the #b...
1,70.501927,False,70.501927,False,Take that Black History month.Make America Whi...
2,92.349261,False,92.255461,False,Officially in love with M$ Azure #sarcasm #pow...
3,84.663902,True,84.663902,True,"Classy. 🙄🙄🙄 #sarcasm \nHow's that whole ""uniti..."
4,91.256633,True,91.256633,True,#draintheswamp riiiigggghhhttt. #sarcasm https...
5,54.892505,False,53.528123,False,@HanselCreative dude totally!!!!\n\n.\n.\n.\n....
6,94.93041,True,94.93041,True,@amritabhinder n one more thing @TarekFatah te...
7,95.694558,True,95.694558,True,@AJ I look forward to hearing about people goi...
8,95.925353,False,95.83783,False,@SharonLeavy @labourlewis Where's Corbyn? That...
9,59.829001,True,59.829001,True,This is my 'I ❤️ Stansted' face #sarcasm #lond...


#### Removing the pre-processing step of calling repr() on the text of every tweet fixed the problems with alternative_clean behaving identically to nlp.cleanTokensTwitter. The revised pipeline of NOT calling repr() and removing trailing whitespace now correctly produces different results from the original pipeline. From the dataframe above we can see the predictions remain the same regardless of which method is used, and the prediction probabilities rarely deviate more than 1%. 

#### This does NOT mean that using this revised pipeline will result in only small changes. The dataframe above, in combination with the example tweets at the beginning of the notebook where the prediction swung wildly based on the presence of trailing spaces, show that the original classifier very likely relied on the presence/absence of trailing spaces to make its classifications.

### Re-examining issue

### Hypothesis: Predicting on the same strings from the beginning of the notebook will result in similarly wild swings of prediction/prediction probability when using the old nlp.cleanTokensTwitter method. Using alternative_clean which strips white space from the end of tweets will result in more stable predictions.

In [23]:
test_sarcastic_text = [
             'Having MS is really easy and fun. #sarcasm',
             'Having MS is really easy and fun. ',
             'Having MS is really easy and fun.',
             'Having MS is really easy and fun',
             'The classiest event a bar can throw is a foam party. #Sarcasm',
             'The classiest event a bar can throw is a foam party. ',
             'The classiest event a bar can throw is a foam party.',
             'The classiest event a bar can throw is a foam party',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources ',
             '@thehill Sure I\'ll watch North Korean - China - Russian and Iran news all very creditable sources',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #sarcasm #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse  #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #idiots',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse #',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse ',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse',
             '@mikefreemanNFL Because voicing your beliefs if they\'re political is worse than actual physical abuse.',
    
]

In [22]:
test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	97.7633947628%

Having MS is really easy and fun. 
	Sarcastic
	97.7633947628%

Having MS is really easy and fun.
	Sarcastic
	97.7483006504%

Having MS is really easy and fun
	Sarcastic
	76.0116313602%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	65.2400088794%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	65.2400088794%

The classiest event a bar can throw is a foam party.
	Sarcastic
	65.1879805414%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	96.5156663706%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	94.9862370198%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	94.9862370198%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	94.9989697847%

@mikef

In [25]:
alt_test_and_print(test_sarcastic_text, classifier, dvp)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	97.7483006504%

Having MS is really easy and fun. 
	Sarcastic
	97.7483006504%

Having MS is really easy and fun.
	Sarcastic
	97.7483006504%

Having MS is really easy and fun
	Sarcastic
	76.0116313602%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	65.1879805414%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	65.1879805414%

The classiest event a bar can throw is a foam party.
	Sarcastic
	65.1879805414%

The classiest event a bar can throw is a foam party
	Non-sarcastic
	96.5156663706%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Non-sarcastic
	94.9989697847%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Non-sarcastic
	94.9989697847%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Non-sarcastic
	94.9989697847%

@mikef

In [26]:
# Non-sarcastic control tweets. Each base tweet is expanded into three variants, in this
# order: unchanged, with " #sarcasm" appended, and with only a trailing space appended.
test_serious_text = [
    base_tweet + ending
    for base_tweet in (
        'Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn\'t train today w/ #RBNY.',
        'How many retweets to give the Lakers the number 1 pick? @NBA',
        'Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!',
        '@SwiftOnSecurity They sponsor literally every podcast ever, followed by Crunchyroll.',
    )
    for ending in ('', ' #sarcasm', ' ')
]

In [27]:
# Classify the non-sarcastic control tweets (plain, with "#sarcasm" appended, and with a
# trailing space) using the helper that cleans with nlp.cleanTokensTwitter.
test_and_print(test_serious_text, classifier, dvp)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Non-sarcastic
	99.5243137495%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Non-sarcastic
	99.5212005566%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Non-sarcastic
	99.5212005566%

How many retweets to give the Lakers the number 1 pick? @NBA
	Non-sarcastic
	75.5746950757%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Non-sarcastic
	75.5321183463%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Non-sarcastic
	75.5321183463%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Non-sarcastic
	93.2234507183%

Never thought that clunkiness/cost of microscope might be holding back public health http

In [28]:
# Classify the same non-sarcastic control tweets again, this time through the helper
# that cleans with alternative_clean, to compare against the nlp.cleanTokensTwitter run.
alt_test_and_print(test_serious_text, classifier, dvp)

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY.
	Non-sarcastic
	99.5243137495%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. #sarcasm
	Non-sarcastic
	99.5243137495%

Marsch listed Aurelien Collin & Connor Lade as "day-to-day." Keita got a slight knock, didn't train today w/ #RBNY. 
	Non-sarcastic
	99.5243137495%

How many retweets to give the Lakers the number 1 pick? @NBA
	Non-sarcastic
	75.5746950757%

How many retweets to give the Lakers the number 1 pick? @NBA #sarcasm
	Non-sarcastic
	75.5746950757%

How many retweets to give the Lakers the number 1 pick? @NBA 
	Non-sarcastic
	75.5746950757%

Never thought that clunkiness/cost of microscope might be holding back public health https://t.co/TQQcvBheDf so $1 microscope exciting!
	Non-sarcastic
	93.2234507183%

Never thought that clunkiness/cost of microscope might be holding back public health http

#### The hypothesis seems to be incorrect. The wild swings in prediction/prediction probability produced by either keeping or removing the dangling space from the end of sarcastic tweets are not present regardless of which cleaning method is used. Across all tweets, the difference between using alternative_clean vs.  nlp.cleanTokensTwitter is negligible. This means that repr() was the main culprit behind the wild swings seen in the beginning of the notebook. It also means that the classifier was heavily relying on the influence repr() had on the features of a tweet to make a classification.

#### Another thing to note is the effect of the tweet ending with a period (.). Regardless of the cleaning method, the presence of a trailing period results in a significantly higher swing in prediction probability towards sarcastic. In some cases its removal was enough to swing the prediction entirely from sarcastic to serious. It's hard to tell whether or not this is an artifact from training the classifier on tweets where repr() was called as the first step in the pre-processing pipeline. It's possible that the quotes added by repr() altered the tokens at the end of a tweet and resulted in heavily weighting features that appeared near the end of tweets, like periods and question marks.

# Investigating the effect of repr() on tokens

Using repr(text) as the first step in the feature processing pipeline for a tweet caused the addition of quotation marks at the beginning and end of each tweet

In [29]:
# repr() of a str returns the text wrapped in quote characters; the cell output shows it.
repr("Example tweet. #sarcasm")

"'Example tweet. #sarcasm'"

In [30]:
# repr() also wraps a tweet that has no hashtag in quote characters.
repr("Example tweet.")

"'Example tweet.'"

In [33]:
# Tokens for a tweet ending in ". #sarcasm": repr()-wrapped input first, raw input second.
print(
    nlp.cleanTokensTwitter(repr("Example tweet. #sarcasm")),
    nlp.cleanTokensTwitter("Example tweet. #sarcasm"),
    sep="\n",
)

(["'Example", 'tweet.', "'"], "'Example tweet. '")
(['Example', 'tweet', '.'], 'Example tweet. ')


In [34]:
# Tokens for a tweet ending in a period plus a dangling space: repr()-wrapped, then raw.
print(
    nlp.cleanTokensTwitter(repr("Example tweet. ")),
    nlp.cleanTokensTwitter("Example tweet. "),
    sep="\n",
)

(["'Example", 'tweet.', "'"], "'Example tweet. '")
(['Example', 'tweet', '.'], 'Example tweet. ')


In [35]:
# Tokens for a tweet ending in a period with no dangling space: repr()-wrapped, then raw.
print(
    nlp.cleanTokensTwitter(repr("Example tweet.")),
    nlp.cleanTokensTwitter("Example tweet."),
    sep="\n",
)

(["'Example", 'tweet', '.', "'"], "'Example tweet.'")
(['Example', 'tweet', '.'], 'Example tweet.')


In [36]:
# Tokens for a tweet ending in "#sarcasm" without a period: repr()-wrapped, then raw.
print(
    nlp.cleanTokensTwitter(repr("Example tweet #sarcasm")),
    nlp.cleanTokensTwitter("Example tweet #sarcasm"),
    sep="\n",
)

(["'Example", 'tweet', "'"], "'Example tweet '")
(['Example', 'tweet'], 'Example tweet ')


In [37]:
# Tokens for a tweet ending in a dangling space without a period: repr()-wrapped, then raw.
print(
    nlp.cleanTokensTwitter(repr("Example tweet ")),
    nlp.cleanTokensTwitter("Example tweet "),
    sep="\n",
)

(["'Example", 'tweet', "'"], "'Example tweet '")
(['Example', 'tweet'], 'Example tweet ')


In [38]:
# Tokens for a tweet with no period, hashtag, or dangling space: repr()-wrapped, then raw.
print(
    nlp.cleanTokensTwitter(repr("Example tweet")),
    nlp.cleanTokensTwitter("Example tweet"),
    sep="\n",
)

(["'Example", 'tweet', "'"], "'Example tweet'")
(['Example', 'tweet'], 'Example tweet')


### How did this drastically affect the classifier?

#### For all features EXCEPT punctuation, the tokens of the tweet were used. In all cases, repr(text) replaced text as the first step in the feature processing pipeline

In [16]:
# All ending variants of one tweet (period / no period, each with "#sarcasm", a dangling
# space, or nothing). Kept in a single list so the tweets that are classified and the
# tweets that are tokenized below cannot drift apart.
ms_tweet_variants = [
    'Having MS is really easy and fun. #sarcasm',
    'Having MS is really easy and fun. ',
    'Having MS is really easy and fun.',
    'Having MS is really easy and fun #sarcasm',
    'Having MS is really easy and fun ',
    'Having MS is really easy and fun',
]

# Classify every variant with the original classifier and its dvp.
test_and_print(ms_tweet_variants, classifier, dvp)

# Show the tokens produced when repr() is applied to the text before cleaning.
for variant in ms_tweet_variants:
    print(nlp.cleanTokensTwitter(repr(variant)))

Having MS is really easy and fun. #sarcasm
	Sarcastic
	95.5382460923%

Having MS is really easy and fun. 
	Sarcastic
	95.5382460923%

Having MS is really easy and fun.
	Serious
	92.9694146335%

Having MS is really easy and fun #sarcasm
	Sarcastic
	68.8796539155%

Having MS is really easy and fun 
	Sarcastic
	68.8796539155%

Having MS is really easy and fun
	Sarcastic
	68.4250281747%

(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun.', "'"], "'Having MS is really easy and fun. '")
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun.', "'"], "'Having MS is really easy and fun. '")
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', '.', "'"], "'Having MS is really easy and fun.'")
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', "'"], "'Having MS is really easy and fun '")
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', "'"], "'Having MS is really easy and fun '")
(["'Having", 'MS', 'is', 'really', 'easy', 'and', 'fun', "'"], "'Having MS is really easy and 

#### N-grams are affected by calling repr(). The first word in a tweet is matched to the n-gram 'word instead of word

#### In cases where a dangling space occurs at the end of the tweet, such as when the tweet ends with a hashtag, the last word is matched to the n-gram "word." instead of "word"

#### When no dangling space exists, the period character appears as its own n-gram. Because the vast majority of sarcastic tweets in our dataset ended with #sarcasm (which leaves a dangling space when the hashtag is removed in pre-processing), it is likely that the classifier relied heavily on the period unigram and period-close quote bigram being present in serious tweets and not appearing in sarcastic tweets.

#### Confirming by looking at most influential features

In [23]:
# Request n=3 features from ml.top_n_features for the original classifier, passing the two
# class labels. NOTE(review): the meaning of the positional False flag is defined inside
# ml.top_n_features and is not visible here -- confirm what it toggles.
ml.top_n_features(classifier, dvp, False, "Sarcastic", "Serious", n=3)

[('Serious', 'TOTAL/LENGTH', -7.2950022873247136),
 ('Serious', "'_RAW/LEN", -5.1315491326820348),
 ('Serious', "grm2 . '", -2.0140869680084821)]

#### All three of the most informative features indicating a serious tweet have to do with punctuation which was altered by using repr(). The bigram period close-quote also appears as expected as a feature weighted heavily for classifying tweets as serious.

# Testing V2 classifier

#### For v2 the same twitter dataset was used for training and testing. The sole difference between v1 and v2 is that for the v2 classifier, the pre-processing step of calling repr() was removed, and trailing white space was stripped from all tweets. 

In [6]:
# Load the v2 dvp and the v2 training results from disk. The files are opened with
# context managers so the handles are closed as soon as each object has been unpickled.
# NOTE: pickle.load can execute arbitrary code -- only load pickles created by this project.
with open(pickle_path + 'v2-twitter-dvp.pickle', 'rb') as dvp_file:
    dvp_v2 = pickle.load(dvp_file)
with open(pickle_path + 'v2-twitter-trained-log-unbalanced.pickle', 'rb') as results_file:
    classifier_v2_results = pickle.load(results_file)

# Select the best classifier entry from the loaded v2 results under the "LOG_unbalanced" key.
best_classifier_v2 = ml.best_classifiers([classifier_v2_results], ["LOG_unbalanced"], dvp_v2)

In [7]:
# Unpack the v2 classifier and its matching dvp from the "LOG_unbalanced" entry.
classifier_v2, dvp_v2 = (
    best_classifier_v2["LOG_unbalanced"][field] for field in ("classifier", "dvp")
)

In [12]:
# All ending variants of one tweet (period / no period, each with "#sarcasm", a dangling
# space, or nothing). Kept in a single list so the tweets that are classified and the
# tweets that are tokenized below cannot drift apart.
ms_tweet_variants = [
    'Having MS is really easy and fun. #sarcasm',
    'Having MS is really easy and fun. ',
    'Having MS is really easy and fun.',
    'Having MS is really easy and fun #sarcasm',
    'Having MS is really easy and fun ',
    'Having MS is really easy and fun',
]

# Classify every variant with the v2 classifier and its dvp.
test_and_print(ms_tweet_variants, classifier_v2, dvp_v2)

# Show the tokens for the raw text (no repr() step) of each variant.
for variant in ms_tweet_variants:
    print(nlp.cleanTokensTwitter(variant))

Having MS is really easy and fun. #sarcasm
	Sarcastic
	81.9169344669%

Having MS is really easy and fun. 
	Sarcastic
	81.9169344669%

Having MS is really easy and fun.
	Sarcastic
	81.9169344669%

Having MS is really easy and fun #sarcasm
	Sarcastic
	98.4449320038%

Having MS is really easy and fun 
	Sarcastic
	98.4449320038%

Having MS is really easy and fun
	Sarcastic
	98.4449320038%

(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun.')
(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun.')
(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun', '.'], 'Having MS is really easy and fun.')
(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun'], 'Having MS is really easy and fun')
(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun'], 'Having MS is really easy and fun')
(['Having', 'MS', 'is', 'really', 'easy', 'and', 'fun'], 'Having MS is really easy and fun')


In [15]:
# Repeat the sarcastic-tweet check with the v2 classifier and its dvp.
test_and_print(test_sarcastic_text, classifier_v2, dvp_v2)

Having MS is really easy and fun. #sarcasm
	Sarcastic
	81.9169344669%

Having MS is really easy and fun. 
	Sarcastic
	81.9169344669%

Having MS is really easy and fun.
	Sarcastic
	81.9169344669%

Having MS is really easy and fun
	Sarcastic
	98.4449320038%

The classiest event a bar can throw is a foam party. #Sarcasm
	Sarcastic
	99.9983285883%

The classiest event a bar can throw is a foam party. 
	Sarcastic
	99.9983285883%

The classiest event a bar can throw is a foam party.
	Sarcastic
	99.9983285883%

The classiest event a bar can throw is a foam party
	Sarcastic
	99.9999530524%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources #sarcasm
	Sarcastic
	99.9990420016%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources 
	Sarcastic
	99.9990420016%

@thehill Sure I'll watch North Korean - China - Russian and Iran news all very creditable sources
	Sarcastic
	99.9990420016%

@mikefreemanNFL Becaus

# Conclusion

The use of repr() had a significant impact on the classifier and made it appear to be extremely accurate, when in reality the classifier was heavily relying on programming artifacts resulting from the combination of calling repr() on tweet texts and tokenizing that output.

Looking at the results of classification using the v2 classifier, we can see that classification and prediction probability are completely unaffected by trailing spaces. While tweets ending in periods versus the same exact tweet ending without a period may differ slightly in their prediction probability, the swing is not as dramatic as in V1 and most importantly doesn't rely on a programming artifact to reach that conclusion. One possible explanation is that in our twitter dataset there are simply more serious tweets ending with a period than sarcastic tweets ending with a period. Another possible explanation is that the choice of ending a tweet with a period versus omitting it is actually informative when determining whether the user who tweeted it was being sarcastic or not.  