## Punctuation features


In [1]:
import itertools
import re
from collections import Counter

from json_io import tweet_iterate
from basic_nlp import nlp, PUNCTUATION_RE

# Relative path of the JSON file that is read with tweet_iterate() further down.
PATH_SARCASTIC_TWEETS = "../json/sarcastic/unique.json"
# NOTE(review): `n` is not referenced by any cell visible in this notebook —
# confirm whether the nlp() instance is still needed here.
n = nlp()



In [3]:
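# See the "Issue: Surrogate escaped strings" section further down for an explanation
# of is_surrogate_escaped() and remove_surrogate_escaping().

# Both helpers are the methods suggested by the author of the article linked in that section.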

def remove_surrogate_escaping(s, method='ignore'):
    """
    Return a copy of `s` that can be safely encoded as UTF-8.

    The string is round-tripped through UTF-8 using `method` as the codec
    error handler, so every character that cannot be encoded (for example a
    lone surrogate) is handled according to that handler.

    s: input string
    method: 'ignore' drops the offending characters,
            'replace' substitutes each of them with '?'
    returns the cleaned string
    raises AssertionError if `method` is neither 'ignore' nor 'replace'
    """
    assert method in ('ignore', 'replace'), 'invalid removal method'
    utf8_bytes = s.encode('utf-8', errors=method)
    return utf8_bytes.decode('utf-8')

def is_surrogate_escaped(s):
    """
    Tell whether `s` contains surrogate code points that UTF-8 refuses to encode.

    s: input string
    returns True if encoding `s` as UTF-8 fails with the reason
            'surrogates not allowed', False if the encoding succeeds
    raises UnicodeEncodeError (re-raised unchanged) if encoding fails for any
           other reason
    """
    try:
        s.encode('utf-8')
    except UnicodeEncodeError as err:
        # Only the surrogate failure is treated as a positive answer;
        # every other encoding problem is propagated to the caller.
        if err.reason != 'surrogates not allowed':
            raise
        return True
    else:
        return False

@MetalBlonde Come on Karina! What about the #bowlinggreenmassacre ? Damn those Muslims!  #alternatefacts #Sarcasm #WorstPresidentEver
Take that Black History month.Make America White again, with fatty tissue around the heart #sarcasm #trump #first100
https://t.co/SXnqNmRgqr
Ears lowered big ears to lower ‍♂️ #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm… https://t.co/oXK7RHlODU
Officially in love with M$ Azure #sarcasm #powerless
#lol #funny #comedy #sarcasm RT DivinityLA: Introducing "Friendship", a bracelet set that celebrates the love we s… https://t.co/ilaFtGM5Ed
Classy.  #sarcasm 
How's that whole "uniting the country"-thing working out, Kellyanne? https://t.co/TQhF8isVEP
#draintheswamp riiiigggghhhttt. #sarcasm https://t.co/S9nhUHXM9e
@HanselCreative dude totally!!!!

.
.
.
.
.
#sarcasm
@amritabhinder n one more thing @TarekFatah teaching loyalty to Indian Muslims and he is even not loyal with own country #sarcasm
@AJ I look forward to hearing about peopl

In [6]:
def punctuation_features(s):
    r"""
    Process a string for punctuation features.

    Punctuation is whatever PUNCTUATION_RE matches. According to the original
    author's note, that is the regular expression:
    [\'\!\"\#\$\%\&\/\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\}\|\~\u2026]
    i.e. the punctuation in string.punctuation plus the unicode char \u2026
    (ellipsis). PUNCTUATION_RE is defined outside this notebook, so verify the
    exact character set there.

    s: input string
    returns {punctuation_mark: (raw #, % of length of s, % of total # of punctuation marks found in s)}
            Both ratios are rounded to 4 decimal places. Keys appear in order
            of first occurrence in s. An input without any punctuation
            (including the empty string) yields {}.

    example:
    punctuation_features("Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed consequat magna eu facilisis!!?")
    {'!': (2, 0.0217, 0.4),
     ',': (1, 0.0109, 0.2),
     '.': (1, 0.0109, 0.2),
     '?': (1, 0.0109, 0.2)}
    """
    found = re.findall(PUNCTUATION_RE, s)
    if not found:
        # Nothing matched; returning early also keeps both divisions below
        # away from a zero denominator.
        return {}

    text_length = len(s)
    total_marks = len(found)
    # Tally every mark in a single pass instead of calling list.count()
    # three times for each match.
    return {mark: (count,
                   round(count / text_length, 4),
                   round(count / total_marks, 4))
            for mark, count in Counter(found).items()}

In [8]:
# Show the punctuation features of the first 25 tweets from the sarcastic set.
for tweet in itertools.islice(tweet_iterate(PATH_SARCASTIC_TWEETS), 25):
    text = tweet["text"]
    # Clean the text first when it carries surrogates that UTF-8 cannot encode.
    text = remove_surrogate_escaping(text) if is_surrogate_escaped(text) else text
    print(
        "text: {} \n punctuation features: {} \n".format(
            text, punctuation_features(text)
        )
    )

text: @MetalBlonde Come on Karina! What about the #bowlinggreenmassacre ? Damn those Muslims!  #alternatefacts #Sarcasm #WorstPresidentEver 
 punctuation features: {'@': (1, 0.0075, 0.125), '?': (1, 0.0075, 0.125), '#': (4, 0.0301, 0.5), '!': (2, 0.015, 0.25)} 

text: Take that Black History month.Make America White again, with fatty tissue around the heart #sarcasm #trump #first100
https://t.co/SXnqNmRgqr 
 punctuation features: {':': (1, 0.0071, 0.1), '#': (3, 0.0214, 0.3), '.': (2, 0.0143, 0.2), '/': (3, 0.0214, 0.3), ',': (1, 0.0071, 0.1)} 

text: Ears lowered big ears to lower ‍♂️ #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm… https://t.co/oXK7RHlODU 
 punctuation features: {':': (1, 0.0077, 0.0667), '.': (1, 0.0077, 0.0667), '#': (9, 0.0692, 0.6), '/': (3, 0.0231, 0.2), '…': (1, 0.0077, 0.0667)} 

text: Officially in love with M$ Azure #sarcasm #powerless 
 punctuation features: {'$': (1, 0.0192, 0.3333), '#': (2, 0.0385, 0.6667)} 

text: #lol #funny #com

## Issue: Surrogate escaped strings

Read [http://lucumr.pocoo.org/2013/7/2/the-updated-guide-to-unicode/](http://lucumr.pocoo.org/2013/7/2/the-updated-guide-to-unicode/)

#### TL;DR
These are Unicode strings that cannot be encoded with a Unicode encoding (such as UTF-8) because they are actually invalid. They are created by APIs that assume the data is in one specific encoding but cannot guarantee it, because the underlying system does not fully enforce that encoding. This behavior is provided by the `surrogateescape` error handler.

#### Example tweet

Ears lowered big ears to lower \ud83d\udc87\u200d\u2642\ufe0f\ud83d\ude02 #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm\u2026 https://t.co/oXK7RHlODU
_______
[https://twitter.com/jasonstats09/status/827589626944770049](https://twitter.com/jasonstats09/status/827589626944770049)

![alt text](https://image.ibb.co/jPkKBF/Screen_Shot_2017_03_16_at_4_03_36_PM.png)

In [9]:
# Example tweet text with its \uXXXX escapes kept exactly as they appear in the markdown above.
tweet = "Ears lowered big ears to lower \ud83d\udc87\u200d\u2642\ufe0f\ud83d\ude02 #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm\u2026 https://t.co/oXK7RHlODU"
# print(tweet)
# Uncomment the line above to try it: it crashes with UnicodeEncodeError (for some reason the
# error is dumped to the terminal running the Jupyter notebook, and the notebook kernel crashes).

In [10]:
# The tweet only becomes printable after a UTF-8 round trip that uses the
# "ignore" or the "replace" error handler; show the result of each in turn.
for handler in ("ignore", "replace"):
    print(tweet.encode("utf-8", handler).decode("utf-8"))

Ears lowered big ears to lower ‍♂️ #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm… https://t.co/oXK7RHlODU
Ears lowered big ears to lower ??‍♂️?? #Trim #BigEars #Jokes #Selfie #MugShot #Weekend #Smile #Happy #Sarcasm… https://t.co/oXK7RHlODU
