# Sentiment Analysis of Microblog Data Streams

## Outline:

- preprocessing
    - [X] reduce whitespace
    - [X] convert to lower case
    - [X] remove punctuation
    - [X] remove stop words
    - [X] remove numbers
    - [X] expand abbreviations (by normalization)
    - [X] replace emoticons
    - [X] remove tweets that are too short
    - [X] reduce repeated letters (looool $\rightarrow$ lol)
- classification
    - [X] predict organization
    - [X] predict 3-way sentiment
    - [X] Try to use temporal data
    - [X] 5-way sentiment prediction

## Preprocessing

In [1]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment warning: later cells add columns to
# `df_filtered`, which is a row selection taken from `df`.
pd.options.mode.chained_assignment = None

# Training tweets. Later cells read the TweetText, TweetDate, Topic and
# Sentiment columns of this frame.
df = pd.read_csv('data/Train.csv')

In [2]:
def load_vocab_norm(path: str) -> dict:
    """Load a word-normalization dictionary from a whitespace-separated file.

    Every non-blank line must hold exactly two fields: a raw token followed
    by its normalized form.

    Args:
        path: Location of the dictionary file; it is read as UTF-8.

    Returns:
        A dict mapping each raw token to its replacement. If a token appears
        on several lines, the last line wins.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    mapping = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank / whitespace-only lines carry no entry; skipping them
                # avoids an unpacking error on e.g. a trailing empty line.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{lineno}: expected 2 fields, got {len(fields)}")
            raw, normalized = fields
            mapping[raw] = normalized
    return mapping
# Normalization dictionaries; `clean` applies every entry of this list to the
# tokenized tweet.
vocabs_norm = [load_vocab_norm('emnlp_dict.txt')]

In [3]:
import typing
import re
import unicodedata

from nltk.corpus import stopwords
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

from static_vocabularies import contraction_mapping, emoticons

# Stop-word lists from the NLTK corpus, keyed by language name.
stopwords_vocs = {
    language: stopwords.words(language)
    for language in ('russian', 'english', 'spanish')
}
# TODO (original note): add spain, italia
# Single flat set over all languages, used for membership tests in `clean`.
all_stopwords = set().union(*stopwords_vocs.values())

# Patterns for tweet content that is treated as irrelevant and blanked out.
#
# NOTE: inside a raw string a regex escape takes ONE backslash (r"\d", r"\.").
# r"\\d" means "a literal backslash followed by the letter d", and
# r"[\\.\\-\\/]" is a character class WITHOUT the hyphen. The CASHTAG, DATE
# and TIME patterns previously carried such doubled backslashes, so inputs
# like "5pm", "1,500$" or "12-10-2011" were not (fully) matched.
regexes = {
    'URL': r"""(?xi)\b(?:(?:https?|ftp|file):\/\/|www\.|ftp\.|pic\.|twitter\.|facebook\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:;,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:;,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])""",
    'EMOJI': u'([\U0001F1E0-\U0001F1FF])|([\U0001F300-\U0001F5FF])|([\U0001F600-\U0001F64F])|([\U0001F680-\U0001F6FF])|([\U0001F700-\U0001F77F])|([\U0001F800-\U0001F8FF])|([\U0001F900-\U0001F9FF])|([\U0001FA00-\U0001FA6F])|([\U0001FA70-\U0001FAFF])|([\U00002702-\U000027B0])|([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])',
    'HASHTAG': r"\#\b[\w\-\_]+\b",
    'EMAIL': r"(?:^|(?<=[^\w@.)]))(?:[\w+-](?:\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(?:\.(?:[a-z]{2,})){1,3}(?:$|(?=\b))",
    'MENTION': r"@[A-Za-z0-9]+",
    # Amount with a leading or a trailing currency sign, e.g. "$3.5M", "1,500$".
    'CASHTAG': r"(?:[$\u20ac\u00a3\u00a2]\d+(?:[.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?)|(?:\d+(?:[.,']\d+)?[$\u20ac\u00a3\u00a2])",
    # Three alternatives: "<day> Month <day/year>", "<day> Month [...]", and
    # numeric dates separated by '.', '-' or '/'.
    'DATE': r"(?:(?:(?:(?:(?<!:)\b\'?\d{1,4},? ?)?\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b))|(?:(?:(?<!:)\b\'?\d{1,4},? ?)\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b)?))|(?:\b(?<!\d\.)(?:(?:(?:[0123]?[0-9][\.\-\/])?[0123]?[0-9][\.\-\/][12][0-9]{3})|(?:[0123]?[0-9][\.\-\/][0123]?[0-9][\.\-\/][12]?[0-9]{2,3}))(?!\.\d)\b))",
    # "5pm", "9.30am", "21:53", "21:53:25 PM", ...
    'TIME': r'(?:(?:\d+)?\.?\d+(?:AM|PM|am|pm|a\.m\.|p\.m\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\.m\.|p\.m\.))?)',
    #'EMPHASIS': r"(?:\*\b\w+\b\*)",
    #'ELONG': r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b"
    'NUMBERS': r"[0-9]+",
}
# Compile once so the patterns can be reused for every tweet.
regexes = {name: re.compile(pattern) for name, pattern in regexes.items()}

# Company names that must survive cleaning: `clean` strips a leading '#' or
# '@' from these words before hashtags and mentions are removed.
target_word = {'apple', 'microsoft', 'google', 'twitter'}

def clean(msg: str) -> str:
    """Normalize a raw tweet into a space-separated string of tokens.

    Steps, in order: Unicode NFKD normalization, URL removal, contraction
    expansion, emoticon replacement, lower-casing, un-tagging of the target
    company names, removal of emoji / hashtags / mentions / cashtags / dates /
    times / numbers, collapsing of letters repeated three or more times,
    punctuation removal, dictionary-based word normalization and stop-word
    removal.

    Args:
        msg: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces. The result is an empty
        string when nothing survives the cleaning.
    """
    msg = ' ' + msg

    # normalize unicode
    msg = unicodedata.normalize('NFKD', msg)

    msg = regexes['URL'].sub(" ", msg)
#     try:
#         lang = detect(msg)
#     except LangDetectException:
#         lang = 'en'
#     if lang == 'en':
#         msg = msg.encode('ascii', errors='ignore')\
#                  .decode('utf8', errors='ignore')

    # expand contractions
    for k, v in contraction_mapping.items():
        msg = msg.replace(k, v)

    # replace emoticons (str.replace is already a no-op when `k` is absent)
    for k, v in emoticons.items():
        msg = msg.replace(k, v)

    # unify letter case
    msg = msg.lower()

    # keep a target word that appears as a hashtag or mention by dropping
    # its leading '#' / '@' before those patterns are removed below
    for w in target_word:
        msg = re.sub(f'(#|@){w}\\b', w, msg)

    # remove irrelevant content
    for pn in ['EMOJI', 'HASHTAG', 'MENTION', 'CASHTAG', 'DATE', 'TIME', 'NUMBERS']:
        msg = regexes[pn].sub(" ", msg)

    # reduce repeated letters: three or more of the same letter become one
    msg = re.sub(r"([a-z])\1(\1)+", r'\1', msg)

    # remove punctuation
    msg = re.sub(r"[\[\]\{\}/\\\(\)\"'`\-_:;.,!?*”“»@%><+]+", " ", msg)

    # split into words
    words = msg.split()

    # language normalization
    # Built eagerly on purpose: a lazy map() over a lambda would look up
    # `norm` only when consumed, i.e. after the loop, so every stacked map
    # would use the LAST dictionary instead of its own.
    for norm in vocabs_norm:
        words = [norm.get(w, w) for w in words]

    # remove stopwords
    words = [w for w in words if w not in all_stopwords]

    # remove short words
#     words = [w for w in words if len(w) > 2]

    return ' '.join(words)

## Learning for organization

In [4]:
from sklearn.feature_extraction import text
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.compose import ColumnTransformer

In [5]:
#CountVectorizer = text.CountVectorizer()
#TfidfVectorizer = text.TfidfVectorizer()
#HashVectorizer = text.HashingVectorizer()

In [6]:
# TF-IDF over word 1- to 4-grams; min_df=2 drops terms seen in fewer than two
# documents, sublinear_tf=True uses 1 + log(tf) instead of the raw count.
word_vectorizer = text.TfidfVectorizer(
    analyzer='word', ngram_range=(1, 4),
    min_df=2, use_idf=True, sublinear_tf=True)
# Same weighting over character 3- to 5-grams.
char_vectorizer = text.TfidfVectorizer(
          analyzer='char', ngram_range=(3, 5),
          min_df=2, use_idf=True, sublinear_tf=True)
# Single-step pipeline whose output is the word and character n-gram
# features placed side by side.
ngrams_vectorizer = Pipeline([
    ('feats', 
     FeatureUnion([('word_ngram', word_vectorizer),
                    ('char_ngram', char_vectorizer),
                  ])),])

In [7]:
# Clean every training tweet, then keep only the rows whose cleaned text is
# non-empty.
df['cleaned'] = df['TweetText'].apply(clean)
df_filtered = df.loc[df['cleaned'].apply(len) > 0]
# Debug helper: print a sample of cleaned tweets.
# for l in df_filtered['cleaned'][200:300]:
#     print(l)

In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, Normalizer, OneHotEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report


# encoder=LabelEncoder()
# label=encoder.fit_transform(df['Topic'])

In [9]:
# Fit the n-gram vectorizer on the cleaned training tweets. Later cells only
# call `ngrams_vectorizer.transform`, so they depend on this fit.
feature=ngrams_vectorizer.fit_transform(df_filtered['cleaned'].values)

# Organization classifier, trained on the whole filtered training set
# (the commented-out line below would hold out 20% instead).
clf_org=LinearSVC()
# x_train, x_test, y_train, y_test = train_test_split(feature, df_filtered['Topic'], test_size=0.2, stratify=df_filtered['Topic'])
clf_org.fit(feature, df_filtered['Topic'])

LinearSVC()

### Test

In [10]:
# Evaluate the organization classifier on the held-out test file.
# NOTE(review): unlike the training frame, test rows whose cleaned text is
# empty are not filtered out here.
test_df = pd.read_csv('data/Test.csv')
test_df['cleaned'] = test_df['TweetText'].apply(clean)
test_feature = ngrams_vectorizer.transform(test_df['cleaned'])
print(classification_report(test_df['Topic'], clf_org.predict(test_feature)))

              precision    recall  f1-score   support

       apple       0.95      0.96      0.95        98
      google       0.84      0.77      0.80        79
   microsoft       0.90      0.73      0.81        78
     twitter       0.72      0.89      0.79        87

    accuracy                           0.85       342
   macro avg       0.85      0.84      0.84       342
weighted avg       0.85      0.85      0.85       342



##  Learning for 3-way sentiment

In [11]:
# Maps organization names to integer codes.
org_encoder = LabelEncoder()
org_encoder.fit(df['Topic'])

# Sentiment features = n-gram features plus one extra column holding the
# integer code of the organization predicted by `clf_org`.
# NOTE: this union is only ever used through `.transform`; it is never fitted
# itself and relies on `ngrams_vectorizer`, `clf_org` and `org_encoder`
# having been fitted beforehand.
# NOTE(review): a LabelEncoder is used as a pipeline step here; whether that
# is accepted may depend on the installed scikit-learn version — confirm.
sentiment_feature = FeatureUnion(
    [('text', ngrams_vectorizer), 
     ('org', Pipeline([
        ('vect', ngrams_vectorizer),
        ('clf', FunctionTransformer(clf_org.predict)),
        ('encode', org_encoder),
        # turn the 1-D code vector into a single column so it can be stacked
        ('tranp', FunctionTransformer(lambda r: np.asmatrix(r).transpose())),
        # ('norm', Normalizer())
      ]))
    ])

clf_sentiment=LinearSVC()
clf_sentiment.fit(sentiment_feature.transform(df_filtered['cleaned']), df_filtered['Sentiment'])

LinearSVC()

### Test

In [12]:
test_sentiment_feature = sentiment_feature.transform(test_df['cleaned'])
print(classification_report(test_df['Sentiment'], clf_sentiment.predict(test_sentiment_feature)))

              precision    recall  f1-score   support

  irrelevant       0.74      0.93      0.82       105
    negative       0.70      0.47      0.56        49
     neutral       0.79      0.74      0.76       156
    positive       0.53      0.50      0.52        32

    accuracy                           0.74       342
   macro avg       0.69      0.66      0.67       342
weighted avg       0.74      0.74      0.73       342



### Add temporal feature

In [13]:
from scipy.sparse import hstack

# TweetDate is split on whitespace: field 0 is used as the weekday and
# field 1 as the month (e.g. "Tue Oct 18 21:53:25 +0000 2011").
df_filtered['weekday'] = df_filtered['TweetDate'].apply(lambda s: s.split()[0])
df_filtered['month'] = df_filtered['TweetDate'].apply(lambda s: s.split()[1])

org_encoder = LabelEncoder()
org_encoder.fit(df['Topic'])

# One-hot encode weekday and month.
# handle_unknown='ignore' makes a weekday/month that never occurred in the
# training data encode as all zeros instead of raising at transform time;
# categories that were seen during fit are encoded exactly as before.
time_feature = ColumnTransformer([
    ('weekday', OneHotEncoder(handle_unknown='ignore'), ['weekday']),
    ('month', OneHotEncoder(handle_unknown='ignore'), ['month']),
])
time_feature.fit(df_filtered[['weekday', 'month']])

# Sentiment classifier on text/organization features stacked with the
# temporal one-hot columns.
clf_sentiment_with_time=LinearSVC()
train_text_features = sentiment_feature.transform(df_filtered['cleaned'])
train_time_features = time_feature.transform(df_filtered)
clf_sentiment_with_time.fit(
    hstack((train_text_features, train_time_features)),
    df_filtered['Sentiment'])



LinearSVC()

In [14]:
# Derive the same weekday / month columns for the test tweets.
test_df['weekday'] = test_df['TweetDate'].apply(lambda s: s.split()[0])
test_df['month'] = test_df['TweetDate'].apply(lambda s: s.split()[1])

# a: text + organization features, b: one-hot temporal features.
a = sentiment_feature.transform(test_df['cleaned'])
b = time_feature.transform(test_df)
print(classification_report(test_df['Sentiment'], clf_sentiment_with_time.predict(hstack((a, b)))))

              precision    recall  f1-score   support

  irrelevant       0.73      0.93      0.82       105
    negative       0.70      0.53      0.60        49
     neutral       0.81      0.72      0.76       156
    positive       0.52      0.50      0.51        32

    accuracy                           0.74       342
   macro avg       0.69      0.67      0.67       342
weighted avg       0.74      0.74      0.73       342





## 5-way sentiment

In [15]:
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.metrics import mean_squared_error

# Ridge classifier on the same sentiment labels, for comparison with the
# LinearSVC results above.
clf_sentiment5=RidgeClassifier()
clf_sentiment5.fit(sentiment_feature.transform(df_filtered['cleaned']), df_filtered['Sentiment'])

print(classification_report(test_df['Sentiment'], clf_sentiment5.predict(test_sentiment_feature)))

# Numeric sentiment score per label; 'irrelevant' is scored 0, the same as
# 'neutral'.
d = {'positive': +1, 'neutral': 0, 'negative': -1, 'irrelevant': 0}
df_filtered['sent_score'] = df_filtered['Sentiment'].apply(lambda s: d[s])
# Ridge regression on the score; its continuous output is what the next cell
# buckets into five classes.
clf = Ridge()
clf.fit(sentiment_feature.transform(df_filtered['cleaned']), df_filtered['sent_score'])

test_df['sent_score'] = test_df['Sentiment'].apply(lambda s: d[s])
print(mean_squared_error(test_df['sent_score'], clf.predict(test_sentiment_feature)))

              precision    recall  f1-score   support

  irrelevant       0.72      0.92      0.81       105
    negative       0.66      0.47      0.55        49
     neutral       0.77      0.72      0.75       156
    positive       0.56      0.47      0.51        32

    accuracy                           0.73       342
   macro avg       0.68      0.65      0.65       342
weighted avg       0.72      0.73      0.72       342

0.1607128775012604


In [16]:
def float_to_catetory(v: float) -> int:
    """Bucket a continuous score into one of five integer classes.

    Scores below -0.75 map to -2, below -0.25 to -1, below 0.25 to 0,
    below 0.75 to 1, and everything else to 2. Each upper bound is exclusive.
    """
    # (exclusive upper bound, class) pairs in ascending order.
    buckets = ((-0.75, -2), (-0.25, -1), (0.25, 0), (0.75, 1))
    for upper_bound, category in buckets:
        if v < upper_bound:
            return category
    return 2

# Bucket the Ridge regression output for every test tweet into the five
# integer classes (-2 .. 2).
test_df['5way_pred'] = np.vectorize(float_to_catetory)(clf.predict(sentiment_feature.transform(test_df['cleaned'])))

In [17]:
from IPython.display import display, HTML
# Render the first 30 test tweets with their 5-way prediction as an HTML table.
display(HTML(test_df[['TweetText', '5way_pred']][:30].to_html()))

Unnamed: 0,TweetText,5way_pred
0,RT @JamaicanIdler: Lmao I think @apple is onto something magical! I am DYING!!! haha. Siri suggested where to find whores and where to h ...,1
1,"Bravo, @Apple! http://t.co/BgoTzj7K",0
2,"Day305, I'm thankful for the great customer service received today from @Apple via phone CS, new phone on the way #365daysofgratefulness",0
3,i love this. so much. thank you @apple. http://t.co/Ui8lOEzX,1
4,I &lt;3 @apple http://t.co/ondXWpEr,1
5,"dammit, listening to siri is making me want to upgrade. well played @apple.",-1
6,. @apple & @AT&T u cannot tell me there isn't at least 1 64GB iPhone 4S in LA or Vegas!! Give me a fucking break!!!!,-1
7,"I am so done with @Att and @apple 's profitering and lack of customer service, so fucking down with both!!!",-1
8,@rogerweir no but I have the option of a replacement iPhone 4s ? Not sure if I want one after having 2 duff iPhones. @O2 @iphone4s @apple,0
9,removing all @apple shit.,-1


# Demo

In [18]:
def guess_org(msg, time):
    """Predict which organization a single raw tweet is about.

    `msg` is cleaned and vectorized before classification. `time` is part of
    the signature but is not used by this function.
    """
    features = ngrams_vectorizer.transform([clean(msg)])
    predictions = clf_org.predict(features)
    return predictions[0]

def guess_sentiment(msg, time):
    """Predict the sentiment label of a single raw tweet.

    `time` is split on whitespace; its first field is used as the weekday
    and its second as the month (e.g. "Tue Oct 18 21:53:25 +0000 2011").
    """
    cleaned = clean(msg)
    fields = time.split()
    when = pd.DataFrame([{'weekday': fields[0], 'month': fields[1]}])

    # Text/organization features and temporal features, side by side.
    combined = hstack((
        sentiment_feature.transform([cleaned]),
        time_feature.transform(when),
    ))
    return clf_sentiment_with_time.predict(combined)[0]


In [21]:
# Example input for the two demo helpers.
msg = "Google is very good"
time = "Tue Oct 18 21:53:25 +0000 2011"
# NOTE(review): the next four statements repeat the temporal test cell; the
# resulting `a` and `b` are not used by the two print calls below.
test_df['weekday'] = test_df['TweetDate'].apply(lambda s: s.split()[0])
test_df['month'] = test_df['TweetDate'].apply(lambda s: s.split()[1])

a = sentiment_feature.transform(test_df['cleaned'])
b = time_feature.transform(test_df)

print(guess_org(msg, time))
print(guess_sentiment(msg, time))

google
positive


