### Supervised Sentiment Analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Tab-separated file with a header row. quoting=3 is csv.QUOTE_NONE, so the
# double quotes around each id/review stay in the data (visible in the preview).
review_df = pd.read_csv(
    '../data/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# Show the first review exactly as loaded (still contains <br /> tags and punctuation).
print(review_df.loc[0, 'review'])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [4]:
import re

# Two-step cleanup of the review text:
#   1. turn HTML line breaks into a single space,
#   2. replace every character that is not an ASCII letter (digits,
#      punctuation, quotes, ...) with a space.
review_df['review'] = (
    review_df['review']
    .str.replace('<br />', ' ')
    .apply(lambda text: re.sub('[^a-zA-Z]', ' ', text))
)

In [5]:
# Same review after cleaning: only letters and spaces should remain.
print(review_df.loc[0, 'review'])

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [6]:
from sklearn.model_selection import train_test_split

# Target: the 0/1 sentiment label. Features: everything except the id and the label.
class_df = review_df['sentiment']
feature_df = review_df.drop(['id', 'sentiment'], axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size=0.3,
    random_state=156,
)

X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline: uni-/bi-gram counts (English stop words removed) feeding a
# logistic regression with C=10.
pipeline = Pipeline(steps=[
    ('cnt_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression(C=10)),
])
pipeline.fit(X_train['review'], y_train)

# Hard labels feed accuracy; the probability column for label 1 feeds ROC-AUC.
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

acc = accuracy_score(y_test, pred)
rocauc = roc_auc_score(y_test, pred_probs)
print(f'Accuracy: {acc:.4f}\nROC-AUC: {rocauc:.4f}')

Accuracy: 0.8860
ROC-AUC: 0.9503


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same uni-/bi-gram + logistic-regression setup as the CountVectorizer run,
# but with TF-IDF weighting instead of raw counts. The vectorizer step is
# named for what it actually is (it was a copy-paste 'cnt_vect' before), so
# pipeline.named_steps and any later grid-search parameter names read correctly.
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10))
])

pipeline.fit(X_train['review'], y_train)

# Hard labels feed accuracy; the probability column for label 1 feeds ROC-AUC.
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]
acc = accuracy_score(y_test, pred)
rocauc = roc_auc_score(y_test, pred_probs)
print(f'Accuracy: {acc:.4f}\nROC-AUC: {rocauc:.4f}')

Accuracy: 0.8936
ROC-AUC: 0.9598


### Unsupervised Sentiment Analysis

VADER

In [9]:
import nltk
# Fetch the 'vader_lexicon' resource needed by the VADER analyzer cells below.
# quiet=True keeps the downloader's progress messages out of the cell output.
nltk.download('vader_lexicon', quiet=True)

True

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score the first (cleaned) review; the result is a dict with the
# 'neg' / 'neu' / 'pos' / 'compound' entries shown in the output.
senti_analyzer = SentimentIntensityAnalyzer()
first_review_text = review_df.loc[0, 'review']
senti_score = senti_analyzer.polarity_scores(first_review_text)
print(senti_score)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [11]:
# Shared analyzer, built lazily on first use so the VADER lexicon is loaded
# once instead of once per review.
_vader_analyzer = None


def _shared_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


def vader_polarity(review, threshold=0.1, analyzer=None):
    """Label a review 1 (positive) or 0 (negative) from its VADER compound score.

    Parameters
    ----------
    review : str
        Text to score.
    threshold : float, default 0.1
        The review is labelled 1 when its compound score is >= this value.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, a single shared
        ``SentimentIntensityAnalyzer`` is reused across calls.

    Returns
    -------
    int
        1 if the compound score reaches ``threshold``, otherwise 0.
    """
    if analyzer is None:
        analyzer = _shared_vader_analyzer()

    compound = analyzer.polarity_scores(review)['compound']
    return 1 if compound >= threshold else 0

In [12]:
# Label every review with VADER (threshold 0.1), then pull the true labels and
# the predictions out as arrays for the metric functions.
review_df['vader_preds'] = review_df['review'].apply(vader_polarity, threshold=0.1)
vader_preds = review_df['vader_preds'].values
y_target = review_df['sentiment'].values

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, roc_auc_score

# Confusion matrix first, then the scalar metrics in a fixed order.
print('confusion_matrix\n', confusion_matrix(y_target, vader_preds))
for label, metric in (
    ('accuracy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
):
    print(label, metric(y_target, vader_preds))

confusion_matrix
 [[ 6730  5770]
 [ 1857 10643]]
accuracy: 0.69492
precision: 0.64844939986596
recall: 0.85144


SentiWordNet

In [14]:
from nltk.corpus import wordnet as wn

# The WordNet corpus is not downloaded anywhere else in this notebook, so
# fetch it before the first lookup; without it wn.synsets() raises LookupError
# on a fresh NLTK install. The call is quiet and is not the cell's last
# expression, so the cell output is unchanged.
nltk.download('wordnet', quiet=True)

term = 'present'

# All senses (synsets) WordNet knows for the term, across parts of speech.
synsets = wn.synsets(term)
print(type(synsets))
print(len(synsets))
print(synsets)

<class 'list'>
18
[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [15]:
# One four-line record per synset, separated by a blank line.
for synset in synsets:
    print(
        f'\nSynset name: {synset.name()}',
        f'POS: {synset.lexname()}',
        f'Definition: {synset.definition()}',
        f'Lemma: {synset.lemma_names()}',
        sep='\n',
    )


Synset name: present.n.01
POS: noun.time
Definition: the period of time that is happening now; any continuous stretch of time including the moment of speech
Lemma: ['present', 'nowadays']

Synset name: present.n.02
POS: noun.possession
Definition: something presented as a gift
Lemma: ['present']

Synset name: present.n.03
POS: noun.communication
Definition: a verb tense that expresses actions or states at the time of speaking
Lemma: ['present', 'present_tense']

Synset name: show.v.01
POS: verb.perception
Definition: give an exhibition of to an interested audience
Lemma: ['show', 'demo', 'exhibit', 'present', 'demonstrate']

Synset name: present.v.02
POS: verb.communication
Definition: bring forward and present to the mind
Lemma: ['present', 'represent', 'lay_out']

Synset name: stage.v.01
POS: verb.creation
Definition: perform (a play), especially on a stage
Lemma: ['stage', 'present', 'represent']

Synset name: present.v.04
POS: verb.possession
Definition: hand over formally
Lemma: 

In [16]:
# Look up one specific sense for each of the five concepts.
tree, lion, tiger, cat, dog = (
    wn.synset(sense_name)
    for sense_name in ('tree.n.01', 'lion.n.01', 'tiger.n.02', 'cat.n.01', 'dog.n.01')
)

entities = [tree, lion, tiger, cat, dog]
# 'tree.n.01' -> 'tree': the word before the first dot labels rows and columns.
entity_names = [entity.name().split('.')[0] for entity in entities]

# Pairwise path similarity, rounded to two decimals (row entity vs. column entity).
similarities = [
    [round(row_entity.path_similarity(col_entity), 2) for col_entity in entities]
    for row_entity in entities
]

similarity_df = pd.DataFrame(similarities, index=entity_names, columns=entity_names)
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [17]:
# Fetch the 'sentiwordnet' resource needed by the SentiWordNet cells below.
# quiet=True keeps the downloader's progress messages out of the cell output.
nltk.download('sentiwordnet', quiet=True)

True

In [18]:
from nltk.corpus import sentiwordnet as swn

# senti_synsets() is wrapped in list() so the senses can be counted and printed.
senti_synsets = list(swn.senti_synsets('slow'))
print(type(senti_synsets), len(senti_synsets), senti_synsets, sep='\n')

<class 'list'>
11
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [19]:
# Compare the positive / negative / objective scores of one noun sense and
# one adjective sense.
father = swn.senti_synset('father.n.01')
fabulous = swn.senti_synset('fabulous.a.01')

for heading, senti in (('father', father), ('\nfabulous', fabulous)):
    print(heading)
    print('pos_score:', senti.pos_score())
    print('neg_score:', senti.neg_score())
    print('obj_score:', senti.obj_score())

father
pos_score: 0.0
neg_score: 0.0
obj_score: 1.0

fabulous
pos_score: 0.875
neg_score: 0.125
obj_score: 0.0


In [20]:
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, N, R or V map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively; any other tag yields ``None``.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolve the constant only for the matching prefix, as before.
            return getattr(wn, attr)
    return None

In [21]:
# swn_polarity splits text with sent_tokenize/word_tokenize before POS-tagging,
# so it needs the Punkt tokenizer models as well as the tagger. Punkt was never
# downloaded in this notebook, which fails with LookupError on a fresh install.
# NOTE(review): newer NLTK releases name these resources 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [22]:
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

def _first_sense_polarity(word, wn_tag, lemmatizer):
    """Return pos_score - neg_score for the word's first WordNet sense, or None if it cannot be scored."""
    # Only nouns, adjectives and adverbs are scored; verbs and untagged words are skipped.
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return None

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return None

    senses = wn.synsets(lemma, pos=wn_tag)
    if not senses:
        return None

    # The first listed sense stands in for the word.
    senti = swn.senti_synset(senses[0].name())
    return senti.pos_score() - senti.neg_score()


def swn_polarity(text):
    """Label a text 1 (positive) or 0 (negative) using SentiWordNet scores.

    The text is split into sentences, tokenized and POS-tagged. Each noun,
    adjective or adverb that has a WordNet synset contributes the
    (positive - negative) score of its first sense to a running total.

    Returns 1 when at least one token was scored and the total is >= 0,
    otherwise 0 (including when no token could be scored).
    """
    lemmatizer = WordNetLemmatizer()
    total_score = 0.0
    scored_tokens = 0

    for sentence in sent_tokenize(text):
        for word, tag in pos_tag(word_tokenize(sentence)):
            score = _first_sense_polarity(word, penn_to_wn(tag), lemmatizer)
            if score is None:
                continue
            total_score += score
            scored_tokens += 1

    if scored_tokens == 0:
        return 0
    return 1 if total_score >= 0 else 0

In [23]:
# Label every review with the SentiWordNet rule, then pull the true labels and
# the predictions out as arrays for the metric functions.
review_df['preds'] = review_df['review'].apply(swn_polarity)
preds = review_df['preds'].values
y_target = review_df['sentiment'].values

In [24]:
import numpy as np

# Confusion matrix first, then the scalar metrics rounded to four decimals.
print('confusion_matrix\n', confusion_matrix(y_target, preds))
for label, metric in (
    ('accuracy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
):
    print(label, np.round(metric(y_target, preds), 4))

confusion_matrix
 [[7668 4832]
 [3636 8864]]
accuracy: 0.6613
precision: 0.6472
recall: 0.7091
