## Sentiment Analysis in Python with Vader

Sentiment analysis is the interpretation and classification of emotions (positive, negative and neutral) within text data using text analysis techniques. Essentially, it tries to judge how much emotion the written words carry and to determine what type of emotion it is. In this post we'll go into how to do this with Python, specifically using the VADER package <https://github.com/cjhutto/vaderSentiment>.

In [1]:
import pandas as pd
import nltk
import typing
import matplotlib.pyplot as plt
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jackm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Read in data here
# https://nlp.stanford.edu/sentiment/code.html
# header=None tells pandas not to treat the first line as column names,
# so the single text column is labelled with the integer 0.
text_data = pd.read_table('original_rt_snippets.txt',header=None)
# Quick sanity check: (rows, columns) shape and a random sample of 5 rows.
display(text_data.shape)
display(text_data.sample(5))

(10605, 1)

Unnamed: 0,0
5911,...its stupidities wind up sticking in one's m...
9945,"Been there, done that, liked it much better th..."
7685,'Dragonfly' is a movie about a bus wreck that ...
7081,Scorsese doesn't give us a character worth giv...
744,...an inviting piece of film.


In [None]:
# Import english stop words

from nltk.corpus import stopwords
# English stop word list read from the NLTK 'stopwords' corpus
# (the corpus is fetched by nltk.download('stopwords') in the first cell).
stopcorpus: typing.List = stopwords.words('english')
print(stopcorpus)

In [None]:
# Convert to lowercase, and remove stop words

def remove_links(text):
    """Return ``text`` with every link-like token deleted.

    A link is the literal ``http`` followed by at least one non-whitespace
    character. Each match is replaced by the empty string, so whitespace
    around the link is left untouched.
    """
    import re
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)

# Strip URLs from every row before any other cleaning.
# NOTE(review): the data cell reads the file with header=None, so the frame
# appears to have a single column labelled 0; 'translated_full_text' is not
# created in any visible cell. Confirm where this column comes from,
# otherwise this line raises KeyError.
text_data['translated_full_text'] = text_data['translated_full_text'].astype(str).apply(remove_links)

def style_text(text:str):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_words(text_data: typing.Iterable[str], list_of_words_to_remove: typing.Iterable[str]) -> typing.List[str]:
    """Return the items of ``text_data`` that are not in ``list_of_words_to_remove``.

    Args:
        text_data: Tokens to filter (callers pass the result of ``str.split()``).
        list_of_words_to_remove: Words to drop; compared by exact equality.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Build the lookup set once: membership tests against a list are linear,
    # and this runs for every token of every row.
    excluded = frozenset(list_of_words_to_remove)
    return [item for item in text_data if item not in excluded]

# Lowercase first so tokens can match the lowercase entries of the stop word list.
text_data['cleaned_text'] = text_data['translated_full_text'].astype(str).apply(style_text)

# Remove stop words from the lowercased text. This must read 'cleaned_text':
# reading 'translated_full_text' again would discard the lowercasing above
# and leave capitalised stop words ("The", "And", ...) in place.
text_data['cleaned_text'] = text_data['cleaned_text'].astype(str).apply(lambda x: remove_words(x.split(),stopcorpus))

def collapse_list_to_string(string_list):
    """Join the strings in ``string_list`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(string_list)

# Join each row's token list back into a single space-separated string.
text_data['cleaned_text'] = text_data['cleaned_text'].apply(collapse_list_to_string)

# Preview the first 5 cleaned rows.
display(text_data['cleaned_text'].head(5))

In [None]:
# Lemmatize cleaned text (reduce each word to its WordNet lemma)

# Tokenizer that splits on whitespace only.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# WordNet-based lemmatizer; uses the 'wordnet' corpus downloaded in the first cell.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return the whitespace-separated tokens of ``text``, each lemmatized.

    Uses the module-level ``w_tokenizer`` and ``lemmatizer``. The lemmatizer
    is called without a part-of-speech tag, so its default applies. The
    result has one entry per token, in the original order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize every cleaned row; each cell becomes a list of lemmas.
text_data['clean_lemmatized'] = text_data['cleaned_text'].astype(str).apply(lemmatize_text)

# Join each lemma list back into a single space-separated string.
text_data['clean_lemmatized'] = text_data['clean_lemmatized'].apply(collapse_list_to_string)

# Preview the first 5 lemmatized rows.
display(text_data['clean_lemmatized'].head(5))

In [None]:
# Keep only the three text columns used downstream and drop exact duplicate rows.
# NOTE(review): 'full_text' is not created in any visible cell (the file is
# read with header=None); confirm it exists, otherwise this raises KeyError.
output_df = pd.DataFrame(text_data[['full_text','clean_lemmatized','translated_full_text']].drop_duplicates())
display(output_df.head(5))

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single analyzer instance, read as a global by get_sentiment_scores below.
sid_analyzer = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(text:str, analyser,desired_type:str='pos'):
    """Return one entry of the analyser's polarity scores for ``text``.

    Args:
        text: The text to score.
        analyser: Object with a ``polarity_scores(text)`` method that returns
            a mapping of score name to value.
        desired_type: Key to read from that mapping; the callers in this
            notebook use 'pos', 'neg', 'neu' and 'compound'.

    Returns:
        The value stored under ``desired_type``.
    """
    return analyser.polarity_scores(text)[desired_type]

In [None]:
# Get Sentiment scores
def get_sentiment_scores(df,data_column):
    """Add four VADER score columns for ``data_column`` to ``df`` and return ``df``.

    The new columns are named ``'<data_column> Positive Sentiment Score'``,
    and likewise for Negative, Neutral and Compound. They are filled from
    the 'pos', 'neg', 'neu' and 'compound' entries of
    ``sid_analyzer.polarity_scores``.

    Args:
        df: DataFrame to annotate; it is modified in place.
        data_column: Name of the column holding the text to score. Values
            are converted to ``str`` before scoring.

    Returns:
        The same ``df`` object, with the four score columns added.
    """
    score_labels = (
        ('pos', 'Positive'),
        ('neg', 'Negative'),
        ('neu', 'Neutral'),
        ('compound', 'Compound'),
    )
    # Score each row once and reuse the resulting dict for all four columns,
    # instead of running the analyzer four times per row.
    scores = df[data_column].astype(str).apply(sid_analyzer.polarity_scores)
    for key, label in score_labels:
        # key=key binds the current key to this lambda (avoids late binding).
        df[f'{data_column} {label} Sentiment Score'] = scores.apply(lambda row_scores, key=key: row_scores[key])
    return df

# Score both the link-stripped text and the cleaned, lemmatized text
# so the two sets of scores can be compared side by side.
output_df = get_sentiment_scores(output_df,'translated_full_text')
output_df = get_sentiment_scores(output_df,'clean_lemmatized')
display(output_df.head(5))

In [None]:
# Write the scored frame to an Excel workbook.
# NOTE(review): location_name is not defined in any visible cell; it must be
# set before this runs, otherwise the f-string raises NameError.
output_df.to_excel(f'{location_name}-Sentiment-Analysis.xlsx')

In [None]:
# Histogram of the frame's columns, each in its own subplot, with grid lines.
output_df.plot.hist(subplots=True,grid=True)
plt.tight_layout()
# NOTE(review): location_name is not defined in any visible cell — confirm it is set.
plt.savefig(f'{location_name}-sentiment-histograms.jpg')

In [None]:
from wordcloud import WordCloud

# Tokens to leave out of the word cloud: "http", "https" and "error",
# plus every stop word.
exclude_words = ["http","https","error"]

# Name fixed here: the original 'exclude_wordWs' was a typo that raised NameError.
exclude_words.extend(stopcorpus)

# Build the word-cloud text in its own column: strip links, then lowercase.
output_df['wordcloud'] = output_df['translated_full_text'].astype(str).apply(remove_links)

output_df['wordcloud'] = output_df['wordcloud'].astype(str).apply(lambda x: x.lower())

def remove_apostrophes(text):
    """Return ``text`` with every single quote (') and double quote (") removed."""
    # The original second line ended in a stray 'E', which was a SyntaxError.
    text = text.replace("'", "")
    text = text.replace('"', "")
    return text

# Strip quote characters from the word-cloud text.
output_df['wordcloud'] = output_df['wordcloud'].astype(str).apply(remove_apostrophes)

# Split on whitespace and drop every token listed in exclude_words.
# NOTE(review): apostrophes are removed before this step, so contractions such
# as "don't" are now "dont" and presumably no longer match stop words that
# contain an apostrophe — confirm whether the order is intended.
output_df['wordcloud'] = output_df['wordcloud'].astype(str).apply(lambda x: remove_words(x.split(),exclude_words))

# Join the surviving tokens back into one string per row.
output_df['wordcloud'] = output_df['wordcloud'].apply(lambda x: ' '.join(x))

display(output_df['wordcloud'])

# Concatenate all rows into one text and build the word cloud from it.
wordcloud = WordCloud().generate(' '.join(output_df['wordcloud'].astype(str)))

# NOTE(review): location_name is not defined in any visible cell — confirm it is set.
wordcloud.to_file(f"{location_name}-wordcloud.png")

# Render the cloud inline with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')

plt.axis("off")