## Sentiment analysis

We will conduct sentiment analysis on the text data gathered from Twitter and Reddit comments. We will use VADER to calculate polarity scores indicating the sentiment of the input text. VADER is a parsimonious rule-based model for sentiment analysis of social media text, which makes it well suited to our purpose. In this notebook we will read the data from the CSV files, clean the text, calculate the polarity scores, add them to the data as new columns, and write the result back to the same CSV files.

In [13]:
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import re

# Monthly CSV exports of the Twitter data, March 2018 through May 2021.
# NOTE: the file names are reproduced exactly as given (including the
# 'febrauri' spelling) because they are used as paths by pd.read_csv.
tweets_names_list = [
    # 2018
    'T_ada_march_18.csv', 'T_ada_april_18.csv', 'T_ada_may_18.csv',
    'T_ada_june_18.csv', 'T_ada_july_18.csv', 'T_ada_august_18.csv',
    'T_ada_september_18.csv', 'T_ada_october_18.csv',
    'T_ada_november_18.csv', 'T_ada_december_18.csv',
    # 2019
    'T_ada_january_19.csv', 'T_ada_february_19.csv', 'T_ada_march_19.csv',
    'T_ada_april_19.csv', 'T_ada_may_19.csv', 'T_ada_june_19.csv',
    'T_ada_july_19.csv', 'T_ada_august_19.csv', 'T_ada_september_19.csv',
    'T_ada_october_19.csv', 'T_ada_november_19.csv', 'T_ada_december_19.csv',
    # 2020
    'T_ada_january_20.csv', 'T_ada_febrauri_20.csv', 'T_ada_march_20.csv',
    'T_ada_april_20.csv', 'T_ada_may_20.csv', 'T_ada_june_20.csv',
    'T_ada_july_20.csv', 'T_ada_august_20.csv', 'T_ada_september_20.csv',
    'T_ada_october_20.csv', 'T_ada_november_20.csv', 'T_ada_december_20.csv',
    # 2021
    'T_ada_january_21.csv', 'T_ada_february_21.csv', 'T_ada_march_21.csv',
    'T_ada_april_21.csv', 'T_ada_may_21.csv',
]

# Monthly CSV exports of the Reddit data, covering the same months.
reddit_names_list = [
    # 2018
    'R_ada_march_18.csv', 'R_ada_april_18.csv', 'R_ada_may_18.csv',
    'R_ada_june_18.csv', 'R_ada_july_18.csv', 'R_ada_august_18.csv',
    'R_ada_september_18.csv', 'R_ada_october_18.csv',
    'R_ada_november_18.csv', 'R_ada_december_18.csv',
    # 2019
    'R_ada_january_19.csv', 'R_ada_febrauri_19.csv', 'R_ada_march_19.csv',
    'R_ada_april_19.csv', 'R_ada_may_19.csv', 'R_ada_june_19.csv',
    'R_ada_july_19.csv', 'R_ada_august_19.csv', 'R_ada_september_19.csv',
    'R_ada_october_19.csv', 'R_ada_november_19.csv', 'R_ada_december_19.csv',
    # 2020
    'R_ada_january_20.csv', 'R_ada_febrauri_20.csv', 'R_ada_march_20.csv',
    'R_ada_april_20.csv', 'R_ada_may_20.csv', 'R_ada_june_20.csv',
    'R_ada_july_20.csv', 'R_ada_august_20.csv', 'R_ada_september_20.csv',
    'R_ada_october_20.csv', 'R_ada_november_20.csv', 'R_ada_december_20.csv',
    # 2021
    'R_ada_january_21.csv', 'R_ada_february_21.csv', 'R_ada_march_21.csv',
    'R_ada_april_21.csv', 'R_ada_may_21.csv',
]

In [18]:
# Cleanup rules applied, in order, to every text before it is scored.
# Compiled once here because they are reused for every row of every file.
_CLEANUP_RULES = (
    (re.compile(r'@[A-Za-z0-9_]+'), ''),    # @mentions (handles may contain '_')
    (re.compile(r'#'), ''),                 # hash sign only, the tag word is kept
    (re.compile(r'\bRT\s+'), ''),           # retweet marker; \b keeps words like "SMART" intact
    (re.compile(r'&amp;'), '&'),            # HTML-escaped ampersand
    (re.compile(r'https?:\/\/\S+'), ''),    # hyperlinks
)

# Output column name -> key in the dict returned by analyzer.polarity_scores().
_SCORE_COLUMNS = (
    ('negative', 'neg'),
    ('neutral', 'neu'),
    ('positive', 'pos'),
    ('compound', 'compound'),
)


def clean_text(sentence):
    """Return ``sentence`` with social-media noise removed, ready for scoring.

    Values that are not strings (for example the NaN that ``pd.read_csv``
    produces for an empty cell) and texts shorter than 10 characters are
    replaced by the empty string. Otherwise @mentions, ``#`` signs, the
    retweet marker ``RT``, and hyperlinks are removed and ``&amp;`` is
    turned back into ``&``.
    """
    if not isinstance(sentence, str) or len(sentence) < 10:
        return ''
    for pattern, replacement in _CLEANUP_RULES:
        sentence = pattern.sub(replacement, sentence)
    return sentence


def add_sentiment_columns(df, column, analyzer):
    """Add 'negative', 'neutral', 'positive' and 'compound' columns to ``df``.

    Each text in ``df[column]`` is cleaned with ``clean_text`` and scored with
    ``analyzer.polarity_scores``. ``df`` is modified in place and also
    returned for convenience.
    """
    scores = [analyzer.polarity_scores(clean_text(raw)) for raw in df[column]]
    for target, key in _SCORE_COLUMNS:
        # Plain lists are assigned positionally, so this works for any index
        # and also for an empty file.
        df[target] = [score[key] for score in scores]
    return df


# __name__ is "__main__" inside a notebook kernel and when run as a script, so
# the batch still runs there; the guard only keeps an import of the helpers
# from touching the CSV files.
if __name__ == "__main__":
    analyzer = SentimentIntensityAnalyzer()

    # Reddit comments keep their text in 'body', tweets in 'content'.
    for names, column in ((reddit_names_list, 'body'), (tweets_names_list, 'content')):
        for doc in names:
            df = pd.read_csv(doc)
            add_sentiment_columns(df, column, analyzer)
            # NOTE: overwrites the input file. to_csv also writes the index as
            # an extra unnamed column by default; left unchanged so that the
            # file layout downstream readers see stays the same.
            df.to_csv(doc)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}