In [1]:
# GENERIC
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import re
import tqdm

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# INITIALIZATION
directory = os.path.realpath(os.path.join(os.getcwd(),".."))
os.chdir(directory)
print(directory)

/Users/konstantin/Documents/Projects/McGill/McGill-INSY-669-GroupProject


In [2]:
amzn = pd.read_csv(os.path.join(directory, 'data', 'Amazon', 'Amazon_comments_processed.csv'))
amzn_drt = pd.read_csv(os.path.join(directory, 'data', 'Amazon', 'Amazon_comments.csv'))

### For sentiment analysis - the data has to be processed somewhat different

In [3]:
def clean_text(text):  
    text = str(text) 

    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'$', '', text)
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+','',text)
    text = re.sub(r'https//[A-Za-z0-9./]+','',text)
    text = text.replace("\n","")
    text = re.sub('@[\w]+', '', text)

    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text) 

    text = text.lower()
    text = text.strip()

    return(text)


pros_list = amzn_drt.Pros.tolist()
cons_list = amzn_drt.Cons.tolist()

pros_clean = [clean_text(text) for text in tqdm.tqdm(pros_list)]
cons_clean = [clean_text(text) for text in tqdm.tqdm(cons_list)]

100%|██████████| 10000/10000 [00:00<00:00, 147446.90it/s]
100%|██████████| 10000/10000 [00:00<00:00, 144327.09it/s]


### Sentiment Analysis
estimate compound score

In [14]:
analyzer = SentimentIntensityAnalyzer()

pros_sent = []
for text in pros_clean:
    vs = analyzer.polarity_scores(text)
    pros_sent.append(vs)
pros_sent = pd.DataFrame(pros_sent)
pros_sent = pros_sent.rename(columns={'compound': 'pros_comp'})

cons_set = []
for text in cons_clean:
    vs = analyzer.polarity_scores(text)
    cons_set.append(vs)
cons_sent = pd.DataFrame(cons_set)
cons_sent = cons_sent.rename(columns={'compound': 'cons_comp'})

sent_df = pd.concat([pros_sent, cons_sent], axis=1)
sent_df['avg_comp'] = (sent_df['pros_comp'] + sent_df['cons_comp'])/2

### Sentiment Analysis
assign sentiment value

In [19]:
sent_df[['pros_sent','cons_sent', 'avg_sent']] = 'neutral'

sent_df['pros_sent'] = np.where(sent_df.pros_comp > 0.1, 'positive', sent_df.pros_sent)
sent_df['pros_sent'] = np.where(sent_df.pros_comp < 0.1, 'negative', sent_df.pros_sent)

sent_df['cons_sent'] = np.where(sent_df.cons_comp > 0.1, 'positive', sent_df.cons_sent)
sent_df['cons_sent'] = np.where(sent_df.cons_comp < 0.1, 'negative', sent_df.cons_sent)

sent_df['avg_sent'] = np.where(sent_df.avg_comp > 0.1, 'positive', sent_df.avg_sent)
sent_df['avg_sent'] = np.where(sent_df.avg_comp < 0.1, 'negative', sent_df.avg_sent)

sent_df[['pros_sent','cons_sent', 'avg_sent', 'pros_comp', 'cons_comp', 'avg_comp']]

Unnamed: 0,pros_sent,cons_sent,avg_sent,pros_comp,cons_comp,avg_comp
0,positive,positive,positive,0.5994,0.2541,0.42675
1,positive,negative,negative,0.4215,-0.4404,-0.00945
2,positive,negative,positive,0.8481,0.0000,0.42405
3,positive,positive,positive,0.8750,0.7003,0.78765
4,positive,positive,positive,0.3818,0.7650,0.57340
...,...,...,...,...,...,...
9995,positive,positive,positive,0.8999,0.4656,0.68275
9996,positive,negative,positive,0.8591,0.0000,0.42955
9997,positive,negative,positive,0.8481,0.0000,0.42405
9998,positive,negative,positive,0.8625,-0.6332,0.11465


In [20]:
amzn = pd.concat([amzn, sent_df], axis=1)
amzn.to_csv(f'data/Amazon/Amazon_comments_sentiment.csv', index=False)

### Visualize Sentiments