In [2]:
!pip install afinn



In [3]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import re
from textblob import TextBlob
from afinn import Afinn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshuatam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joshuatam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joshuatam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joshuatam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joshuatam/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer_apple = WordNetLemmatizer()

In [10]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

excluded = ['no', 'not']

In [11]:
pd.set_option("max_colwidth", 1000)

In [13]:
def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)

def remove_hashtags(sample):
    return re.sub(r"#\S+", "", sample)

def remove_atMarks(sample):
    return re.sub(r"@\S+", "", sample).strip()

In [3]:
#Load datasets
import os

#Apple datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Apple/Combined')
df_apple = pd.read_csv('apple_combined.csv')

#Pwr+ datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Pwr+/Combined')
df_pwr = pd.read_csv('Pwr+_combined.csv')

#Sony datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Sony/Combined')
df_sony = pd.read_csv('Sony_combined.csv')

#Garmin datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Garmin/Combined')
df_garmin = pd.read_csv('Garmin_combined.csv')

#YamahaAudio datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/YamahaAudio/Combined')
df_yamaha = pd.read_csv('YamahaAudio_combined.csv')

#Belkin datasets
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Belkin/Combined')
df_belkin = pd.read_csv('Belkin_combined.csv')

# Sentiment Analysis for Apple Data

In [4]:
df_apple['Text'] = df_apple['Text'].fillna("missing")

df_apple['TextClean'] = df_apple['Text']

count = 0
for i in df_apple['TextClean']:
    df_apple.at[count,"TextClean"] = remove_URL(i)
    df_apple.at[count,"TextClean"] = remove_hashtags(df_apple.at[count,"TextClean"])
    df_apple.at[count,"TextClean"] = remove_atMarks(df_apple.at[count,"TextClean"])
    count += 1

In [5]:
# Remove number and special characters
df_apple['TextClean'] = df_apple['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_apple['TextClean'] = df_apple['TextClean'].str.lower()

# remove white spaces
df_apple['TextClean'] = df_apple['TextClean'].str.strip()

# fill empty review with na
# df_apple['TextClean'] = df_apple['TextClean'].fillna('', inplace= True)

In [8]:
# remove stopwords
# df_merge['reviewClean'] = df_merge['reviewClean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df_apple['TextClean'] = df_apple['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [11]:
df_apple['TextCleanLemm'] = df_apple['TextClean'].apply(lambda x: [lemmatizer_apple.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

In [12]:
## Sentiment scores

# Textblob
df_apple['tb_score'] = df_apple['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_apple['afinn_score'] = df_apple['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_apple['vader_com'] = df_apple['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_apple['vader_pos'] = df_apple['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_apple['vader_neg'] = df_apple['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_apple['vader_neu'] = df_apple['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

In [13]:
#Export apple data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_apple.to_csv("Apple_sentiment_analysis.csv", index=False, encoding='utf-8-sig')

# Sentiment Analysis for Pwr+ Data

In [14]:
df_pwr['Text'] = df_pwr['Text'].fillna("missing")

df_pwr['TextClean'] = df_pwr['Text']

count = 0
for i in df_pwr['TextClean']:
    df_pwr.at[count,"TextClean"] = remove_URL(i)
    df_pwr.at[count,"TextClean"] = remove_hashtags(df_pwr.at[count,"TextClean"])
    df_pwr.at[count,"TextClean"] = remove_atMarks(df_pwr.at[count,"TextClean"])
    count += 1

In [15]:
# Remove number and special characters
df_pwr['TextClean'] = df_pwr['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_pwr['TextClean'] = df_pwr['TextClean'].str.lower()

# remove white spaces
df_pwr['TextClean'] = df_pwr['TextClean'].str.strip()

In [16]:
# remove stopwords
df_pwr['TextClean'] = df_pwr['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [17]:
lemmatizer_pwr = WordNetLemmatizer()

df_pwr['TextCleanLemm'] = df_pwr['TextClean'].apply(lambda x: [lemmatizer_pwr.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

In [18]:
## Sentiment scores

# Textblob
df_pwr['tb_score'] = df_pwr['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_pwr['afinn_score'] = df_pwr['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_pwr['vader_com'] = df_pwr['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_pwr['vader_pos'] = df_pwr['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_pwr['vader_neg'] = df_pwr['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_pwr['vader_neu'] = df_pwr['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])


In [19]:
#Export pwr+ data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_pwr.to_csv("Pwr+_sentiment_analysis.csv", index=False, encoding='utf-8-sig')


# Sentiment Analysis for Sony Data

In [20]:
df_sony['Text'] = df_sony['Text'].fillna("missing")

df_sony['TextClean'] = df_sony['Text']

count = 0
for i in df_sony['TextClean']:
    df_sony.at[count,"TextClean"] = remove_URL(i)
    df_sony.at[count,"TextClean"] = remove_hashtags(df_sony.at[count,"TextClean"])
    df_sony.at[count,"TextClean"] = remove_atMarks(df_sony.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_sony['TextClean'] = df_sony['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_sony['TextClean'] = df_sony['TextClean'].str.lower()

# remove white spaces
df_sony['TextClean'] = df_sony['TextClean'].str.strip()

In [21]:
# remove stopwords
df_sony['TextClean'] = df_sony['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [22]:
# Lemmatization
lemmatizer_sony = WordNetLemmatizer()
df_sony['TextCleanLemm'] = df_sony['TextClean'].apply(lambda x: [lemmatizer_sony.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

In [23]:
## Sentiment scores

# Textblob
df_sony['tb_score'] = df_sony['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_sony['afinn_score'] = df_sony['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_sony['vader_com'] = df_sony['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_sony['vader_pos'] = df_sony['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_sony['vader_neg'] = df_sony['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_sony['vader_neu'] = df_sony['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])


In [24]:
#Export sony data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_sony.to_csv("Sony_sentiment_analysis.csv", index=False, encoding='utf-8-sig')

# Sentiment Analysis for Garmin Data

In [25]:
df_garmin['Text'] = df_garmin['Text'].fillna("missing")

df_garmin['TextClean'] = df_garmin['Text']

count = 0
for i in df_garmin['TextClean']:
    df_garmin.at[count,"TextClean"] = remove_URL(i)
    df_garmin.at[count,"TextClean"] = remove_hashtags(df_garmin.at[count,"TextClean"])
    df_garmin.at[count,"TextClean"] = remove_atMarks(df_garmin.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_garmin['TextClean'] = df_garmin['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_garmin['TextClean'] = df_garmin['TextClean'].str.lower()

# remove white spaces
df_garmin['TextClean'] = df_garmin['TextClean'].str.strip()

In [26]:
# remove stopwords
df_garmin['TextClean'] = df_garmin['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [27]:
# Lemmatization
lemmatizer_garmin = WordNetLemmatizer()
df_garmin['TextCleanLemm'] = df_garmin['TextClean'].apply(lambda x: [lemmatizer_garmin.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

In [28]:
## Sentiment scores

# Textblob
df_garmin['tb_score'] = df_garmin['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_garmin['afinn_score'] = df_garmin['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_garmin['vader_com'] = df_garmin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_garmin['vader_pos'] = df_garmin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_garmin['vader_neg'] = df_garmin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_garmin['vader_neu'] = df_garmin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

In [29]:
#Export garmin data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_garmin.to_csv("Garmin_sentiment_analysis.csv", index=False, encoding='utf-8-sig')

# Sentiment analysis for YamahaAudio

In [30]:
df_yamaha['Text'] = df_yamaha['Text'].fillna("missing")

df_yamaha['TextClean'] = df_yamaha['Text']

count = 0
for i in df_yamaha['TextClean']:
    df_yamaha.at[count,"TextClean"] = remove_URL(i)
    df_yamaha.at[count,"TextClean"] = remove_hashtags(df_yamaha.at[count,"TextClean"])
    df_yamaha.at[count,"TextClean"] = remove_atMarks(df_yamaha.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_yamaha['TextClean'] = df_yamaha['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_yamaha['TextClean'] = df_yamaha['TextClean'].str.lower()

# remove white spaces
df_yamaha['TextClean'] = df_yamaha['TextClean'].str.strip()

In [31]:
# remove stopwords
df_yamaha['TextClean'] = df_yamaha['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [32]:
# Lemmatization
lemmatizer_yamaha = WordNetLemmatizer()
df_yamaha['TextCleanLemm'] = df_yamaha['TextClean'].apply(lambda x: [lemmatizer_yamaha.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

In [33]:
## Sentiment scores

# Textblob
df_yamaha['tb_score'] = df_yamaha['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_yamaha['afinn_score'] = df_yamaha['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_yamaha['vader_com'] = df_yamaha['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_yamaha['vader_pos'] = df_yamaha['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_yamaha['vader_neg'] = df_yamaha['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_yamaha['vader_neu'] = df_yamaha['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

In [34]:
#Export garmin data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_yamaha.to_csv("YamahaAudio_sentiment_analysis.csv", index=False, encoding='utf-8-sig')

# Sentiment analysis for Belkin

In [35]:
df_belkin['Text'] = df_belkin['Text'].fillna("missing")

df_belkin['TextClean'] = df_belkin['Text']

count = 0
for i in df_belkin['TextClean']:
    df_belkin.at[count,"TextClean"] = remove_URL(i)
    df_belkin.at[count,"TextClean"] = remove_hashtags(df_belkin.at[count,"TextClean"])
    df_belkin.at[count,"TextClean"] = remove_atMarks(df_belkin.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_belkin['TextClean'] = df_belkin['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_belkin['TextClean'] = df_belkin['TextClean'].str.lower()

# remove white spaces
df_belkin['TextClean'] = df_belkin['TextClean'].str.strip()


In [36]:
# remove stopwords
df_belkin['TextClean'] = df_belkin['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

In [37]:
# Lemmatization
lemmatizer_belkin = WordNetLemmatizer()
df_belkin['TextCleanLemm'] = df_belkin['TextClean'].apply(lambda x: [lemmatizer_belkin.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))


In [38]:
## Sentiment scores

# Textblob
df_belkin['tb_score'] = df_belkin['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_belkin['afinn_score'] = df_belkin['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_belkin['vader_com'] = df_belkin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_belkin['vader_pos'] = df_belkin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_belkin['vader_neg'] = df_belkin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_belkin['vader_neu'] = df_belkin['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

In [39]:
#Export garmin data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_belkin.to_csv("Belkin_sentiment_analysis.csv", index=False, encoding='utf-8-sig')

# Sentiment analysis for BossAudio

In [12]:
# Load data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/BossAudio/Combined')
df_boss = pd.read_csv('BossAudio_combined.csv')

In [14]:
df_boss['Text'] = df_boss['Text'].fillna("missing")

df_boss['TextClean'] = df_boss['Text']

count = 0
for i in df_boss['TextClean']:
    df_boss.at[count,"TextClean"] = remove_URL(i)
    df_boss.at[count,"TextClean"] = remove_hashtags(df_boss.at[count,"TextClean"])
    df_boss.at[count,"TextClean"] = remove_atMarks(df_boss.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_boss['TextClean'] = df_boss['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_boss['TextClean'] = df_boss['TextClean'].str.lower()

# remove white spaces
df_boss['TextClean'] = df_boss['TextClean'].str.strip()

# remove stopwords
df_boss['TextClean'] = df_boss['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

# Lemmatization
lemmatizer_boss = WordNetLemmatizer()
df_boss['TextCleanLemm'] = df_boss['TextClean'].apply(lambda x: [lemmatizer_boss.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

## Sentiment scores

# Textblob
df_boss['tb_score'] = df_boss['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_boss['afinn_score'] = df_boss['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_boss['vader_com'] = df_boss['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_boss['vader_pos'] = df_boss['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_boss['vader_neg'] = df_boss['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_boss['vader_neu'] = df_boss['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

#Export data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_boss.to_csv("BossAudio_sentiment_analysis.csv", index=False, encoding='utf-8-sig')


# Sentiment analysis for TrippLite

In [15]:
# Load data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/TrippLite/Combined')
df_tl = pd.read_csv('TrippLite_combined.csv')

In [16]:
df_tl['Text'] = df_tl['Text'].fillna("missing")

df_tl['TextClean'] = df_tl['Text']

count = 0
for i in df_tl['TextClean']:
    df_tl.at[count,"TextClean"] = remove_URL(i)
    df_tl.at[count,"TextClean"] = remove_hashtags(df_tl.at[count,"TextClean"])
    df_tl.at[count,"TextClean"] = remove_atMarks(df_tl.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_tl['TextClean'] = df_tl['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_tl['TextClean'] = df_tl['TextClean'].str.lower()

# remove white spaces
df_tl['TextClean'] = df_tl['TextClean'].str.strip()

# remove stopwords
df_tl['TextClean'] = df_tl['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

# Lemmatization
lemmatizer_tl = WordNetLemmatizer()
df_tl['TextCleanLemm'] = df_tl['TextClean'].apply(lambda x: [lemmatizer_tl.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

## Sentiment scores

# Textblob
df_tl['tb_score'] = df_tl['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_tl['afinn_score'] = df_tl['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_tl['vader_com'] = df_tl['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_tl['vader_pos'] = df_tl['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_tl['vader_neg'] = df_tl['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_tl['vader_neu'] = df_tl['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

#Export data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_tl.to_csv("TrippLite_sentiment_analysis.csv", index=False, encoding='utf-8-sig')


# Sentiment analysis for PolkAudio

In [17]:
# Load data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/PolkAudio/Combined')
df_polk = pd.read_csv('PolkAudio_combined.csv')

In [18]:
df_polk['Text'] = df_polk['Text'].fillna("missing")

df_polk['TextClean'] = df_polk['Text']

count = 0
for i in df_polk['TextClean']:
    df_polk.at[count,"TextClean"] = remove_URL(i)
    df_polk.at[count,"TextClean"] = remove_hashtags(df_polk.at[count,"TextClean"])
    df_polk.at[count,"TextClean"] = remove_atMarks(df_polk.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_polk['TextClean'] = df_polk['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_polk['TextClean'] = df_polk['TextClean'].str.lower()

# remove white spaces
df_polk['TextClean'] = df_polk['TextClean'].str.strip()

# remove stopwords
df_polk['TextClean'] = df_polk['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

# Lemmatization
lemmatizer_polk = WordNetLemmatizer()
df_polk['TextCleanLemm'] = df_polk['TextClean'].apply(lambda x: [lemmatizer_polk.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

## Sentiment scores

# Textblob
df_polk['tb_score'] = df_polk['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_polk['afinn_score'] = df_polk['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_polk['vader_com'] = df_polk['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_polk['vader_pos'] = df_polk['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_polk['vader_neg'] = df_polk['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_polk['vader_neu'] = df_polk['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

#Export data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_polk.to_csv("PolkAudio_sentiment_analysis.csv", index=False, encoding='utf-8-sig')


# Sentiment analysis for Sangean

In [19]:
# Load data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/Sangean/Combined')
df_san = pd.read_csv('Sangean_combined.csv')

In [20]:
df_san['Text'] = df_san['Text'].fillna("missing")

df_san['TextClean'] = df_san['Text']

count = 0
for i in df_san['TextClean']:
    df_san.at[count,"TextClean"] = remove_URL(i)
    df_san.at[count,"TextClean"] = remove_hashtags(df_san.at[count,"TextClean"])
    df_san.at[count,"TextClean"] = remove_atMarks(df_san.at[count,"TextClean"])
    count += 1

# Remove number and special characters
df_san['TextClean'] = df_san['TextClean'].replace("[^a-zA-Z]+", " ", regex = True)

# lower case
df_san['TextClean'] = df_san['TextClean'].str.lower()

# remove white spaces
df_san['TextClean'] = df_san['TextClean'].str.strip()

# remove stopwords
df_san['TextClean'] = df_san['TextClean'].apply(lambda x: " ".join(x for x in x.split() if (x not in stop) or (x in excluded)))

# Lemmatization
lemmatizer_san = WordNetLemmatizer()
df_san['TextCleanLemm'] = df_san['TextClean'].apply(lambda x: [lemmatizer_san.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(x)]).apply(lambda x:" ".join(x))

## Sentiment scores

# Textblob
df_san['tb_score'] = df_san['TextCleanLemm'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Affin score
af = Afinn()
df_san['afinn_score'] = df_san['TextCleanLemm'].apply(lambda x: af.score(x))

#vader analysis
sid = SentimentIntensityAnalyzer()
df_san['vader_com'] = df_san['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_san['vader_pos'] = df_san['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['pos'])
df_san['vader_neg'] = df_san['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neg']*(-1))
df_san['vader_neu'] = df_san['TextCleanLemm'].apply(lambda x: sid.polarity_scores(x)['neu'])

#Export data
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
df_san.to_csv("Sangean_sentiment_analysis.csv", index=False, encoding='utf-8-sig')


# Combine all brands twitter data

In [21]:
import glob

os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
all_filenames = [i for i in glob.glob('*')]
print(all_filenames)

['TrippLite_sentiment_analysis.csv', 'YamahaAudio_sentiment_analysis.csv', 'Sangean_sentiment_analysis.csv', 'Apple_sentiment_analysis.csv', 'PolkAudio_sentiment_analysis.csv', 'Belkin_sentiment_analysis.csv', 'Pwr+_sentiment_analysis.csv', 'Garmin_sentiment_analysis.csv', 'Sony_sentiment_analysis.csv', 'BossAudio_sentiment_analysis.csv']


In [None]:
combined_twitter_df = pd.DataFrame(list())

In [39]:
combined_twitter_df = pd.DataFrame(list())

for file in all_filenames:
    brandname = file.split("_")[0]
    df = pd.read_csv(file, index_col=False, engine='python')
    df['Brand'] = brandname
    combined_twitter_df = pd.concat([combined_twitter_df,df])
    print(combined_twitter_df.shape)

(31417, 15)
(103107, 15)
(135389, 15)
(171926, 15)
(204046, 15)
(236915, 15)
(269796, 16)
(302669, 16)
(325590, 16)
(327982, 16)


In [45]:
# Drop unnamed column
combined_twitter_df = combined_twitter_df.drop('Unnamed: 0', axis = 1)

# Drop null values
combined_twitter_df = combined_twitter_df.dropna(subset = ['Datetime', 'Tweet Id', 'Retweets', 'TextClean'])

In [47]:
os.chdir('/Users/joshuatam/Documents/NUS/AY_2020_21_SEM_2/BT4222/Group Project/Datasets/sentimentAnalysis')
combined_twitter_df.to_csv("Combined_sentiment_analysis.csv", index=False, encoding='utf-8-sig')