# Leveraging Sentiment Analysis for Enhanced Brand Management

# Main Objectives

1. **Identify Brand Sentiment:**  The primary objective is to  leverage Natural Language Processing to categorize the emotions expressed in tweets directed at the brand.  This will involve classifying tweets as positive, negative, or neutral sentiment.

2. **Understand Customer Emotions:**  By analyzing the emotions expressed in tweets  beyond just positive or negative, we aim to gain a deeper understanding of the specific emotions customers associate with the brand.  This could include happiness, sadness, anger, frustration, or excitement.

3. **Actionable Insights:**  The final objective is to translate the sentiment analysis results into actionable insights for the branding team.  This could involve identifying recurring themes in negative tweets, pinpointing triggers for positive brand emotions, or highlighting areas where brand messaging can be improved to evoke desired customer emotions.

In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer,TweetTokenizer
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC
from yellowbrick.classifier import ClassificationReport
from sklearn.naive_bayes import ComplementNB

In [None]:
# Download the NLTK data packages, one request per package
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Read the raw tweet CSV; the 'unicode_escape' codec is used so the
# special characters in the file can be decoded
csv_path = 'judge-1377884607_tweet_product_company.csv'
data = pd.read_csv(filepath_or_buffer=csv_path, encoding='unicode_escape')
# Show the first five rows
data.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
data.info()

# Data Understanding

The provided dataset from data.world: https://data.world/crowdflower/brands-and-product-emotions offers valuable columns for sentiment analysis in branding:

1. tweet_text: This column contains the actual text of the tweet, allowing for analysis of the sentiment expressed within the content itself.

2. emotion_in_tweet_is_directed_at: This column names the brand or product (for example iPad, iPhone, Google or Android App) that the expressed emotion is directed at. This allows for targeted sentiment analysis specific to each brand's performance.


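3. is_there_an_emotion_directed_at_a_brand_or_product(**target variable**): This categorical column labels each tweet as "Positive emotion", "Negative emotion", "No emotion toward brand or product", or "I can't tell". It provides a quick indicator of brand-related sentiment and is later converted into a binary target (positive emotion vs. everything else).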

# Data Cleaning

In [None]:
# Shorten the three verbose column names with a single rename call:
#   tweet_text                                         -> text
#   emotion_in_tweet_is_directed_at                    -> target
#   is_there_an_emotion_directed_at_a_brand_or_product -> emotion
column_aliases = {
    'tweet_text': 'text',
    'emotion_in_tweet_is_directed_at': 'target',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion',
}
data = data.rename(columns=column_aliases)

data.head()

In [None]:
# Collapse the product-level labels in 'target' into their parent brand.
# str.replace performs substring replacement, so the order matters: the
# longer labels must be rewritten before the bare "iPad"/"iPhone" entries,
# otherwise "iPad or iPhone App" would turn into "Apple or Apple App".
# "Other Apple product or service" maps to "Apple" (it previously mapped to
# "Android", which mislabelled those tweets).
brand_replacements = (
    ("Other Apple product or service", "Apple"),
    ("Other Google product or service", "Google"),
    ("iPad or iPhone App", "Apple"),
    ("Android App", "Android"),
    ("iPad", "Apple"),
    ("iPhone", "Apple"),
)
for product_label, brand in brand_replacements:
    data['target'] = data['target'].str.replace(product_label, brand, case=False, regex=False)
# Shorten the "No emotion toward brand or product" label to "No emotion"
data['emotion'] = data['emotion'].str.replace("No emotion toward brand or product", "No emotion", case=False, regex=False)

data.head()




In [None]:
# Count the missing values in each column
data.isnull().sum()

In [None]:
# Label rows without a recorded target as "Undetermined", then discard every
# row that still contains a missing value in any column
data = data.fillna({'target': "Undetermined"}).dropna()

# Data Pre-processing

- The cleaning functions and their logic are defined first; however, the preprocessing itself is applied only after the train/test split in order to prevent data leakage.

In [None]:
# Create a function to clean the text
def clean_text(text: str) -> str:
    """Normalise a raw tweet into a lower-case string of lemmatized words.

    Steps, in order:
      1. Tokenize with the module-level ``tweet_tknzr`` and re-join the tokens
         with single spaces.
      2. Remove hyperlinks, @mentions, HTML entities, ``{...}`` placeholders,
         ``.com`` domains, the retweet marker "RT" and non-ASCII runs. These
         patterns rely on punctuation, so they run BEFORE punctuation is
         stripped.
      3. Remove the dataset-specific keywords link / video / sxsw / sxtx
         (case-insensitively).
      4. Strip the remaining punctuation (this also removes ``#``) and digits,
         then lower-case.
      5. Drop English stop words plus the Twitter jargon "rt" and "amp".
      6. Lemmatize each remaining word with WordNet (default noun POS).

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned words joined by single spaces; an empty string when
        nothing survives the cleaning.

    Note:
        ``tweet_tknzr`` is not defined in this function; it must exist at
        module level before the first call (presumably a ``TweetTokenizer``
        configured to strip handles - confirm where it is created).
    """
    # TweetTokenizer puts each punctuation mark in its own token; re-join the
    # tokens so the regular expressions below operate on one string
    text = " ".join(tweet_tknzr.tokenize(text))

    # --- removals that need punctuation to still be present ---
    # Hyperlinks
    text = re.sub(r'https?:\/\/\S+', '', text)
    # @mentions (underscores are valid in handles, hence \w)
    text = re.sub(r'@\w+', '', text)
    # HTML entities such as &amp;
    text = re.sub(r'&[a-z]+;', '', text)
    # Curly-bracket placeholders such as {link}
    text = re.sub(r'\{.+?\}', '', text)
    # Bare domains such as www.example.com or example.com
    text = re.sub(r'\b(?:www\.)?[\w-]+\.com\b', ' ', text, flags=re.IGNORECASE)
    # Retweet marker; the leading \b keeps words that merely end in "RT" intact
    text = re.sub(r'\bRT\s+', '', text)
    # Non-ASCII (special) characters
    text = re.sub(r"[^\x00-\x7F]+\ *(?:[^\x00-\x7F]|)*", " ", text)

    # Dataset-specific keywords, matched as whole words in any casing
    text = re.sub(r'\b(?:link|video|sxsw|sxtx)\b', ' ', text, flags=re.IGNORECASE)

    # Remaining punctuation (including '#') and leftover numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Lower all text
    text = text.lower()

    # Stop words and common Twitter jargon; a set gives O(1) membership tests
    stop_words = set(stopwords.words('english')) | {'rt', 'amp'}
    lemmatizer = WordNetLemmatizer()
    # split() already discards empty strings, so no separate empty-token pass
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    # Return the lemmatized words (the lemmas used to be computed and dropped)
    return ' '.join(words)

In [None]:
# Cast the 'text' column to the pandas 'string' dtype
data = data.astype({'text': 'string'})

In [None]:
# Binarise the emotion labels: "Positive emotion" becomes 1 and every other
# label ("No emotion", "Negative emotion", "I can't tell") becomes 0
dict_target = {
    label: int(label == 'Positive emotion')
    for label in ('No emotion', 'Positive emotion', 'Negative emotion', "I can't tell")
}
data['emotion'] = data['emotion'].map(dict_target)

data.head()