# From Sentiments to Strategies: Building an NLP Model for Brand Engagement

from IPython.display import Image

# Banner image shown under the notebook title; it is fetched from the remote URL
# when the cell output is rendered.
Image(url = "https://storage.ning.com/topology/rest/1.0/file/get/3780584426?profile=original",width = 1000, height=800)


# Data understanding

The dataset was sourced from Kaggle.

There are 9093 records and 3 features in this data.

Associated columns in the dataset are:
- `tweet_text`: Contains the text of the tweets.

- `emotion_in_tweet_is_directed_at`: Indicates the brand or product mentioned in the tweet (many missing values).

- `is_there_an_emotion_directed_at_a_brand_or_product`: Categorizes the sentiment as "Positive emotion," "Negative emotion," or potentially other classes.

The columns will be renamed to shorter, more manageable names in the data cleaning steps.

# Data Preparation

In [15]:
# Data Exploration Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Preprocessing and Feature Extraction
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report

# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier 


In [14]:
import nltk

# Fetch the NLTK data packages this notebook relies on (stop-word lists,
# WordNet and its omw-1.4 companion package) before any text preprocessing runs.
for _pkg in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_pkg)

# The Punkt tokenizer download stays as the final expression so the cell
# still displays its return value.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
class DataOverview:
    """
    Thin convenience wrapper around a pandas DataFrame for quick inspection.

    The wrapper stores a reference to the DataFrame it is given (no copy is
    made), so in-place changes to that DataFrame are visible through this
    object.

    Parameters
    ----------
    data : pandas.DataFrame
        The DataFrame to inspect.
    """

    def __init__(self, data):
        self.data = data

    def _numeric_corr(self):
        """Returns the pairwise correlation of the numeric/boolean columns only.

        Non-numeric columns (e.g. free text) are excluded up front because
        ``DataFrame.corr()`` raises on them in pandas >= 2.0. If there are no
        numeric columns the result is an empty DataFrame.
        """
        return self.data.select_dtypes(include=['number', 'bool']).corr()

    def read_head(self):
        """Returns the first 5 rows"""
        return self.data.head()

    def read_columns(self):
        """Returns the columns of the DataFrame"""
        return self.data.columns

    def read_info(self):
        """Prints the features, datatypes and non-null counts.

        ``DataFrame.info()`` writes its summary to stdout and returns None,
        so this method returns None as well.
        """
        return self.data.info()

    def read_describe(self):
        """Returns the statistical summary of the dataset"""
        return self.data.describe()

    def read_shape(self):
        """Returns the number of rows and columns as a (rows, columns) tuple"""
        return self.data.shape

    def read_corr(self):
        """Returns a correlation dataframe of the numeric columns"""
        return self._numeric_corr()

    def read_corr_wrt_target(self, target='churn'):
        """Returns a Series containing the correlation of features with respect to target.

        The Series is sorted from the most positive to the most negative
        correlation.

        Raises
        ------
        KeyError
            If ``target`` is not one of the numeric columns of the data.
        """
        corr = self._numeric_corr()
        if target not in corr.columns:
            raise KeyError(
                f"{target!r} is not a numeric column of the data; "
                f"numeric columns are {list(corr.columns)}"
            )
        return corr[target].sort_values(ascending=False)

    def read_multicollinearity(self, target='churn'):
        """Returns a correlation dataframe without the target.

        When ``target`` is one of the numeric columns, its row and column are
        removed by label. Otherwise the last row and column are dropped by
        position, which is what this method previously did unconditionally
        (it assumed the target was the last column).
        """
        corr = self._numeric_corr()
        if target in corr.columns:
            return corr.drop(index=target, columns=target)
        # Backward-compatible fallback for callers relying on "target is last".
        return corr.iloc[0:-1, 0:-1]

    def read_na(self):
        """Returns the sum of all null values per feature"""
        return self.data.isna().sum()

    def read_duplicated(self):
        """Returns the sum of all duplicated records"""
        return self.data.duplicated().sum()

In [17]:
# Load the raw tweets from the project's data folder.
# The CSV is decoded as ISO-8859-1 (Latin-1).
filepath = '../data/tweet_product_company.csv'
df = pd.read_csv(filepath_or_buffer=filepath, encoding='iso-8859-1')

In [18]:
# Wrap the loaded DataFrame in the overview helper used by the inspection cells
dprep = DataOverview(df)

# Preview the first five rows
dprep.read_head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [19]:
# Give the columns short names for the rest of the notebook.
# The rename is deliberately done in place: `dprep` holds a reference to this
# same DataFrame object, so its read_* calls pick up the new names as well.
df.rename(
    columns={
        'tweet_text': 'text',
        'emotion_in_tweet_is_directed_at': 'product',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion',
    },
    inplace=True,
)


In [20]:
# Show each column's name, dtype and non-null count for the wrapped DataFrame
# (the helper delegates to DataFrame.info()).
dprep.read_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9092 non-null   object
 1   product  3291 non-null   object
 2   emotion  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [21]:
# Count the missing (null) values in each column
dprep.read_na()

text          1
product    5802
emotion       0
dtype: int64

The `product` column has 5802 null values, i.e. about 64% of all records in the dataset.