# Sentiment Analysis of Tweets with Emojis
This is method 2.
Where we use a dataset contatining both utf-8 emoticons and tweets texts. We will be training it using naive bayes.

In [1]:
import pandas as pd
import numpy as np

### Importing the data
we will be importing the dataset that we processed

In [2]:
df_emojis = pd.read_csv('dataset/15_emoticon_data.csv')
df_tweets = pd.read_csv('dataset/1k_data_emoji_tweets_senti_posneg.csv')

In [3]:
# the emojis used in the 1k tweet data
# does not have sentiment cause the naive bayes will assign the sentiment automatically
df_emojis

Unnamed: 0.1,Unnamed: 0,Emoji,Unicode codepoint,Unicode name
0,0,😍,0x1f60d,SMILING FACE WITH HEART-SHAPED EYES
1,1,😭,0x1f62d,LOUDLY CRYING FACE
2,2,😘,0x1f618,FACE THROWING A KISS
3,3,😊,0x1f60a,SMILING FACE WITH SMILING EYES
4,4,😁,0x1f601,GRINNING FACE WITH SMILING EYES
5,5,😉,0x1f609,WINKING FACE
6,6,😄,0x1f604,SMILING FACE WITH OPEN MOUTH AND SMILING EYES
7,7,😒,0x1f612,UNAMUSED FACE
8,8,😔,0x1f614,PENSIVE FACE
9,9,😢,0x1f622,CRYING FACE


In [4]:
df_tweets

Unnamed: 0.1,Unnamed: 0,sentiment,post
0,0,0,One year ago today 😧 .1
1,1,1,keep smiling happy.1
2,2,0,It's hard to imagine anyone but Robin 😧 but st...
3,3,1,Good luck to Rich riding for great project in ...
4,4,1,He didn't play for a year
...,...,...,...
995,995,0,Maa ki kuss tumhari. Now take this bullshit yo...
996,996,0,Pozuelo (formerly of Swans) and Suso (Liverpoo...
997,997,0,Louis_Tomlinson follow me please? 😧
998,998,1,you know what i think? you look exceptional fo...


### Possible Methods for Sentiment Analysis:
1. this is method 2, simply using naive bayes to train the dataset
    - pros : easiest to do, just focus on the naive bayes(NB) classifier, less preprocessing
    - cons : the sentiments of emojis might not be accurate since it is based on the analysis of the given data to the NB classifier
    
2. another method, converting the emoticons to words first, before training the data
    - pros : adds more vocabulary to the NB classifier, might have better sentiment analysis
    - cons : additional data preprocessing, plus the cons of method above
    
3. can use the old method (method 1), separate text and emoji
    - pros : emoji sentiment is based on the emoji data, thus will not be affected by the NB classifier
    - cons : emoji sentiment might strongly affect the sentiment analysis (more emoji == less text sentiment influence). More data processing cause of text and symbol separation

### Classification

In [7]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [6]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 768.3 kB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp39-cp39-win_amd64.whl (267 kB)
     ------------------------------------ 267.8/267.8 kB 968.1 kB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2022.10.31
Note: you may need to restart the kernel to use updated packages.




In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
# TFIDF vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,
                            strip_accents='ascii', stop_words='english')

In [11]:
df_tweets2 = df_tweets

In [14]:
# dependent variable will be linked as:
# 0 = negative, 1 = positive
y = df_tweets2.sentiment
# convert 'sentence' from text to features
X = vectorizer.fit_transform(df_tweets2.post)

print(y.shape)
print(X.shape)
print(f'{X.shape[0]} observations X {X.shape[1]} unique words')


(1000,)
(1000, 2035)
1000 observations X 2035 unique words


### Naive Bayes - Training & Testing

In [27]:
# Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=None)

# we will train a naive bayes classifier
clf = naive_bayes.MultinomialNB()
# clf = naive_bayes.BernoulliNB()

clf.fit(X_train, y_train)

# test our models accuracy
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.7648076923076923

### Build tweet sentiment analyzer

In [29]:
def get_sentiment(s_input = '😧 I hate sentiment analysis 😧'):
    # turn input into array
    input_array= np.array([s_input])

    # vectorize the input
    input_vector = vectorizer.transform(input_array)
    # predict the score of vector
    
    pred_senti = clf.predict(input_vector)

    return pred_senti[0]
print(get_sentiment())

1


### Tweet Something

In [19]:
import ipywidgets as widgets
import warnings; warnings.simplefilter('ignore')

In [20]:
def print_senti_status(test):
    print('========================================')
    print(f'Your input is "{test}" \n')
    sentiment = get_sentiment(test)
    sentiment = 'Positive' if sentiment == 1 else 'Negative'
    print(f'\nYour input is of "{sentiment}" sentiment'.upper())
    print('========================================')
    

In [21]:
# for text area
l = widgets.Layout(flex='0 1 auto', height='50px',width='auto')
post_tweet = widgets.Textarea(value='😍 I love sentiment analysis 😊', layout=l)
print(post_tweet.value)
# for button
button = widgets.Button(description="Say your Sentiments!")
output = widgets.Output()

def on_tweet_clicked(b):
    output.clear_output()
    with output:
        output.layout={'border': '1px solid black'}
        print_senti_status(post_tweet.value)

😍 I love sentiment analysis 😊


In [22]:
# type post below
display(post_tweet,button, output)
button.on_click(on_tweet_clicked)

Textarea(value='😍 I love sentiment analysis 😊', layout=Layout(flex='0 1 auto', height='50px', width='auto'))

Button(description='Say your Sentiments!', style=ButtonStyle())

Output()