In [8]:
import pandas as pd
import numpy as np
import pickle
from math import e

from sklearn.feature_extraction.text import CountVectorizer
import data_cleaning_functions as dcf

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [9]:
# Load the cleaned tweets dataframe produced by the data-preparation step.
# NOTE(security): pickle.load can execute arbitrary code; only load files created by this project.
# The context manager guarantees the handle is closed even if unpickling fails.
with open("data/df", "rb") as infile:
    df = pickle.load(infile)
df

Unnamed: 0,id,created_at,tweet,keyword,target,clean_tweet
0,1396047477695029249,2021-05-22 10:17:10+00:00,Tava tão feliz c o apartamento mas acho q é golpe,feliz,positive,tav tao apart ach golp
1,1396047411047542785,2021-05-22 10:16:54+00:00,@rita_castro1 Bom dia Sweetie!! Sábado feliz!!...,feliz,positive,bom dia sweti ! sab ! ☕
2,1396047195921604611,2021-05-22 10:16:03+00:00,Bom dia e um feliz sábado a todos ✌🏼💜🍀. 😘😘 htt...,feliz,positive,bom dia sab tod ✌
3,1396046918153904128,2021-05-22 10:14:57+00:00,Eu estou tão feliz pela Hande ela merece tudo !,feliz,positive,tao hand merec tud !
4,1396045926016368642,2021-05-22 10:11:00+00:00,Estou tao feliz finalmente em Castelo Branco c...,feliz,positive,tao final castel branc xuxu
...,...,...,...,...,...,...
23327,1397872771640827908,2021-05-27 11:10:14+00:00,Eu: detesto musicais 🤮🤮🤮🤮\n\nAlso eu a dois mi...,detesto OR detestei,negative,music als doi minut episodi music anatom grey ...
23328,1397867369276579840,2021-05-27 10:48:46+00:00,Detesto está situação poha,detesto OR detestei,negative,situ poh
23329,1397839222883688449,2021-05-27 08:56:55+00:00,Que linda noite de sono ao sonhar com a pessoa...,detesto OR detestei,negative,lind noit son sonh pesso conhec faculdad ent ta
23330,1397833381099061248,2021-05-27 08:33:43+00:00,@Joaohpr Também detesto e evito sempre que exi...,detesto OR detestei,negative,evit sempr exist altern fac tap ryana ra tem…


In [10]:
# Load the CountVectorizer that was fitted when the document-term matrix was built.
# NOTE(security): pickle.load can execute arbitrary code; only load files created by this project.
# The context manager guarantees the handle is closed even if unpickling fails.
with open("data/vectorizer", "rb") as infile:
    vectorizer = pickle.load(infile)
vectorizer

CountVectorizer()

In [11]:
# Ordered list of the vectorizer's feature names (one entry per column of the document-term matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() when it exists; .tolist() keeps `vocabulary` a plain list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()
vocabulary[0:10]

['ab',
 'abac',
 'abacat',
 'abacax',
 'abaf',
 'abaix',
 'abal',
 'aban',
 'abandon',
 'abat']

In [24]:
# Persist the vocabulary list; the context manager flushes and closes the file even on error.
with open("data/vocabulary", 'wb') as outfile:
    pickle.dump(vocabulary, outfile)

In [12]:
# Load the sparse document-term matrix (rows: tweets, columns: vocabulary terms).
# NOTE(security): pickle.load can execute arbitrary code; only load files created by this project.
# The context manager guarantees the handle is closed even if unpickling fails.
with open("data/doc_term_matrix", "rb") as infile:
    doc_term_matrix = pickle.load(infile)
doc_term_matrix

<22842x12569 sparse matrix of type '<class 'numpy.int64'>'
	with 150810 stored elements in Compressed Sparse Row format>

In [13]:
# Model training

# Fixed seed so the train/test split (and therefore every metric below) is reproducible
# when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

X = doc_term_matrix
# Binary target: 1 for tweets labelled "positive", 0 for every other label.
y = np.where(df["target"]=="positive", 1, 0)

# Stratify on y so both splits keep the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.8, random_state=RANDOM_STATE
)

# fit_prior=True: class priors are learned from the training labels.
model_multinomialNB = MultinomialNB(fit_prior=True)
model_multinomialNB.fit(X_train, y_train)

# Model testing

pred_values = model_multinomialNB.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"Accuracy: {round(accuracy_score(y_test, pred_values)*100, 2)}%")
# Pin the label order so rows/columns are guaranteed to match the names assigned here.
conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, pred_values, labels=[0, 1]),
    index=["actually negative", "actually positive"],
    columns=["predicted negative", "predicted positive"],
)
conf_matrix

Accuracy: 73.74%


Unnamed: 0,predicted negative,predicted positive
actually negative,596,815
actually positive,385,2773


In [14]:
# The model tends to favor predicting positive sentiment; it is not very good at predicting negative
# tweets. This makes sense: the training set is imbalanced (more than twice as many positive tweets
# as negative), and the naive Bayes model uses prior probabilities in its classification, which are
# estimated from the class distribution of the training dataset. This could be improved by collecting
# more tweets with negative keywords to balance the training dataset.

In [15]:
# Persist the trained classifier; the context manager flushes and closes the file even on error.
with open("data/model_multinomialNB", 'wb') as outfile:
    pickle.dump(model_multinomialNB, outfile)

In [16]:
# Predicting sentiment polarity of new text

# First, build a dataframe with the per-word probabilities of belonging to the negative or positive
# subset of the training dataset.
#
# feature_log_prob_[k] holds log P(word | class k); class 0 is negative and class 1 is positive
# (see how `y` is encoded). Building the frame from named float arrays keeps the columns numeric
# instead of the object dtype produced by transposing a list of mixed lists.
log_p_negative = np.asarray(model_multinomialNB.feature_log_prob_[0], dtype=float)
log_p_positive = np.asarray(model_multinomialNB.feature_log_prob_[1], dtype=float)

feature_log_probs_df = pd.DataFrame(
    {"log_p_negative": log_p_negative, "log_p_positive": log_p_positive},
    index=pd.Index(vocabulary, name="word"),
)

# Normalise the two likelihoods so they sum to 1 for each word:
#   p_negative = exp(a) / (exp(a) + exp(b)),  p_positive = exp(b) / (exp(a) + exp(b)).
# The shared denominator is computed once, in log space. Class priors are not part of these values.
log_normaliser = np.logaddexp(log_p_negative, log_p_positive)
feature_log_probs_df["p_negative"] = np.exp(log_p_negative - log_normaliser)
feature_log_probs_df["p_positive"] = np.exp(log_p_positive - log_normaliser)
feature_log_probs_df

Unnamed: 0_level_0,log_p_negative,log_p_positive,p_negative,p_positive
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ab,-9.869483,-10.068515,0.549594,0.450406
abac,-10.968095,-10.761662,0.448574,0.551426
abacat,-10.274948,-10.356197,0.520301,0.479699
abacax,-10.968095,-10.761662,0.448574,0.551426
abaf,-10.968095,-11.454809,0.619332,0.380668
...,...,...,...,...
ℂℙ,-10.968095,-10.761662,0.448574,0.551426
ℕℂℕ,-10.968095,-10.761662,0.448574,0.551426
ℙℝ,-10.968095,-10.761662,0.448574,0.551426
스트레이키즈,-10.968095,-10.761662,0.448574,0.551426


In [17]:
# Persist the per-word probability table; the context manager closes the file even on error.
with open("data/feature_log_probs_df", 'wb') as outfile:
    pickle.dump(feature_log_probs_df, outfile)

In [18]:
# With this table done, we can build the sentiment predictor function:

def sentiment_predictor(text, extended = False):
    """Predict the sentiment polarity of a text with the trained Naive Bayes model.

    The text is cleaned with the same helpers used for the training data, vectorized with the
    fitted ``vectorizer`` and classified by ``model_multinomialNB``. The predicted label
    ("Negative" or "Positive") is always printed.

    Parameters
    ----------
    text : str
        Raw input text (e.g. a tweet).
    extended : bool, default False
        If True, return the per-word probability table instead of the numeric label.

    Returns
    -------
    int or pandas.DataFrame
        With ``extended=False``: 0 for negative, 1 for positive.
        With ``extended=True``: a dataframe indexed by "processed word" with the columns
        "p_negative" and "p_positive", one row per recognised token, in order of appearance.
        The dataframe is empty when no token of the text is known to the model.

    Notes on the extended results
    -----------------------------
    Some words that are present in the input text might not appear in the extended results table.
    This is due to one of these reasons:
    - the word is one of the keywords used in building the training dataset
    - the word is a stopword (a list of words in portuguese that are so common that they don't
      contribute much meaning to the sentiment analysis)
    - the word was not present in the training dataset
    - the token is discarded by the vectorizer's tokenizer (with the default CountVectorizer
      settings, punctuation and single-character tokens)

    Sometimes, the average of the probabilities in the extended results table would suggest that
    the text should be considered negative but the result is positive. This is due to the fact
    that the training dataset was imbalanced, containing more positive tweets than negative ones.
    This ends up favoring positive classifications in edge cases.
    """
    processed_text = dcf.remove_stopwords(dcf.clean_up_tweets(text), dcf.processed_stopwords)

    # Reuse the fitted vectorizer rather than fitting a throw-away one on a single document:
    # the columns line up with the training matrix, the features are counts (as in training),
    # and an empty cleaned text yields an all-zero row instead of an "empty vocabulary" error.
    text_vector = vectorizer.transform([processed_text])
    prediction = model_multinomialNB.predict(text_vector)[0]

    print("Negative" if prediction == 0 else "Positive")

    if not extended:
        return int(prediction)

    # Tokenize exactly like the vectorizer does, then keep only the tokens the model knows;
    # looking up an unknown token with .loc would raise KeyError.
    tokens = vectorizer.build_analyzer()(processed_text)
    known_tokens = [token for token in tokens if token in feature_log_probs_df.index]

    word_probs_df = (feature_log_probs_df
                     .loc[known_tokens, ["p_negative", "p_positive"]]
                     .rename_axis("processed word"))
    return word_probs_df
    

In [19]:
# Testing the sentiment_predictor function

# Two hand-written Portuguese sentences used to sanity-check the predictor.
test_example_1, test_example_2 = (
    "O livro que li no sábado agradou-me muito, adorei",  # Positive polarity
    "Odeio o estúpido do treinador do Sporting nao posso crer",  # Negative polarity
)

In [20]:
# Compact mode: prints the predicted label and returns 1 (positive) or 0 (negative).
sentiment_predictor(test_example_1)

Positive


1

In [21]:
# Compact mode: prints the predicted label and returns 1 (positive) or 0 (negative).
sentiment_predictor(test_example_2)

Negative


0

In [22]:
# Extended mode: prints the predicted label and returns the per-word p_negative / p_positive table.
sentiment_predictor(test_example_1, extended=True)

Positive


Unnamed: 0_level_0,p_negative,p_positive
processed word,Unnamed: 1_level_1,Unnamed: 2_level_1
livr,0.413495,0.586505
li,0.474753,0.525247
sab,0.442441,0.557559
agrad,0.619332,0.380668
ador,0.28682,0.71318


In [23]:
# Extended mode: prints the predicted label and returns the per-word p_negative / p_positive table.
sentiment_predictor(test_example_2, extended=True)

Negative


Unnamed: 0_level_0,p_negative,p_positive
processed word,Unnamed: 1_level_1,Unnamed: 2_level_1
odei,0.67037,0.32963
estup,0.466967,0.533033
trein,0.647502,0.352498
sporting,0.363916,0.636084
nao,0.587917,0.412083
pos,0.569527,0.430473
cr,0.67037,0.32963
