In [1]:
import pandas as pd
import re
import numpy as np
import math
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

from string import punctuation

# Importing the data

In [2]:
positive_keywords = ["feliz",
                     "amor",
                     "obrigado OR obrigada",
                     "ótimo OR ótima",
                     "parabéns",
                     "fantástico OR fantástica", 
                     "maravilha OR maravilhoso OR maravilhosa"]
            
                     
negative_keywords = ["fml",
                     "péssimo OR péssima",
                     "trágico OR trágica",
                     "horrível",
                     "mau OR má",
                     "terrível", 
                     "detesto OR detestei"]

In [3]:
df = pd.DataFrame(columns=["id", "created_at", "tweet", "keyword", "target"])

In [4]:
df

Unnamed: 0,id,created_at,tweet,keyword,target


In [5]:
for keyword in positive_keywords:
    temp_df = pd.read_csv(f"data/raw_tweets_{keyword}.csv", header=None)
    temp_df.columns = ["id", "created_at", "tweet"]
    temp_df["keyword"] = [keyword] * len(temp_df)
    temp_df["target"] = ["positive"] * len(temp_df)
    df = df.append(temp_df, ignore_index=True)

In [6]:
for keyword in negative_keywords:
    temp_df = pd.read_csv(f"data/raw_tweets_{keyword}.csv", header=None)
    temp_df.columns = ["id", "created_at", "tweet"]
    temp_df["keyword"] = [keyword] * len(temp_df)
    temp_df["target"] = ["negative"] * len(temp_df)
    df = df.append(temp_df, ignore_index=True)

In [7]:
df

Unnamed: 0,id,created_at,tweet,keyword,target
0,1396047477695029249,2021-05-22 10:17:10+00:00,Tava tão feliz c o apartamento mas acho q é golpe,feliz,positive
1,1396047411047542785,2021-05-22 10:16:54+00:00,@rita_castro1 Bom dia Sweetie!! Sábado feliz!!...,feliz,positive
2,1396047195921604611,2021-05-22 10:16:03+00:00,Bom dia e um feliz sábado a todos ✌🏼💜🍀. 😘😘 htt...,feliz,positive
3,1396046918153904128,2021-05-22 10:14:57+00:00,Eu estou tão feliz pela Hande ela merece tudo !,feliz,positive
4,1396045926016368642,2021-05-22 10:11:00+00:00,Estou tao feliz finalmente em Castelo Branco c...,feliz,positive
...,...,...,...,...,...
23327,1397872771640827908,2021-05-27 11:10:14+00:00,Eu: detesto musicais 🤮🤮🤮🤮\n\nAlso eu a dois mi...,detesto OR detestei,negative
23328,1397867369276579840,2021-05-27 10:48:46+00:00,Detesto está situação poha,detesto OR detestei,negative
23329,1397839222883688449,2021-05-27 08:56:55+00:00,Que linda noite de sono ao sonhar com a pessoa...,detesto OR detestei,negative
23330,1397833381099061248,2021-05-27 08:33:43+00:00,@Joaohpr Também detesto e evito sempre que exi...,detesto OR detestei,negative


In [8]:
df["tweet"].tail(50)

23282    @marekjusk @inesmorsantos Longe disso!! Detest...
23283    @ruiclassic detesto o pizzi eu, mas tu es dema...
23284    @MiguelBushes Eu já detesto aquela comissão de...
23285    @AnaPRDeepBlue Ainda agora li da variante Indi...
23286    @_V_Castello 😂😂\nNormalmente, detesto ter razã...
23287    Detesto quando combino uma hora e a pessoa che...
23288    @nunocalvin Padeço do oposto em Lisboa e detes...
23289         Detesto q fodam as merdas q tenho combinadas
23290    @offcrissy Detesto isso pq o humor negro (o "v...
23291                 Detesto ter q tomar estas decisões 😥
23292    @dianarrmiranda eu vou na 6a, houve um episodi...
23293         Juro, detesto pessoal gemado no Clash Royale
23294    Detesto conversar com gente que conta várias s...
23295                    Está um calor do caralho, detesto
23296    E gostava de dizer o seguinte: O IMI não devia...
23297    Dois pidezinhos de esquerda bloqueados logo de...
23298                      Já disse que detesto ortopedi

In [9]:
# Some tweets may contain more than one keyword, and can therefore appear more than once in the dataset; this is not desirable, 
# so I will remove them

df = df.drop_duplicates(subset="id")

# Data cleaning - functions
These functions were also saved to a [.py file](data_cleaning_functions.py) that can be imported in the next steps of training and validating the model

In [10]:
def remove_pt_special_chars(text):
    '''Replaces portuguese special characters (accented vowels and ç) with their base cognates. Input should be lower case.'''
    text = re.sub(r"[àáãâ]", "a", text)
    text = re.sub(r"[éê]", "e", text)
    text = re.sub(r"[í]", "i", text)
    text = re.sub(r"[óôõ]", "o", text)
    text = re.sub(r"[ú]", "u", text)
    text = re.sub(r"[ç]", "c", text)
    return(text)

print(df.loc[19, "tweet"])
print(remove_pt_special_chars(df.loc[19, "tweet"].lower()))

@TelmaCarlos7 vamos roubar na melicia, tudo p te fazer feliz
@telmacarlos7 vamos roubar na melicia, tudo p te fazer feliz


In [11]:
def remove_mentions(text):
    '''Removes mentions to other twitter users.'''
    text = re.sub(r"(@[^ !]*)", "", text)
    return(text)

print(df.loc[19, "tweet"])
print(remove_mentions(df.loc[19, "tweet"]))

@TelmaCarlos7 vamos roubar na melicia, tudo p te fazer feliz
 vamos roubar na melicia, tudo p te fazer feliz


In [12]:
def remove_links(text):
    '''Removes links from text.'''
    text = re.sub(r"http[\S]*", "", text)
    text = re.sub(r"www.[\S]*", "", text)
    return(text)

print(df.loc[1, "tweet"])
print(remove_links(df.loc[1, "tweet"]))

@rita_castro1 Bom dia Sweetie!! Sábado feliz!! 🌞🙏🥰☕😘
@rita_castro1 Bom dia Sweetie!! Sábado feliz!! 🌞🙏🥰☕😘


In [13]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
f"[{punctuation[1:]}“”]"

'["#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”]'

In [15]:
def clean_punctuation_and_newlines(text):
    '''Removes punctuation *except* exclamation marks. Reduces multiple exclamation marks to a single one and adds a space 
    between words and exclamation mark. Also removes newline characters and reduces multiple spaces to one'''
    text = re.sub(f"[{punctuation[1:]}“”]", " ", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"(!+)", " ! ", text)
    text = re.sub(r" +", " ", text)
    return(text)

print(df.loc[5, "tweet"])
print(clean_punctuation_and_newlines(df.loc[5, "tweet"]))

@Maraia21401743 Obrigada Maraia!🤗♥️
Feliz dia,bom fim de semana!😘🌼🌹
#CanYaman
#SandokanTheSeries
#Sandokan
#LuxVide https://t.co/uixkrq949k
 Maraia21401743 Obrigada Maraia ! 🤗♥️ Feliz dia bom fim de semana ! 😘🌼🌹 CanYaman SandokanTheSeries Sandokan LuxVide https t co uixkrq949k


In [16]:
def remove_numbers(text):
    '''Removes numbers and words containing numbers'''
    text = re.sub(r"\S*[0-9]+\S*", " ", text)
    return(text)

print(df.loc[17, "tweet"])
print(remove_numbers(df.loc[17, "tweet"]))

@r_rodrigues0303 @0705Rita fico feliz por ouvir isso
    fico feliz por ouvir isso


In [17]:
def remove_emojis(text):
    '''Removes most emojis from the text; some are still left.'''
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    return RE_EMOJI.sub(r'', text)

print(df.loc[1, "tweet"])
print(remove_emojis(df.loc[1, "tweet"]))

@rita_castro1 Bom dia Sweetie!! Sábado feliz!! 🌞🙏🥰☕😘
@rita_castro1 Bom dia Sweetie!! Sábado feliz!! ☕


In [18]:
def remove_repeated_letters(text):
    '''Reduces repeated letters except r or s anywhere in each word by a single letter. Repeated r or s inside words are 
    reduced to rr or ss respectively, to account for words where this naturally occurs. Repeated r or s at the end of words are
    reduced to single letters.
    '''
    
    clean_text = []
    for i in text.split():
        word = re.sub(r"([a-z])\1{1,}$", r"\1", i)         #replaces repeated letters at the end of the word with single letter
        word = re.sub(r"([^sr])\1{1,}(?=.)", r"\1", word)  #replaces repeated letters (except r or s) anywhere with single letter
        word = re.sub(r"([rs])\1{1,}(?=.)", r"\1\1", word) #replaces repeated internal r or s with double r or s
        clean_text.append(word)
    return(" ".join(clean_text))

print("feliz aniversario aaaaaaa vem ca amorr")
print(remove_repeated_letters("feliz aniversario aaaaaaa vem ca amorr"))
print("")
print("oh meu deeeeusss isso e fanttaaastticooo adoorroooo arrre")
print(remove_repeated_letters("oh meu deeeeusss isso e fanttaaastticooo adoorroooo arrre"))

feliz aniversario aaaaaaa vem ca amorr
feliz aniversario a vem ca amor

oh meu deeeeusss isso e fanttaaastticooo adoorroooo arrre
oh meu deus isso e fantastico adorro arre


In [25]:
def apply_stemming(text):    
    '''Applies stemming using the Porter stemmer implementation for portuguese contained in the nltk library.
    Still not sure if it's worth using this or not, because the results are... sketchy, to say the least.
    '''
    stemmer = RSLPStemmer()
    return(" ".join([stemmer.stem(i) for i in text.split()]))

print(df.loc[8, "tweet"])
print(apply_stemming(df.loc[8, "tweet"]))

Benzema: Estar na seleção é uma recompensa e estou super feliz https://t.co/zY6FYKd266
benzema: est na seleç é uma recompens e est sup feliz https://t.co/zy6fykd266


In [26]:
def clean_up_tweets(tweet):
    clean_tweet = tweet.lower()
    clean_tweet = remove_mentions(clean_tweet)
    clean_tweet = remove_links(clean_tweet)
    clean_tweet = remove_numbers(clean_tweet)
    clean_tweet = remove_emojis(clean_tweet)
    clean_tweet = clean_punctuation_and_newlines(clean_tweet)
    clean_tweet = apply_stemming(clean_tweet)
    clean_tweet = remove_pt_special_chars(clean_tweet) 
    clean_tweet = remove_repeated_letters(clean_tweet)

    return(clean_tweet.strip(" "))

print(df.loc[0, "tweet"])
print(clean_up_tweets(df.loc[0, "tweet"]))

Tava tão feliz c o apartamento mas acho q é golpe
tav tao feliz c o apart mas ach q e golp


In [27]:
processed_stopwords = [clean_up_tweets(i) for i in stopwords.words("portuguese")] + ["q", "k", "c", "p"]
processed_stopwords.remove("nao")

processed_keywords = []

for i in positive_keywords + negative_keywords:
    for j in i.split():
        if j != "OR":
            processed_keywords.append(clean_up_tweets(j))

processed_stopwords = list(set(processed_stopwords + processed_keywords))

In [28]:
def remove_stopwords(text, stopwords):
    return(" ".join([i for i in text.split() if i not in stopwords]))

print(df.loc[0,"tweet"])
print(clean_up_tweets(df.loc[0,"tweet"]))
print(remove_stopwords(clean_up_tweets(df.loc[0,"tweet"]), processed_stopwords))

Tava tão feliz c o apartamento mas acho q é golpe
tav tao feliz c o apart mas ach q e golp
tav tao apart ach golp


# Cleaning up the tweets
Here I apply all data cleaning functions written above to my corpus of raw tweets

In [29]:
df["clean_tweet"] = df["tweet"].apply(lambda x: remove_stopwords(clean_up_tweets(x), processed_stopwords))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["clean_tweet"] = df["tweet"].apply(lambda x: remove_stopwords(clean_up_tweets(x), processed_stopwords))


In [30]:
df

Unnamed: 0,id,created_at,tweet,keyword,target,clean_tweet
0,1396047477695029249,2021-05-22 10:17:10+00:00,Tava tão feliz c o apartamento mas acho q é golpe,feliz,positive,tav tao apart ach golp
1,1396047411047542785,2021-05-22 10:16:54+00:00,@rita_castro1 Bom dia Sweetie!! Sábado feliz!!...,feliz,positive,bom dia sweti ! sab ! ☕
2,1396047195921604611,2021-05-22 10:16:03+00:00,Bom dia e um feliz sábado a todos ✌🏼💜🍀. 😘😘 htt...,feliz,positive,bom dia sab tod ✌
3,1396046918153904128,2021-05-22 10:14:57+00:00,Eu estou tão feliz pela Hande ela merece tudo !,feliz,positive,tao hand merec tud !
4,1396045926016368642,2021-05-22 10:11:00+00:00,Estou tao feliz finalmente em Castelo Branco c...,feliz,positive,tao final castel branc xuxu
...,...,...,...,...,...,...
23327,1397872771640827908,2021-05-27 11:10:14+00:00,Eu: detesto musicais 🤮🤮🤮🤮\n\nAlso eu a dois mi...,detesto OR detestei,negative,music als doi minut episodi music anatom grey ...
23328,1397867369276579840,2021-05-27 10:48:46+00:00,Detesto está situação poha,detesto OR detestei,negative,situ poh
23329,1397839222883688449,2021-05-27 08:56:55+00:00,Que linda noite de sono ao sonhar com a pessoa...,detesto OR detestei,negative,lind noit son sonh pesso conhec faculdad ent ta
23330,1397833381099061248,2021-05-27 08:33:43+00:00,@Joaohpr Também detesto e evito sempre que exi...,detesto OR detestei,negative,evit sempr exist altern fac tap ryana ra tem…


In [31]:
outfile = open("data/df", 'wb')
pickle.dump(df, outfile)
outfile.close()

# Building a document-term matrix using CountVectorizer
This produces a scipy sparse matrix with all tweets as rows and all words as columns. It's pretty big; it is essential to keep it as a sparse data structure to avoid memory errors. sklearn can take this kind of objects as input without needing to turn them into dense structures, so that's what I will do.

In [32]:
vectorizer = CountVectorizer()

In [33]:
doc_term_matrix = vectorizer.fit_transform(df["clean_tweet"])

In [34]:
outfile = open("data/vectorizer", 'wb')
pickle.dump(vectorizer, outfile)
outfile.close()

In [35]:
doc_term_matrix

<22842x12569 sparse matrix of type '<class 'numpy.int64'>'
	with 150810 stored elements in Compressed Sparse Row format>

In [36]:
outfile = open("data/doc_term_matrix", 'wb')
pickle.dump(doc_term_matrix, outfile)
outfile.close()

# Building a pivoted version of the document-term matrix
I will still need the scipy one for the models I implement on sklearn, but I also want to implement the Naive Bayes model from scratch. To do this, it is very convenient to produce a pivot table from this document-term matrix. I'm not sure how to do this efficiently with a scipy sparse matrix object, but with pandas it's quite straightforward. Luckily, pandas has a sparse version of dataframes, so I will convert my scipy sparse matrix object to one of those. 

In [37]:
doc_term_matrix = pd.DataFrame.sparse.from_spmatrix(doc_term_matrix, columns = vectorizer.get_feature_names())
doc_term_matrix

Unnamed: 0,ab,abac,abacat,abacax,abaf,abaix,abal,aban,abandon,abat,...,zuk,zulm,zum,zumb,zuzu,ℂℙ,ℕℂℕ,ℙℝ,스트레이키즈,제이크
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22838,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
doc_term_matrix["target_value"] = np.where(df["target"]=="positive", 1, 0)


  doc_term_matrix["target_value"] = np.where(df["target"]=="positive", 1, 0)


In [39]:
pivoted_dcm = doc_term_matrix.pivot_table(columns=["target_value"], aggfunc="sum")
pivoted_dcm

target_value,0,1
ab,2,3
abac,0,1
abacat,1,2
abacax,0,1
abaf,2,1
...,...,...
ℂℙ,0,1
ℕℂℕ,0,1
ℙℝ,0,1
스트레이키즈,0,1


In [40]:
outfile = open("data/pivoted_dcm", 'wb')
pickle.dump(pivoted_dcm, outfile)
outfile.close()

# Other/miscellanea

In [41]:
doc_term_matrix.sum().sort_values(ascending = False)[:60]

target_value    15786
nao              5283
dia              1577
tod              1346
faz              1301
tao              1107
fic              1038
tud               946
jog               930
pesso             847
bom               832
ver               817
bem               816
ano               801
quer              790
sempr             769
pod               766
vid               761
deu               696
vai               681
sab               660
melhor            649
ach               644
aind              641
hoj               633
cois              625
ta                621
diz               611
pas               607
agor              585
pra               585
assim             583
outr              575
gost              569
lind              569
nad               548
vou               537
porqu             530
fal               526
deix              487
vez               475
grand             474
sei               470
sim               467
nunc              462
cas       

## Removing words that are too long to be true

In [42]:
word_lengths = pd.DataFrame([doc_term_matrix.columns, [len(i) for i in doc_term_matrix.columns]]).transpose()
word_lengths.columns = ["word", "length"]
word_lengths

Unnamed: 0,word,length
0,ab,2
1,abac,4
2,abacat,6
3,abacax,6
4,abaf,4
...,...,...
12565,ℕℂℕ,3
12566,ℙℝ,2
12567,스트레이키즈,6
12568,제이크,3


In [43]:
word_lengths["length"].value_counts()

5     2328
4     2266
6     2230
3     1602
7     1489
8      928
9      531
2      361
10     301
11     145
12     115
13      70
14      45
15      33
16      29
18      22
17      19
19      13
21       7
20       7
22       6
26       5
23       5
24       4
25       2
28       2
30       1
42       1
39       1
60       1
27       1
Name: length, dtype: int64

In [44]:
word_lengths.loc[word_lengths["length"]>10][0:60]
#word_lengths.loc[word_lengths["length"]>10][0:60].to_csv("data/word_length_sample.csv")

Unnamed: 0,word,length
11,abdelmassih,11
38,abracadinhos,12
39,abracadinhosparatodav,21
40,abracadinhostod,15
84,aceitamaristel,14
95,achievement,11
180,adolescente,11
250,agarradinhosmuit,16
287,aguamonchiqu,12
305,ahahahahaha,11


At one point, I took a sample of 60 words over 10 characters. With the addition of new data this sample will likely change so I saved it to data/word_length_sample.csv for future referencing. 

A threshold of 12 letters seemed to be suitable to get rid of nonsense words. Out of this sample of 60 words roots over 10 characters, 18.3% were meaningful portuguese words that can carry meaning, all of them 12 characters or fewer; no words over 12 characters were found to be meaningful portuguese words.

(autodetermin, caleidoscopi, coincidissem, comparticip, concentraca, confidencial, consentimen, conservatori, contraditori, contribuint, contribuint)

In [45]:
word_lengths.loc[word_lengths["length"]>10].index

Int64Index([   11,    38,    39,    40,    84,    95,   180,   250,   287,
              305,
            ...
            12299, 12333, 12358, 12366, 12392, 12453, 12502, 12508, 12512,
            12569],
           dtype='int64', length=534)

In [46]:
doc_term_matrix = doc_term_matrix.iloc[:, word_lengths.loc[word_lengths["length"]<13].index]

In [47]:
n_words_clean = doc_term_matrix.sum(axis=1)
n_words_clean

0         6
1         5
2         5
3         5
4         6
         ..
22837    10
22838     2
22839     9
22840     9
22841    14
Length: 22842, dtype: int64

# Things to do:
- add a function to reduce letters repeated more than twice --> DONE
- add a function to remove plurals (unnecessary if stemming)

In [None]:
#fml might not be a good negative keyword. Turns out, in brazilian portuguese it is also used as a diminutive for familia,
#which is used coloquially kind of like fam in US english

df.loc[df["keyword"]=="fml", "tweet"]

In [23]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [24]:
nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


True

In [10]:
df["target"].value_counts()

positive    16139
negative     7193
Name: target, dtype: int64