# Dataset Preparation

Convertendo o arquivo CSV em um DataFrame para a preparação da análise de dados.

In [15]:
import pandas as pd
import pickle5 as pickle
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm

In [3]:
# Download the NLTK data packages required by the text-processing steps of
# this notebook. nltk.download() returns True on success and False on failure,
# so every status is kept instead of only the last one.
nltk_packages = ['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
download_ok = {package: nltk.download(package) for package in nltk_packages}

# Cell output: True only when every package was downloaded / is up to date.
all(download_ok.values())

[nltk_data] Downloading package punkt to /home/le4o/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/le4o/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/le4o/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/le4o/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Seed NumPy's global random generator so later random operations are repeatable
np.random.seed(500)

# Columns kept from the raw CSV
columns = ['tweet_id', 'sentiment', 'content']

# skip_blank_lines drops any completely blank lines found in the file
data = pd.read_csv(
    r'data/tweet_emotions.csv',
    skip_blank_lines=True,
)

# Build the working DataFrame restricted to the selected columns
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [5]:
def uppercase_to_lowercase(x):
    """Return the string form of ``x`` converted to lower case.

    The value is first passed through ``str()``, so non-string inputs
    (numbers, ``None``, ...) are accepted and lower-cased as text.
    """
    text = str(x)
    return text.lower()

In [6]:
# Remove blank rows. pandas reads empty CSV fields as NaN rather than "",
# so both representations are checked for the text columns.
blank_rows = (
    (df['tweet_id'] == 0)
    | df['sentiment'].isna() | (df['sentiment'] == "")
    | df['content'].isna() | (df['content'] == "")
)
df = df.loc[~blank_rows].copy()

# Convert the tweet text to lower case
df['content'] = df['content'].apply(uppercase_to_lowercase)

# Split every tweet into a list of tokens
df['content'] = [word_tokenize(text) for text in df['content']]

# WordNetLemmatizer needs to know whether a word is an adjective, verb,
# adverb or noun. The map is keyed by the first letter of the tag returned
# by pos_tag; any other tag falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant objects are created once: the stop-word list is loaded a
# single time (as a set for fast membership tests) and one lemmatizer is reused.
english_stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

text_final = []
for tokens in df['content']:
    # Keep only alphabetic, non-stop-word tokens and lemmatize them using
    # the part of speech assigned by pos_tag.
    final_words = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in english_stopwords and word.isalpha()
    ]
    text_final.append(str(final_words))

# Assigning the whole list is positional, so every row receives its own
# result even when rows were dropped above and the index labels have gaps
# (writing with df.loc[position, ...] would target the wrong labels).
df['text_final'] = text_final

df

Unnamed: 0,tweet_id,sentiment,content,text_final
0,1956967341,empty,"[@, tiffanylue, i, know, i, was, listenin, to,...","['tiffanylue', 'know', 'listenin', 'bad', 'hab..."
1,1956967666,sadness,"[layin, n, bed, with, a, headache, ughhhh, ......","['layin', 'n', 'bed', 'headache', 'ughhhh', 'w..."
2,1956967696,sadness,"[funeral, ceremony, ..., gloomy, friday, ...]","['funeral', 'ceremony', 'gloomy', 'friday']"
3,1956967789,enthusiasm,"[wants, to, hang, out, with, friends, soon, !]","['want', 'hang', 'friend', 'soon']"
4,1956968416,neutral,"[@, dannycastillo, we, want, to, trade, with, ...","['dannycastillo', 'want', 'trade', 'someone', ..."
...,...,...,...,...
39995,1753918954,neutral,"[@, johnlloydtaylor]",['johnlloydtaylor']
39996,1753919001,love,"[happy, mothers, day, all, my, love]","['happy', 'mother', 'day', 'love']"
39997,1753919005,love,"[happy, mother, 's, day, to, all, the, mommies...","['happy', 'mother', 'day', 'mommy', 'woman', '..."
39998,1753919043,happiness,"[@, niariley, wassup, beautiful, !, !, !, foll...","['niariley', 'wassup', 'beautiful', 'follow', ..."


In [20]:
# Split the data into training (70%) and test (30%) sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    df['text_final'], df['sentiment'], test_size=0.3
)

# Encode the target labels (sentiments) as integers.
# The encoder is fitted exactly once, on the complete label column, and then
# only applied with transform(): refitting it on the test labels could assign
# different integers to the same sentiment if the two splits do not contain
# the same set of classes.
Encoder = LabelEncoder()
Encoder.fit(df['sentiment'])
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# Turn the tweet texts into TF-IDF feature matrices.
# The vocabulary and IDF weights are learned from the training texts only, so
# no information from the test set leaks into the features.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Train_X)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

# Everything that is persisted for later use
dataset_ob = {
    'train_x': Train_X,
    'test_x': Test_X,
    'train_y': Train_Y,
    'test_y': Test_Y,
    'tfidf_vect': Tfidf_vect,
    'train_x_tfidf': Train_X_Tfidf,
    'test_x_tfidf': Test_X_Tfidf
}

In [21]:
# Serialize the prepared objects with pickle so the other notebooks can load them
dataset_path = './data/dataset.pkl'
with open(dataset_path, 'wb') as pickle_file:
    pickle.dump(dataset_ob, pickle_file)