# Dataset Preparation

Convertendo o arquivo csv em um Dataframe para preparação da análise de dados.

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [3]:
# Download nlkt packages
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/le4o/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/le4o/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/le4o/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/le4o/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
columns = ['tweet_id', 'sentiment', 'content']

# Seed para os resultados da tokenização
np.random.seed(500)

# skip_blank_lines remove as possíveis linhas em branco
data = pd.read_csv(r'data/tweet_emotions.csv', skip_blank_lines=True)

df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [5]:
def uppercase_to_lowercase(x): return str(x).lower()

In [6]:
# Removendo linhas em branco
df = df.drop(df[(df.tweet_id == 0) | (df.sentiment == "") | (df.content == "")].index)

# Convertendo txto em caixa alta para caixa baixa
df['content'] = df['content'].apply(uppercase_to_lowercase)

# Criando os tokens para o texto do tweet 
df['content'] = [word_tokenize(text) for text in df['content']]

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['content']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

df

Unnamed: 0,tweet_id,sentiment,content,text_final
0,1956967341,empty,"[@, tiffanylue, i, know, i, was, listenin, to,...","['tiffanylue', 'know', 'listenin', 'bad', 'hab..."
1,1956967666,sadness,"[layin, n, bed, with, a, headache, ughhhh, ......","['layin', 'n', 'bed', 'headache', 'ughhhh', 'w..."
2,1956967696,sadness,"[funeral, ceremony, ..., gloomy, friday, ...]","['funeral', 'ceremony', 'gloomy', 'friday']"
3,1956967789,enthusiasm,"[wants, to, hang, out, with, friends, soon, !]","['want', 'hang', 'friend', 'soon']"
4,1956968416,neutral,"[@, dannycastillo, we, want, to, trade, with, ...","['dannycastillo', 'want', 'trade', 'someone', ..."
...,...,...,...,...
39995,1753918954,neutral,"[@, johnlloydtaylor]",['johnlloydtaylor']
39996,1753919001,love,"[happy, mothers, day, all, my, love]","['happy', 'mother', 'day', 'love']"
39997,1753919005,love,"[happy, mother, 's, day, to, all, the, mommies...","['happy', 'mother', 'day', 'mommy', 'woman', '..."
39998,1753919043,happiness,"[@, niariley, wassup, beautiful, !, !, !, foll...","['niariley', 'wassup', 'beautiful', 'follow', ..."


In [13]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['text_final'], df['sentiment'], test_size=0.3)

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

print(Tfidf_vect.vocabulary_)
print(Train_X_Tfidf)

, 'brilliant': 567, 'windows': 4846, 'beard': 383, 'nurse': 3057, 'birmingham': 445, 'banquet': 350, 'bg': 431, 'blonde': 481, 'soundtrack': 4043, 'williams': 4840, 'montana': 2846, 'joey': 2287, 'silent': 3902, 'denny': 1169, 'romance': 3682, 'eve': 1461, 'twpp': 4589, 'murray': 2902, 'grocery': 1865, 'hoo': 2050, 'ram': 3529, 'lolz': 2564, 'somethin': 4014, 'corner': 969, 'campus': 658, 'coworker': 996, 'brew': 557, 'allah': 109, 'phil': 3276, 'oww': 3158, 'commence': 890, 'syrup': 4263, 'spotify': 4084, 'unlimited': 4645, 'jt': 2316, 'mtv': 2887, 'oi': 3091, 'helpful': 1987, 'plastic': 3327, 'chew': 759, 'university': 4641, 'lj': 2538, 'pint': 3310, 'guide': 1882, 'belt': 411, 'ahah': 73, 'limb': 2509, 'prevent': 3423, 'gg': 1766, 'sweaty': 4242, 'activity': 34, 'stitch': 4138, 'target': 4288, 'default': 1146, 'display': 1251, 'hung': 2099, 'looong': 2574, 'dresser': 1314, 'sumthin': 4202, 'whn': 4820, 'total': 4472, 'turkey': 4547, 'yard': 4942, 'yawn': 4943, 'monster': 2845, 'ten'

In [14]:
def get_values():
    return Train_X_Tfidf, Tfidf_vect, Test_X, Train_Y, Test_Y