In [1]:
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

import re
from string import punctuation

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Ler o arquivo com pandas
DF_COL = ["sentimento", "ids", "data", "flag", "usuario", "texto"]
DF_ENCODE = 'ISO-8859-1'
pd.set_option('max_colwidth', 300)
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding=DF_ENCODE, names=DF_COL)
# Deixar somente as colunas que interessam
df = df[['sentimento','texto']]
df['sentimento'] = df['sentimento'].replace(4,1)
# Levar os dados do df para listas
texto, sentimentos = list(df['texto']), list(df['sentimento'])
df


Unnamed: 0,sentimento,texto
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...
1599995,1,Just woke up. Having no school is the best feeling ever
1599996,1,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,1,Are you ready for your MoJo Makeover? Ask me for details
1599998,1,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


In [3]:
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat', '<3': 'heart', '=D': 'smile', ':D': 'smile', 'XD': 'smile', 'xd': 'sad'}
          

In [4]:
def pre_processamento(tweetsdata):
    # Instâncias:
    lem = WordNetLemmatizer()
    tokenizar = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    # Padrões de expressões regulares:
    padrao_url = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    padrao_num = r"[0-9]"
    preprocess_tweet = []
    stoplist = stopwords.words('english')
    for tweet in tweetsdata:
        # Dar significado aos emojis:
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Usando os padrões:
        tweet = re.sub(padrao_url, '', tweet)  # Retirar URLS
        tweet = re.sub(padrao_num, '', tweet)  # Retirar números
        # Tokenizar
        tweet = tokenizar.tokenize(tweet)
        # Retirar Stopwords
        tweet = [p for p in tweet if not p in stoplist]
        # Retirar símbolos e pontuações
        tweet = [p for p in tweet if not p in punctuation]
        # Lematização
        tweet = [lem.lemmatize(p) for p in tweet]
        # Adicionar tweet pré-processado
        preprocess_tweet.append(' '.join(tweet))
    return preprocess_tweet

tweets_preprocess = pre_processamento(texto)


In [5]:
# Dividindo a base de dados entre treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(tweets_preprocess, sentimentos, random_state=0, test_size=0.2)  #Princípio de Pareto 80/20
# Instanciando vetorizador
vetorizador =  TfidfVectorizer(ngram_range=(1,2), min_df=3)
# Vetorizando os tweets divididos para treino e ajustando-os no vetorizador (formar vocabulário)
x_treino = vetorizador.fit_transform(x_treino)
# Vetorizando os tweets divididos para teste
x_teste = vetorizador.transform(x_teste)


In [6]:
lista = vetorizador.get_feature_names_out()
len(lista)


447157

In [7]:
# Instanciando modelo
modelo = LogisticRegression(max_iter=600)
# Ajustando os tweetes de treino e suas classificações para treinar o modelo
modelo.fit(x_treino, y_treino)


In [8]:
# Testando a precisão média do modelo utilizando os tweets divididos para testes
y_predict = modelo.predict(x_teste)
print(classification_report(y_teste, y_predict))


              precision    recall  f1-score   support

           0       0.81      0.78      0.79    159815
           1       0.79      0.81      0.80    160185

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000



In [27]:
def predicao(novos):
    # Pré-processar -> vetorizar -> predizer
    novos_pre = pre_processamento(novos)
    novos_vet = vetorizador.transform(novos_pre)
    novo_sentimento = modelo.predict(novos_vet)
    # Tabela com os tweets originais, pré-processados e a predição do algoritmo
    novo_df = pd.DataFrame({"Tweets Originais":novos, "Tweets Pré-processados": novos_pre, "Previsão do sentimento": novo_sentimento})
    novo_df = novo_df.replace({1: "Positivo", 0: "Negativo"})
    return novo_df


# Lista de novos tweets aleatórios retirados de outrod dataset -> adicionar testes aqui:
lista_tweets = ["Happy Mothers day to all you Mums out there :D", "2am feedings for the baby are fun when he is all smiles and coos,fun", "Went to sleep and there is a power cut in Noida  Power back up not working too", "WOW, i AM REALLY MiSSiN THE FAM(iLY) TODAY. BADDD. :(", "i`ve been sick for the past few days  and thus, my hair looks wierd.  if i didnt have a hat on it would look... http://tinyurl.com/mnf4kwsick"]
df_tweets = predicao(lista_tweets)
df_tweets.head()


Unnamed: 0,Tweets Originais,Tweets Pré-processados,Previsão do sentimento
0,Happy Mothers day to all you Mums out there :D,happy mother day mum emojismile,Positivo
1,"2am feedings for the baby are fun when he is all smiles and coos,fun",2am feeding baby fun smile coo fun,Positivo
2,Went to sleep and there is a power cut in Noida Power back up not working too,went sleep power cut noida power back working,Negativo
3,"WOW, i AM REALLY MiSSiN THE FAM(iLY) TODAY. BADDD. :(",wow really missin fam ily today baddd emojisad,Negativo
4,"i`ve been sick for the past few days and thus, my hair looks wierd. if i didnt have a hat on it would look... http://tinyurl.com/mnf4kwsick",sick past day thus hair look wierd didnt hat would look ...,Negativo
