In [None]:
#Instalar librerias
!pip install vaderSentiment
!pip install emosent-py
!pip install emoji
!pip install textblob

In [None]:
#Nltk extras
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

In [40]:
import numpy as np 
import pandas as pd 
import re
import nltk 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.tree import DecisionTreeClassifier

import emoji

#Lexicones
from emosent import get_emoji_sentiment_rank #Emotion
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
import textblob as txb
from nltk import sent_tokenize, word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
%matplotlib inline

In [41]:
##Vamos a utlizar un dataset de tweets sobre aerolíneas que fueron manualmente anotados con su sentimiento/polaridad
#Y Concatenar con los 1500 tweets que tenemos
tweets = pd.read_csv("Data/Tweets.csv")[['text','airline_sentiment']]
tweets=tweets.rename(columns={'airline_sentiment':'sentiment'}) 
for i in range(3):
    path='Data/Tweets{}.csv'.format(i+1)
    tweets=pd.concat([tweets,pd.read_csv(path)[['text','sentiment']]],ignore_index=True)

    
#Extraer 10k de tweets, textos para su posterior predicción
tweets10k=pd.read_csv('Data/Tweets23K.csv')['text']

In [42]:
#Variables Globales Lexicones

#Init Afinn
fileNameAFINN="Data/AFINN/AFINN-111.txt" 
afinn= dict(map(lambda ws: (ws[0], int(ws[1])), [ 
            ws.strip().split('\t') for ws in open(fileNameAFINN) ]))
#End Afinn
analyser = SentimentIntensityAnalyzer() #Cargar lexicon

In [43]:

#Retorna ScoreP y ScoreN
def sentiwordnet(text):
    wnl = nltk.WordNetLemmatizer()
    stoken=nltk.word_tokenize(text)
    tag=nltk.pos_tag(stoken)
    scoreP=0
    scoreN=0
    for t in tag:
        newtag=''
        lemmatized=wnl.lemmatize(t[0])
        dt=[]
        if t[1].startswith('NN'):
            newtag='n'
        elif t[1].startswith('JJ'):
            newtag='a'
        elif t[1].startswith('V'):
            newtag='v'
        elif t[1].startswith('R'):
            newtag='r'
        if(newtag!=''):
            synset=list(swn.senti_synsets(lemmatized,newtag))
            #Promedio de todos los posibles sentimientos
            if(len(synset)>0):   
                scorep=0
                scoren=0
                for dt in synset:
                    scorep+=dt.pos_score()
                    scoren+=dt.neg_score() 
                scoreP+=scorep/len(synset)#Suma de promedios positivos 
                scoreN+=scoren/len(synset)#Suma de promedios negativos
    return scoreP,scoreN


def extract_emojisArr(str):
    return ' '.join(c for c in str if c in emoji.UNICODE_EMOJI).split(" ")
    
def emojiLexicon(txt):
    vec=extract_emojisArr(txt)
    i=0
    sum=0
    for value in vec:
        sum+=get_emoji_sentiment_rank(value)['sentiment_score']
        i+=1
    if(sum==0):
        return 0.0
    return sum/i

def sentiment_analyzer_scores(sentence):
    global analyser
    score = analyser.polarity_scores(sentence)
    return score['neg'],score['pos'],score['neu'] 
    
#Retorna arreglo de lexicones, Affin, SentiworNet, Textblob, vanderSentiment
def lexicones(arrTextProccesing):
    global afinn    
    
    array=[]
    for txt in arrTextProccesing:
        scoreAfinn=sum(map(lambda word: afinn.get(word, 0), txt.split()))
        sentScoreP,sentScoreN=sentiwordnet(txt) 
        scoreTxblob=txb.TextBlob(txt).sentiment.polarity
        sasN,sasP,sasNeu=sentiment_analyzer_scores(txt)
        array.append([scoreAfinn,sentScoreP,sentScoreN,scoreTxblob,sasN,sasP,sasNeu,emojiLexicon(txt)])
    
    return np.array(array)
   

In [44]:
def extract_emojis(str):
    return ' '.join(c for c in str if c in emoji.UNICODE_EMOJI)

def processing_text(text,removeEmoticon=False,removeUserHastag=False):     
    #Pasarlo a minuscula
    newT=text.lower()
    #Eliminar links
    newT=re.sub("(\w+:\/\/\S+)",' ',newT)
    #Extraer emojis
    emojis=extract_emojis(newT)    
    if(removeUserHastag):
        newT=re.sub("(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)"," ",newT)
    
    #Eliminar puntos, comas, caracteres especiales ... 
    newT= re.sub(r'\W', ' ', newT)     
    if not removeEmoticon:
        newT=newT+" "+emojis #Concateno los Emojis
    return newT

In [75]:
#Imprimirá los porcentajes de la predicción de  los 10k  de tweets
def predictTweets(varP,features_prueba):
    print("Ahora con los 10K de Tweets:")
    predictions=varP.predict(features_prueba)
    data=pd.DataFrame(data={'sentiment':predictions})
    data['frecuencia']=1 
    va=data.groupby(['sentiment']).sum()   
    total=sum(va["frecuencia"])  
    indexs=["negative","neutral","positive"]
    for i  in indexs:
        try:
            print("{:8}: {:.2f}% equivale a {:4} tweets".format(i, 100*va['frecuencia'][i]/total,va['frecuencia'][i] ))
        except:
            print("{:8}: {:.2f}% equivale a {:4} tweets".format(i, 0,0 ))
            pass


def naiveBayes(X_train,X_test,y_train,y_test):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train) 
    predictions = gnb.predict(X_test) 
    print("Accurancy:",accuracy_score(y_test, predictions))    
    return gnb

def arboleDeDesicion(X_train,X_test,y_train,y_test):
    cart = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
    cart.fit(X_train, y_train)
    predictions = cart.predict(X_test)    
    print("Accurancy:",accuracy_score(y_test, predictions))
    return cart

def randomArbol(X_train,X_test,y_train,y_test):
    rfc = RandomForestClassifier(n_estimators=200, random_state=0,n_jobs=-1)
    rfc.fit(X_train, y_train)
    predictions = rfc.predict(X_test)
    print("Accurancy:",accuracy_score(y_test, predictions))
    return rfc
    
def entrenamientoyPrueba(features,labels,features_prueba):    
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)
    
    print("\n","Naive Bayes","\n")
    nB=naiveBayes(X_train, X_test, y_train, y_test )
    predictTweets(nB,features_prueba)
    
    print("\n","Arbol de desición","\n")
    aD=arboleDeDesicion(X_train,X_test,y_train,y_test )
    predictTweets(aD,features_prueba)
    
    print("\n","Random Arbol","\n")
    rA=randomArbol(X_train, X_test, y_train, y_test )
    predictTweets(rA,features_prueba)

In [46]:
#
labels = tweets['sentiment'].values

texto_para_procesar=tweets['text'].values #Entrenamiento y prueba
texto_predecir=tweets10k.values[:10000]   #Para predecir los 10K de Tweets

#Texto con emojis y hastag-username
texto_procesado=[]  
texto_predecir_procesado=[]  

#Sin emojis
texto_procesado_sin_emoji=[] 
texto_predecir_sin_emoji=[]   

#Sin hastag-username
texto_procesado_sin_has_user=[]  
texto_predecir_sin_has_user=[]   

for txt in texto_para_procesar:
    texto_procesado.append(processing_text(txt))
    texto_procesado_sin_emoji.append(processing_text(txt,removeEmoticon=True))
    texto_procesado_sin_has_user.append(processing_text(txt,removeUserHastag=True))

for txt in texto_predecir:
    texto_predecir_procesado.append(processing_text(txt))
    texto_predecir_sin_emoji.append(processing_text(txt,removeEmoticon=True))
    texto_predecir_sin_has_user.append(processing_text(txt,removeUserHastag=True))  

In [47]:
#Lexicones con emoji y hatag-username
lexicon_procesado=lexicones(texto_procesado)
lexicon_predecir_procesado=lexicones(texto_predecir_procesado)

In [48]:
#Lexicones sin emoji
lexicon_procesado_sin_emoji=lexicones(texto_procesado_sin_emoji)
lexicon_predecir_sin_emoji=lexicones(texto_predecir_sin_emoji)

In [49]:
#Lexicones sin hastag-username
lexicon_procesado_sin_has_user=lexicones(texto_procesado_sin_has_user)
lexicon_predecir_sin_has_user=lexicones(texto_predecir_sin_has_user)

<h2>1. Metodo: Vector TfIdf </h2>
<h4># Sin stop words, con emoji y con hastag-username</h4>
<h4># Con stop words, con emoji y con hastag-username</h4>
<h4># Sin stop words, sin emoji y con hastag-username</h4>
<h4># Sin stop words, con emoji y sin hastag-username</h4>

In [76]:
#Sacar los featurers con las posibles combinaciones
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
vectorizer_con_stopwords=TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)

features = vectorizer.fit_transform(texto_procesado).toarray()
features_predecir=vectorizer.transform(texto_predecir_procesado).toarray()

features_con_stop_words=vectorizer_con_stopwords.fit_transform(texto_procesado).toarray()
features_predecir_con_stop_words=vectorizer_con_stopwords.transform(texto_predecir_procesado).toarray()

features_sin_emoji=vectorizer.fit_transform(texto_procesado_sin_emoji).toarray()
features_predecir_sin_emoji=vectorizer.transform(texto_predecir_sin_emoji).toarray()

features_procesado_sin_has_user=vectorizer.fit_transform(texto_procesado_sin_has_user).toarray()
features_predecir_sin_has_user=vectorizer.transform(texto_predecir_sin_has_user).toarray()

In [77]:
print("\n",'\033[1m' +"Sin stop words, con emoji y con hastag-username" + '\033[0m' )
entrenamientoyPrueba(features,labels,features_predecir)

print("\n",'\033[1m' +"Con stop words, con emoji y con hastag-username" + '\033[0m')
entrenamientoyPrueba(features_con_stop_words,labels,features_predecir_con_stop_words)

print("\n",'\033[1m' +"Sin stop words, sin emoji y con hastag-username"+ '\033[0m')
entrenamientoyPrueba(features_sin_emoji,labels,features_predecir_sin_emoji)

print("\n",'\033[1m' +"Sin stop words, con emoji y sin hastag-username"+ '\033[0m')
entrenamientoyPrueba(features_procesado_sin_has_user,labels,features_predecir_sin_has_user)



 [1mSin stop words, con emoji y con hastag-username[0m

 Naive Bayes 

Accurancy: 0.36957868649318465
Ahora con los 10K de Tweets:
negative: 8.26% equivale a  826 tweets
neutral : 24.84% equivale a 2484 tweets
positive: 66.90% equivale a 6690 tweets

 Arbol de desición 

Accurancy: 0.6285625774473358
Ahora con los 10K de Tweets:
negative: 98.03% equivale a 9803 tweets
neutral : 0.00% equivale a    0 tweets
positive: 1.97% equivale a  197 tweets

 Random Arbol 

Accurancy: 0.7224287484510533
Ahora con los 10K de Tweets:
negative: 13.12% equivale a 1312 tweets
neutral : 81.91% equivale a 8191 tweets
positive: 4.97% equivale a  497 tweets

 [1mCon stop words, con emoji y con hastag-username[0m

 Naive Bayes 

Accurancy: 0.37267657992565056
Ahora con los 10K de Tweets:
negative: 9.98% equivale a  998 tweets
neutral : 27.05% equivale a 2705 tweets
positive: 62.97% equivale a 6297 tweets

 Arbol de desición 

Accurancy: 0.6335192069392813
Ahora con los 10K de Tweets:
negative: 98.89% eq

In [28]:
texto_para_procesar = tweets['text'].values #selecciona los textos
labels = tweets['sentiment'].values

texto_procesado = [] 


for sentence in range(0, len(texto_para_procesar)):
    procesado = processing_text(texto_para_procesar[sentence]).lower()
    texto_procesado.append(procesado) 

print(texto_para_procesar[1:5])
print(texto_procesado[1:5])

["@VirginAmerica plus you've added commercials to the experience... tacky."
 "@VirginAmerica I didn't today... Must mean I need to take another trip!"
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse'
 "@VirginAmerica and it's a really big bad thing about it"]
[' virginamerica plus you ve added commercials to the experience    tacky ', ' virginamerica didn today    must mean need to take another trip ', ' virginamerica it really aggressive to blast obnoxious  entertainment  in your guests  faces  amp  they have little recourse', ' virginamerica and it a really big bad thing about it']


In [30]:
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
features = vectorizer.fit_transform(texto_procesado).toarray()

In [31]:
lexicones=lexicones(texto_procesado) 

In [32]:
#Probar concatenación con lexicones
concat=np.concatenate((features,lexicones),axis=1)

In [33]:
labels

array(['neutral', 'positive', 'neutral', ..., 'positive', 'negative',
       'negative'], dtype=object)

In [38]:
#Mostrar con concatenación
entrenamientoyPrueba(concat,labels)


 Naive Bayes 

[[0.    0.    0.    ... 0.    0.176 0.824]
 [0.    0.    0.    ... 0.    0.208 0.792]
 [0.    0.    0.    ... 0.092 0.213 0.695]
 ...
 [0.    0.    0.    ... 0.131 0.126 0.743]
 [0.    0.    0.    ... 0.27  0.    0.73 ]
 [0.    0.    0.    ... 0.385 0.    0.615]]
Accurancy: 0.4225526641883519 
 Ahora con los 10K de Tweets:


ValueError: could not convert string to float: 'being in Dominican Republic during quarantine &gt;&gt;&gt; https://t.co/NpcOCDMkSR'

In [34]:
#Mostrar solo con lexicones
print(lexicones)
entrenamientoyPrueba(lexicones,labels)

[[ 0.          0.01136364  0.         ...  0.          1.
   0.        ]
 [ 0.          0.          0.         ...  0.          1.
   0.        ]
 [ 0.          0.38690476  0.41369048 ...  0.          1.
   0.        ]
 ...
 [ 2.          0.23501499  0.2297286  ...  0.225       0.775
   0.        ]
 [-6.          0.41493056  0.24305556 ...  0.          0.717
   0.        ]
 [ 0.          0.3030303   0.16504329 ...  0.          1.
   0.        ]]

 Naive Bayes 



TypeError: '<' not supported between instances of 'float' and 'str'

In [19]:
text="😁 Hola mundo cómo vás . dfkj as 🤐🙃🌤️💕" 

In [20]:
import emoji

def extract_emojis(str):
    return ' '.join(c for c in str if c in emoji.UNICODE_EMOJI)

In [21]:
emojis=extract_emojis(text)
emojis


'😁 🤐 🙃 🌤 💕'

In [2]:
 import sentlex

ModuleNotFoundError: No module named 'sentlex'

In [80]:
dataframe=pd.read_csv('Tweets500P1.csv')
tweets = pd.read_csv("Tweets.csv")

ParserError: Error tokenizing data. C error: Expected 6 fields in line 501, saw 12


In [98]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    
    print("El score",score)
    print("{:-<40} {}".format(sentence, str(score)))

In [99]:
sentiment_analyzer_scores("i love me but i hate me")

El score {'neg': 0.399, 'neu': 0.395, 'pos': 0.206, 'compound': -0.5346}
i love me but i hate me----------------- {'neg': 0.399, 'neu': 0.395, 'pos': 0.206, 'compound': -0.5346}


In [87]:
sentiment_analyzer_scores("i hate me but i love me  ")

i hate me but i love me  --------------- {'neg': 0.179, 'neu': 0.38, 'pos': 0.441, 'compound': 0.6652}


In [104]:
analyser.polarity_scores("I hate you")

{'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}

In [37]:
print("<h1>hola</h1>")

<h1>hola</h1>


[1mHello


In [78]:
!pip3 install emosent-py

Collecting emosent-py
  Downloading emosent-py-0.1.6.tar.gz (28 kB)
Building wheels for collected packages: emosent-py
  Building wheel for emosent-py (setup.py): started
  Building wheel for emosent-py (setup.py): finished with status 'done'
  Created wheel for emosent-py: filename=emosent_py-0.1.6-py3-none-any.whl size=28506 sha256=f2d2c54fcca4b37d272c03768fb3cae0ce77bce2f4b78f00e4e640c0c5266774
  Stored in directory: c:\users\asus\appdata\local\pip\cache\wheels\32\37\bd\b4e67490f36c4beb85a1047d6cd13a356ffecbfa854eaf4688
Successfully built emosent-py
Installing collected packages: emosent-py
Successfully installed emosent-py-0.1.6
