In [142]:
import numpy as np 
import pandas as pd 
import re
import nltk 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.tree import DecisionTreeClassifier

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

#Lexicones
import textblob as txb
from nltk import sent_tokenize, word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
%matplotlib inline

In [147]:
# Fetch the NLTK resources this notebook relies on. Packages that are already
# installed are reported as up-to-date.
nltk.download('stopwords')                    # stop-word list passed to TfidfVectorizer
nltk.download('punkt')                        # tokenizer models for word_tokenize
nltk.download('averaged_perceptron_tagger')   # model behind pos_tag
nltk.download('wordnet')                      # dictionary used by WordNetLemmatizer
nltk.download('sentiwordnet')                 # polarity scores per synset

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [146]:
# Airline tweets that were manually annotated with their sentiment/polarity.
# The label column is renamed so it matches the extra annotated files below.
tweets_aerolineas = (
    pd.read_csv("Data/Tweets.csv")[['text', 'airline_sentiment']]
    .rename(columns={'airline_sentiment': 'sentiment'})
)

# Our own annotated tweets (Data/Tweets1.csv .. Data/Tweets3.csv). The frames are
# collected first and concatenated once, after the airline tweets, in file order.
tweets_extra = [
    pd.read_csv('Data/Tweets{}.csv'.format(numero))[['text', 'sentiment']]
    for numero in (1, 2, 3)
]
tweets = pd.concat([tweets_aerolineas] + tweets_extra, ignore_index=True)

# First 10,000 tweet texts of the larger file, kept for later prediction.
tweets10k = pd.read_csv('Data/Tweets23K.csv')['text'].values[:10000]

In [136]:
analyser = SentimentIntensityAnalyzer()  # Shared VADER analyser (loads its lexicon); read by sentiment_analyzer_scores
def sentiwordnet(text):
    """Score ``text`` with SentiWordNet and return ``(scoreP, scoreN)``.

    The text is tokenized and POS-tagged. Tokens tagged as noun, adjective,
    verb or adverb are lemmatized and looked up in SentiWordNet; every other
    token is ignored. For each token that has at least one sense, the positive
    and negative scores are averaged over all of its senses, and those
    per-token averages are summed over the whole text.

    Returns:
        tuple: (sum of positive averages, sum of negative averages).
    """
    # Tag prefix as produced by nltk.pos_tag -> WordNet part-of-speech letter.
    pos_by_prefix = (('NN', 'n'), ('JJ', 'a'), ('V', 'v'), ('R', 'r'))

    lemmatizer = nltk.WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    scoreP = 0
    scoreN = 0
    for token, tag in tagged_tokens:
        wn_pos = next(
            (pos for prefix, pos in pos_by_prefix if tag.startswith(prefix)),
            '',
        )
        if wn_pos == '':
            continue  # not a content word we can look up

        senses = list(swn.senti_synsets(lemmatizer.lemmatize(token), wn_pos))
        if not senses:
            continue  # word unknown to SentiWordNet for this part of speech

        # Average over every possible sense of the word, then accumulate.
        scoreP += sum(sense.pos_score() for sense in senses) / len(senses)
        scoreN += sum(sense.neg_score() for sense in senses) / len(senses)
    return scoreP, scoreN


def sentiment_analyzer_scores(sentence):
    """Return the analyser's ``(neg, pos, neu)`` scores for ``sentence``.

    Uses the module-level ``analyser``; the ``compound`` entry of its result
    is not returned.
    """
    polarity = analyser.polarity_scores(sentence)
    return tuple(polarity[clave] for clave in ('neg', 'pos', 'neu'))
    

def lexicones(arrTextProccesing):
    """Build one row of lexicon-based sentiment scores per input text.

    Column order of the returned array:
        0. AFINN: sum of the AFINN value of every whitespace-separated word
        1. SentiWordNet positive score (from ``sentiwordnet``)
        2. SentiWordNet negative score (from ``sentiwordnet``)
        3. TextBlob polarity
        4-6. analyser ``neg``, ``pos``, ``neu`` (from ``sentiment_analyzer_scores``)

    Args:
        arrTextProccesing: iterable of already pre-processed text strings.

    Returns:
        numpy.ndarray of shape ``(number of texts, 7)``.

    NOTE(review): a later cell rebinds the name ``lexicones`` to this
    function's result, which hides the function from then on.
    """
    # Load the AFINN word list: one "word<TAB>score" entry per line.
    fileNameAFINN = "Data/AFINN/AFINN-111.txt"
    afinn = {}
    with open(fileNameAFINN) as afinn_file:  # closed even if parsing fails
        for line in afinn_file:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line: nothing to score
            afinn[fields[0]] = int(fields[1])

    rows = []
    for txt in arrTextProccesing:
        scoreAfinn = sum(afinn.get(word, 0) for word in txt.split())
        sentScoreP, sentScoreN = sentiwordnet(txt)
        scoreTxblob = txb.TextBlob(txt).sentiment.polarity
        sasN, sasP, sasNeu = sentiment_analyzer_scores(txt)
        rows.append([scoreAfinn, sentScoreP, sentScoreN, scoreTxblob, sasN, sasP, sasNeu])

    # reshape keeps the result 2-D (0 x 7) when no texts were given, so it can
    # still be concatenated column-wise with other feature matrices.
    return np.array(rows).reshape(-1, 7)
   

In [131]:
def processing_text(texto):
    """Clean a raw text for feature extraction.

    Steps, in order:
      1. Convert the input to ``str`` and replace every non-word character
         (punctuation, symbols, any whitespace) with a single space.
      2. Drop a leading ``b`` followed by whitespace (what remains of a
         ``b'...'`` bytes-literal prefix after step 1).
      3. Replace single letters surrounded by whitespace with one space.
      4. Replace a single letter at the very start of the text with one space.

    Case is NOT changed here; callers that need lowercase text call
    ``.lower()`` on the result. Runs of spaces are not collapsed.

    Args:
        texto: the text to clean; non-string values are converted with ``str``.

    Returns:
        str: the cleaned text.
    """
    processed_feature = str(texto)
    # Replace all the special characters with spaces
    processed_feature = re.sub(r'\W', ' ', processed_feature)
    # Remove the prefixed 'b'. Done before the single-letter rules so that they
    # do not turn the prefix into a leading space first.
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    # Remove all single characters in the middle of the text
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # Remove a single character from the start. The pattern used to be
    # r'\^[a-zA-Z]\s+', which looks for a literal '^' and so never matched.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)
    return processed_feature

In [132]:

def _evaluar_clasificador(clasificador, X_train, X_test, y_train, y_test):
    """Fit ``clasificador`` on the training split and report on the test split.

    Prints, in this order, the confusion matrix, the classification report and
    the accuracy of the predictions for ``X_test``.

    Returns:
        The same classifier object, after fitting.
    """
    clasificador.fit(X_train, y_train)
    predictions = clasificador.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(accuracy_score(y_test, predictions))
    return clasificador


def naiveBayes(X_train, X_test, y_train, y_test):
    """Train and report a Gaussian Naive Bayes model; return the fitted model."""
    return _evaluar_clasificador(GaussianNB(), X_train, X_test, y_train, y_test)


def arboleDeDesicion(X_train, X_test, y_train, y_test):
    """Train and report an entropy-based decision tree; return the fitted model."""
    cart = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
    return _evaluar_clasificador(cart, X_train, X_test, y_train, y_test)


def randomArbol(X_train, X_test, y_train, y_test):
    """Train and report a 200-tree random forest; return the fitted model."""
    rfc = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
    return _evaluar_clasificador(rfc, X_train, X_test, y_train, y_test)


def entrenamientoyPrueba(features, labels):
    """Split the data 80/20 (fixed seed) and evaluate the three classifiers.

    Args:
        features: feature matrix, one row per sample.
        labels: target label of each row of ``features``.

    Prints a header followed by the evaluation report of each model. Returns
    nothing.
    """
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)
    print("\n","Naive Bayes","\n")
    naiveBayes(X_train, X_test, y_train, y_test)
    print("\n","Arbol de desición","\n")
    arboleDeDesicion(X_train, X_test, y_train, y_test)
    print("\n","Random Arbol","\n")
    randomArbol(X_train, X_test, y_train, y_test)

In [133]:
# Raw tweet texts and their sentiment labels, taken from the loaded frame.
texto_para_procesar = tweets['text'].values
labels = tweets['sentiment'].values

# Clean every tweet and lowercase the cleaned text.
texto_procesado = [processing_text(tweet).lower() for tweet in texto_para_procesar]

# Show a few raw texts next to their processed versions as a sanity check.
print(texto_para_procesar[1:5])
print(texto_procesado[1:5])

["@VirginAmerica plus you've added commercials to the experience... tacky."
 "@VirginAmerica I didn't today... Must mean I need to take another trip!"
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse'
 "@VirginAmerica and it's a really big bad thing about it"]
[' virginamerica plus you ve added commercials to the experience    tacky ', ' virginamerica didn today    must mean need to take another trip ', ' virginamerica it really aggressive to blast obnoxious  entertainment  in your guests  faces  amp  they have little recourse', ' virginamerica and it a really big bad thing about it']


In [134]:
# TF-IDF bag of words over the processed tweets: at most 2500 terms, each seen in
# at least 7 tweets and in no more than 80% of them, with English stop words removed.
english_stopwords = stopwords.words('english')
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=english_stopwords,
)
tfidf_matrix = vectorizer.fit_transform(texto_procesado)
features = tfidf_matrix.toarray()  # dense array, needed for np.concatenate below

In [137]:
# `lexicones` is first the scoring function and, after this cell, the score matrix
# that later cells read. Keep a handle to the function the first time through so
# that re-running this cell does not try to call the matrix.
# NOTE(review): renaming the result (and its uses in later cells) would be cleaner.
if callable(lexicones):
    _calcular_lexicones = lexicones
lexicones = _calcular_lexicones(texto_procesado)

In [138]:
# Try the TF-IDF columns side by side with the lexicon-score columns (one row per tweet).
concat = np.hstack((features, lexicones))

In [139]:
# Train and evaluate the classifiers on the TF-IDF features concatenated with the lexicon scores.
entrenamientoyPrueba(concat,labels)


 Naive Bayes 

[[692 465 772]
 [ 84 262 437]
 [ 36  70 410]]
              precision    recall  f1-score   support

    negative       0.85      0.36      0.50      1929
     neutral       0.33      0.33      0.33       783
    positive       0.25      0.79      0.38       516

    accuracy                           0.42      3228
   macro avg       0.48      0.50      0.41      3228
weighted avg       0.63      0.42      0.44      3228

0.4225526641883519

 Arbol de desición 

[[1673  188   68]
 [ 446  261   76]
 [ 229   69  218]]
              precision    recall  f1-score   support

    negative       0.71      0.87      0.78      1929
     neutral       0.50      0.33      0.40       783
    positive       0.60      0.42      0.50       516

    accuracy                           0.67      3228
   macro avg       0.61      0.54      0.56      3228
weighted avg       0.64      0.67      0.64      3228

0.6666666666666666

 Random Arbol 

[[1768  112   49]
 [ 358  361   64]
 [ 150  

In [140]:
# Train and evaluate the classifiers using only the lexicon scores (no TF-IDF columns).
print(lexicones)
entrenamientoyPrueba(lexicones,labels)

[[ 0.          0.01136364  0.         ...  0.          0.
   1.        ]
 [ 0.          0.          0.         ...  0.          0.
   1.        ]
 [ 0.          0.38690476  0.41369048 ...  0.          0.
   1.        ]
 ...
 [ 2.          0.23501499  0.2297286  ...  0.          0.225
   0.775     ]
 [-6.          0.41493056  0.24305556 ...  0.283       0.
   0.717     ]
 [ 0.          0.3030303   0.16504329 ...  0.          0.
   1.        ]]

 Naive Bayes 

[[1223  544  162]
 [ 181  479  123]
 [  73  139  304]]
              precision    recall  f1-score   support

    negative       0.83      0.63      0.72      1929
     neutral       0.41      0.61      0.49       783
    positive       0.52      0.59      0.55       516

    accuracy                           0.62      3228
   macro avg       0.59      0.61      0.59      3228
weighted avg       0.68      0.62      0.64      3228

0.6214374225526642

 Arbol de desición 

[[1700  161   68]
 [ 459  248   76]
 [ 237   61  218]]
     

In [6]:
# Sample text mixing words and emoji, used by the emoji-extraction cells below.
text="😁 Hola mundo cómo vás . dfkj as 🤐🙃🌤️💕" 

In [7]:
import emoji

def extract_emojis(str):
    """Return the emoji found in ``str``, in order, joined by single spaces.

    Works across versions of the ``emoji`` package:
      * when ``emoji.emoji_list`` exists it is used, so an emoji made of
        several code points is returned as one item;
      * otherwise the text is scanned character by character against
        ``emoji.UNICODE_EMOJI``, using its ``'en'`` sub-table when the table
        is keyed by language (testing characters against the language keys
        themselves never matches).

    Args:
        str: the text to scan. The name shadows the builtin and is kept only
            so existing keyword calls keep working.

    Returns:
        A string with the emoji separated by spaces; empty if there are none.
    """
    source_text = str
    if hasattr(emoji, 'emoji_list'):
        found = [match['emoji'] for match in emoji.emoji_list(source_text)]
    else:
        table = emoji.UNICODE_EMOJI
        table = table.get('en', table)  # language-keyed table -> English emoji table
        found = [char for char in source_text if char in table]
    return ' '.join(found)

In [1]:
# Pull the emoji out before cleaning, because processing_text replaces every
# non-word character (emoji included) with a space.
emojis = extract_emojis(text)
texPro = processing_text(text)
print(emojis)
print(texPro)
# Re-attach the emoji after an explicit space so the first one cannot fuse with
# the last word when the cleaned text does not end in whitespace.
texPro = ' '.join((texPro, emojis))
nltk.word_tokenize(texPro)


NameError: name 'extract_emojis' is not defined

In [2]:
 import sentlex

ModuleNotFoundError: No module named 'sentlex'

In [80]:
# NOTE(review): scratch cell whose recorded run ended in a ParserError. Both paths
# omit the "Data/" prefix used by the loading cell, and the second line rebinds
# `tweets` to the CSV with all of its columns. Confirm this cell is still needed.
dataframe=pd.read_csv('Tweets500P1.csv')
tweets = pd.read_csv("Tweets.csv")

ParserError: Error tokenizing data. C error: Expected 6 fields in line 501, saw 12


In [98]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()  # NOTE(review): repeats the earlier `analyser` assignment; rebinds the same name
def sentiment_analyzer_scores(sentence):
    """Print the analyser's polarity scores for ``sentence``; return nothing.

    Two lines are printed: the raw score dict, then the sentence padded with
    dashes to 40 characters followed by the same dict.

    NOTE(review): an earlier cell defines a function with this same name that
    returns (neg, pos, neu). Running this cell replaces it with a version
    that returns None.
    """
    polarity = analyser.polarity_scores(sentence)
    aligned_row = "{:-<40} {}"

    print("El score", polarity)
    print(aligned_row.format(sentence, str(polarity)))

In [99]:
# Demo: positive clause first, negative clause after "but".
sentiment_analyzer_scores("i love me but i hate me")

El score {'neg': 0.399, 'neu': 0.395, 'pos': 0.206, 'compound': -0.5346}
i love me but i hate me----------------- {'neg': 0.399, 'neu': 0.395, 'pos': 0.206, 'compound': -0.5346}


In [87]:
# Demo: same words with the clauses swapped (negative first, positive after "but").
sentiment_analyzer_scores("i hate me but i love me  ")

i hate me but i love me  --------------- {'neg': 0.179, 'neu': 0.38, 'pos': 0.441, 'compound': 0.6652}


In [104]:
# Call the analyser directly; the returned score dict is the cell's output.
analyser.polarity_scores("I hate you")

{'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}