In [4]:
import numpy as np 
import pandas as pd 
import re
import nltk 
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.tree import DecisionTreeClassifier
#from texblob import TextBlob
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

%matplotlib inline

In [147]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [13]:
##Vamos a utlizar un dataset de tweets sobre aerolíneas que fueron manualmente anotados con su sentimiento/polaridad
tweets = pd.read_csv("Tweets.csv")

In [19]:
#Retorna ScoreP y ScoreN
def sentiwordnet(text):
    wnl = nltk.WordNetLemmatizer()
    stoken=nltk.word_tokenize(text)
    tag=nltk.pos_tag(stoken)
    scoreP=0
    scoreN=0
    for t in tag:
        newtag=''
        lemmatized=wnl.lemmatize(t[0])
        dt=[]
        if t[1].startswith('NN'):
            newtag='n'
        elif t[1].startswith('JJ'):
            newtag='a'
        elif t[1].startswith('V'):
            newtag='v'
        elif t[1].startswith('R'):
            newtag='r'
        if(newtag!=''):
            synset=list(swn.senti_synsets(lemmatized,newtag))
            #Promedio de todos los posibles sentimientos
            if(len(synset)>0):   
                scorep=0
                scoren=0
                for dt in (list(swn.senti_synsets(lemmatized,newtag))):
                    scorep+=dt.pos_score()
                    scoren+=dt.neg_score()
                    break;
                scoreP+=scorep/len(synset)
                scoreN+=scoren/len(synset)
    return scoreP,scoreN
    

#Uso de dos lexicones, Affin y SentiworNet
def lexicones(arrTextProccesing):
    #Init Afinn
    fileNameAFINN="AFINN/AFINN-111.txt" 
    afinn= dict(map(lambda ws: (ws[0], int(ws[1])), [ 
            ws.strip().split('\t') for ws in open(fileNameAFINN) ]))
    #End Afinn    
    
    array=[]
    for txt in arrTextProccesing:
        scoreAfinn=sum(map(lambda word: afinn.get(word, 0), txt.split()))
        sentScoreP,sentScoreN=sentiwordnet(txt)
        array.append([scoreAfinn,sentScoreP,sentScoreN])
    
    return np.array(array)
   

In [20]:
def processing_text(texto): 
    # Remove all the special characters
    processed_feature=texto
    processed_feature = re.sub(r'\W', ' ', str(processed_feature))
    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    # Converting to Lowercase
    return processed_feature

In [21]:

def naiveBayes(X_train,X_test,y_train,y_test):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))
    print(accuracy_score(y_test, predictions))

def arboleDeDesicion(X_train,X_test,y_train,y_test):
    cart = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
    cart.fit(X_train, y_train)
    predictions = cart.predict(X_test)    
    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))
    print(accuracy_score(y_test, predictions))

def randomArbol(X_train,X_test,y_train,y_test):
    rfc = RandomForestClassifier(n_estimators=200, random_state=0,n_jobs=-1)
    rfc.fit(X_train, y_train)
    predictions = rfc.predict(X_test)

    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))
    print(accuracy_score(y_test, predictions))
    
def entrenamientoyPrueba(features,labels):    
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)
    print("\n","Naive Bayes","\n")
    naiveBayes(X_train, X_test, y_train, y_test)
    print("\n","Arbol de desición","\n")
    arboleDeDesicion(X_train,X_test,y_train,y_test)
    print("\n","Random Arbol","\n")
    randomArbol(X_train, X_test, y_train, y_test)

In [None]:
texto_para_procesar = tweets.iloc[:, 10].values #selecciona los textos
labels = tweets.iloc[:, 1].values

texto_procesado = [] 

for sentence in range(0, len(texto_para_procesar)):
    procesado = processing_text(texto_para_procesar[sentence]).lower()
    texto_procesado.append(procesado) 

print(texto_para_procesar[1:5])
print(texto_procesado[1:5])

In [10]:
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
features = vectorizer.fit_transform(texto_procesado).toarray()

In [11]:
lexicones=lexicones(texto_procesado) 

In [57]:
#Probar concatenación con lexicones
concat=np.concatenate((features,lexicones),axis=1)

In [58]:
#Mostrar con concatenación
entrenamientoyPrueba(concat,labels)


 Naive Bayes 

[[703 488 679]
 [ 69 244 301]
 [ 31  59 354]]
              precision    recall  f1-score   support

    negative       0.88      0.38      0.53      1870
     neutral       0.31      0.40      0.35       614
    positive       0.27      0.80      0.40       444

    accuracy                           0.44      2928
   macro avg       0.48      0.52      0.42      2928
weighted avg       0.66      0.44      0.47      2928

0.4443306010928962

 Arbol de desición 

[[1537  160  173]
 [ 279  210  125]
 [ 117   44  283]]
              precision    recall  f1-score   support

    negative       0.80      0.82      0.81      1870
     neutral       0.51      0.34      0.41       614
    positive       0.49      0.64      0.55       444

    accuracy                           0.69      2928
   macro avg       0.60      0.60      0.59      2928
weighted avg       0.69      0.69      0.69      2928

0.6933060109289617

 Random Arbol 

[[1736   96   38]
 [ 291  272   51]
 [ 127  

In [59]:
#Mostrar solo con lexicones
print(lexicones)
entrenamientoyPrueba(lexicones,labels)

[[0.         0.01136364 0.        ]
 [0.         0.         0.        ]
 [0.         0.38690476 0.41369048]
 ...
 [1.         0.04545455 0.06818182]
 [2.         0.36566938 0.45195131]
 [0.         0.56689108 0.39761992]]

 Naive Bayes 

[[1346  419  105]
 [ 174  385   55]
 [  96  165  183]]
              precision    recall  f1-score   support

    negative       0.83      0.72      0.77      1870
     neutral       0.40      0.63      0.49       614
    positive       0.53      0.41      0.47       444

    accuracy                           0.65      2928
   macro avg       0.59      0.59      0.57      2928
weighted avg       0.70      0.65      0.67      2928

0.6536885245901639

 Arbol de desición 

[[1537  160  173]
 [ 279  210  125]
 [ 117   44  283]]
              precision    recall  f1-score   support

    negative       0.80      0.82      0.81      1870
     neutral       0.51      0.34      0.41       614
    positive       0.49      0.64      0.55       444

    accuracy

In [6]:
text="😁 Hola mundo cómo vás . dfkj as 🤐🙃🌤️💕" 

In [7]:
import emoji

def extract_emojis(str):
    return ' '.join(c for c in str if c in emoji.UNICODE_EMOJI)

In [12]:
emojis=extract_emojis(text)
texPro=processing_text(text)
print(emojis)
print(texPro)
texPro+=emojis
nltk.word_tokenize(texPro)


😁 🤐 🙃 🌤 💕
  Hola mundo cómo vás   dfkj as      


['Hola', 'mundo', 'cómo', 'vás', 'dfkj', 'as', '😁', '🤐', '🙃', '🌤', '💕']