<a href="https://colab.research.google.com/github/MariaIsabelLL/AnalisisSentimientosTwitter/blob/main/5_Twitter_Analisis_Sentimientos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from nltk import word_tokenize
import nltk
nltk.download('punkt')
import re

""" tokenizar tweets"""
# Emoticon sub-pattern, written for re.VERBOSE (whitespace and "#" comments
# inside it are ignored by the regex engine).
_EMOTICON_PATTERN = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried left to right, so the specific token shapes must come
# before the generic "word" and "any non-space character" fallbacks.
_TOKEN_PATTERNS = [
    _EMOTICON_PATTERN,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # any other single non-whitespace character
]

# Compiled once at import time instead of on every call to preprocess().
_TOKENS_RE = re.compile(
    r'(' + '|'.join(_TOKEN_PATTERNS) + ')',
    re.VERBOSE | re.IGNORECASE,
)


def preprocess(s):
    """Split a tweet into tokens.

    Emoticons, HTML tags, @-mentions, hashtags and URLs are each kept as a
    single token; everything else is split into words and individual
    non-whitespace characters. Whitespace is discarded and the original
    casing of the text is preserved.

    Args:
        s: The tweet text.

    Returns:
        A list of token strings in order of appearance (empty for an empty
        or whitespace-only string).
    """
    return _TOKENS_RE.findall(s)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk.stem import SnowballStemmer
# The stop-word corpus is a separate NLTK data package; download it before the
# lookup below so a fresh environment does not fail with LookupError.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('spanish')

def bag_of_words(words):
    """Build a presence-feature dictionary from an iterable of words.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to ``True``; duplicates collapse
        into a single key.
    """
    return {token: True for token in words}

def stem_tokens(tokens, stemmer):
    """Apply ``stemmer.stem`` to every token.

    Args:
        tokens: Iterable of token strings.
        stemmer: Any object exposing a ``stem(token)`` method.

    Returns:
        A new list with the stemmed form of each token, in input order.
    """
    return [stemmer.stem(token) for token in tokens]

def obtain_tokens(tweet):
    """Tokenize a tweet, drop Spanish stop words and stem what remains.

    Args:
        tweet: The tweet text.

    Returns:
        A list with the stem of every token that is not a stop word, in
        order of appearance.
    """
    stemmer = SnowballStemmer('spanish')
    # Step 1: tokenize and remove stop words. The comparison uses the
    # lower-cased token so capitalised stop words (e.g. at the start of a
    # sentence) are filtered out as well.
    kept_tokens = [
        term for term in preprocess(tweet) if term.lower() not in stopwords
    ]
    # Step 2: reduce each remaining token to its stem.
    return stem_tokens(kept_tokens, stemmer)


In [None]:
def lee_datos(loc):
    """Read the training XML files under *loc* and build train/test sets.

    Every file matching ``loc + '*train.xml'`` is parsed. For each ``<tweet>``
    element the polarity label is read from ``sentiments.polarity.value``:
    tweets labelled ``'NONE'`` are ignored, tweets labelled ``'P'`` or ``'N'``
    become feature dictionaries, and any other label (e.g. ``'NEU'``) is only
    counted in the total.

    The test set takes the first 10% (relative to the number of positive
    tweets) of each class; the remaining tweets form the training set.

    Args:
        loc: Path prefix of the corpus directory (it is concatenated directly
            with the file pattern, so it should end with a path separator).

    Returns:
        A ``(trainSet, testSet)`` tuple; each is a list of
        ``(feature_dict, label)`` pairs with label ``'P'`` or ``'N'``.
    """
    pos_reviews_set = []
    neg_reviews_set = []
    total = 0

    for file_name in glob.glob(loc + '*train.xml'):
        # BeautifulSoup consumes the whole file on construction, so the handle
        # can be closed right after parsing.
        with open(file_name, 'r', encoding='utf8') as handle:
            soup = BeautifulSoup(handle, features="xml")

        for tweet in soup.find_all("tweet"):
            label = tweet.sentiments.polarity.value.text
            if label == 'NONE':
                continue
            total += 1
            # Only positive/negative tweets are used, so only those are
            # tokenized.
            if label == 'P':
                features = bag_of_words(obtain_tokens(tweet.content.text))
                pos_reviews_set.append((features, 'P'))
            elif label == 'N':
                features = bag_of_words(obtain_tokens(tweet.content.text))
                neg_reviews_set.append((features, 'N'))

    # Same number of positive and negative examples in the test set.
    size = int(len(pos_reviews_set) * 0.1)
    testSet = pos_reviews_set[:size] + neg_reviews_set[:size]
    trainSet = pos_reviews_set[size:] + neg_reviews_set[size:]

    print('total de casos', total)
    print('total de casos positivos', len(pos_reviews_set))
    print('total de casos negativos', len(neg_reviews_set))
    return trainSet, testSet

In [None]:
#!pip install sklearn-crfsuite
import glob
import json
from sklearn_crfsuite import CRF as CRF_sklearn
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

def clasificadorSentimientos(loc):
    """Train a Naive Bayes sentiment classifier and print its test metrics.

    The corpus is loaded with ``lee_datos(loc)``. After training, the
    accuracy, the weighted F1 score and a per-class classification report
    on the held-out test set are printed.

    Args:
        loc: Corpus location, passed unchanged to ``lee_datos``.

    Returns:
        The trained ``nltk.NaiveBayesClassifier``.
    """
    # Each test item has exactly one label, so the plain scikit-learn metrics
    # are the right tool. The sklearn_crfsuite ``flat_*`` variants flatten
    # nested label sequences and would split any multi-character label into
    # characters.
    from sklearn.metrics import classification_report, f1_score

    (trainSet, testSet) = lee_datos(loc)

    # Naive Bayes classifier
    classifier1 = nltk.NaiveBayesClassifier.train(trainSet)
    print('Naive Bayes classifier',nltk.classify.accuracy(classifier1,  testSet))

    # Predict on the test set.
    X_test = [features for (features, _) in testSet]
    y_test = [label for (_, label) in testSet]
    predSet = [classifier1.classify(features) for features in X_test]

    f1_score2 = f1_score(y_test, predSet, average='weighted')
    print('f1_score',f1_score2)
    report = classification_report(y_test, predSet)
    print(report)

    return classifier1

def prediccionSentimientos(arc,clas):
    """Classify every tweet of a JSON-lines file and print the result.

    Each non-blank line of the file is parsed as a JSON object; its
    ``"text"`` field is tokenized, turned into a feature dictionary and
    classified, and the predicted label is printed next to the text.

    Args:
        arc: Path of the file, one JSON object per line.
        clas: Classifier exposing ``classify(feature_dict)``.
    """
    # Explicit UTF-8 so accented characters are decoded the same way on every
    # platform, matching how the training corpus is read.
    with open(arc, "r", encoding="utf8") as f:
        for line in f:
            if line.isspace():
                continue
            tweet = json.loads(line)
            newTexto = clas.classify(bag_of_words(obtain_tokens(tweet["text"])))
            print("Resultado", newTexto, tweet["text"] )
       

# Train the classifier on the TASS corpus stored in the Colab sample data.
locCorpusTass1 = '/content/sample_data/tass/'
clas = clasificadorSentimientos(locCorpusTass1)
# To classify a file of collected tweets instead, run:
#prediccionSentimientos("coronavirus.json",clas)

# Quick check on two hand-written tweets.
tweet1="@dw_espanol: Lo más triste de la #pandemia del #coronavirus son la cantidad de fallecidos"
tweet2="@dw_espanol: Todos los adultos mayores al fin vacunados!!!"
for tweet in (tweet1, tweet2):
    print(tweet, clas.classify(bag_of_words(obtain_tokens(tweet))))


total de casos 5736
total de casos positivos 1335
total de casos negativos 1232
Naive Bayes classifier 0.6829268292682927
f1_score 0.6781400966183575
              precision    recall  f1-score   support

           N       0.65      0.80      0.72       123
           P       0.74      0.56      0.64       123

    accuracy                           0.68       246
   macro avg       0.69      0.68      0.68       246
weighted avg       0.69      0.68      0.68       246

@dw_espanol: Lo más triste de la #pandemia del #coronavirus son la cantidad de fallecidos N
@dw_espanol: Todos los adultos mayores al fin vacunados!!! P
