In [10]:
import json
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from wordcloud import WordCloud
from pysentimiento.preprocessing import preprocess_tweet

## Corpus SENT-COVID

In [11]:
with open('data/SENT-COVID.json') as file:
    data = json.load(file)
    
pd.options.mode.chained_assignment = None                                         
pd.set_option('display.max_colwidth',None)   


df = pd.DataFrame(data)
print('Numero de tweets: ' + str(len(df)))
df.head()

Numero de tweets: 4594


Unnamed: 0,Label,Tweet,id
0,NEUTRO,-@dulcema201 @BronstonRaqsa02 Protocolo de COVID !!!!,1401047081121353728
1,NEUTRO,-#COVID19 #QuedateEnCasa en Morelia Centro,1258159310162595843
2,POSITIVO,-México va en en aumento con el #Covid_19. Tal vez no tengamos la estabilidad de Europa o estados unidos. Para mantener 120 días en paro total. Pero podemos ser precavidos al usar la #SanaDistancia,1272748988626862082
3,NEUTRO,-@sororavirus Creo en todo y nada. 💜,1349385638722883585
4,NEGATIVO,-@GobiernoMX había prometido 389,1360615587114844161


## Preprocesamiento

In [14]:
def clean_tweet(text):
  text = re.sub(r'[~^0-9]', '', text) #numeros
  text = re.sub("\\s+", ' ', text) ##Espacios blancos dobles
  text = re.sub('\n', ' ', text) ##Saltos de linea

  pattern = r'([.])([A-Z#@¿])'
  pattern2 = r'([-?])([a-zA-Z#@¿])'
  pattern3 = r'([a-zA-Z])([#@¿(])'
  pattern4 = r'([:!])([a-zA-Z#@¿])'
  text = re.sub(pattern, r'\1 \2', text) # Separacion de punto seguido por una mayuscula
  text = re.sub(pattern2, r'\1 \2', text)
  text = re.sub(pattern3, r'\1 \2', text)
  text = re.sub(pattern4, r'\1 \2', text)
  return text 


def preprocess(text):  # Preprocesamiento de pysentimiento   
  return preprocess_tweet(text, normalize_laughter=True, shorten=2, 
                          emoji_wrapper='', user_token='', url_token='url')  


def normalize(text):
 pattern2 = r'([a-zA-Z])([.])'
 pattern3 = r'([.])([a-zA-Z])'
 text = re.sub(pattern2, r'\1 \2', text)
 text = re.sub(pattern3, r'\1 \2', text)
 
 text = "".join(u for u in text if u not in ("?","¿", ".", ";", ":", "!","¡",'"',"%","“","”","$","&","'","\\", "(",")",
                                             "*","+",",","/","<",">","=","^","•","...", "ç","π","ⓘ", "-", "_","#","|"))
 a,b = 'áéíóúÁÉÍÓÚ','aeiouAEIOU'
 trans = str.maketrans(a,b)     
 text = text.translate(trans) # Reemplazo de palabras acentuadas       

 pattern  = r'([a-z])([A-Z-])'
 text = re.sub(pattern, r'\1 \2', text)

 #text = re.sub(r'@[A-Za-z0-9_]+', '', text)
 text = text.lower()
 return text  


def tokenize(text):    
  text= text.split(sep = ' ')  # Tokenización por palabras individuales
  text= [token for token in text if len(token) > 1]  # Eliminación de tokens con una longitud < 2
  return(text) 

In [15]:
df['clean_tweet'] = df['Tweet'].apply(clean_tweet) 
df['preprocess_tweet'] = df['clean_tweet'].apply(preprocess)
df['normalized_tweet'] = df['preprocess_tweet'].apply(normalize)
df['tokenized_tweet'] = df['normalized_tweet'].apply(tokenize)

df[['Tweet','normalized_tweet','Label']][3395:3405]

Unnamed: 0,Tweet,normalized_tweet,Label
3395,"-""Reino Unido es uno de los países con más muertos por covid-19 del mundo y el NHS",reino unido es uno de los paises con mas muertos por covid del mundo y el nhs,NEGATIVO
3396,-Que buenas charlas,que buenas charlas,POSITIVO
3397,-En Ignacio Esteva 48 entre Tagle y Cano como todos los fines de semana están en fiesta,en ignacio esteva entre tagle y cano como todos los fines de semana estan en fiesta,NEUTRO
3398,-Hola 33 https://t.co/MKwXBDkPCr,hola url,NEUTRO
3399,-Mañana tendremos este interesante ejercicio de apoyo colectivo! Última llamada! https://t.co/hiEWl7HdRh#Crisis #puebla #eneagrama #COVID19mx #webinario #Webinars #sesiongratuita https://t.co/3hdkzh0rYG,mañana tendremos este interesante ejercicio de apoyo colectivo ultima llamada url crisis puebla eneagrama covidmx webinario webinars sesiongratuita url,NEUTRO
3400,-@CiroGomezL CiroTe propongo una forma de compararnos vs el mundo en covid 130 millones de mexicanos somos el 1.78 % de población total46,ciro te propongo una forma de compararnos vs el mundo en covid millones de mexicanos somos el de poblacion total,NEUTRO
3401,-Tengo dolor de garganta,tengo dolor de garganta,NEGATIVO
3402,-@lopezdoriga Bueno pero es @HLGatell no le pidas demasiado mejor pregúntenle de la fuerza moral de su patrón,bueno pero es no le pidas demasiado mejor preguntenle de la fuerza moral de su patron,NEGATIVO
3403,-Oigan no se supone que Alfaro se las jactaba del gobernador más responsable de México y era el mesías salvador del COVID para Jalisco? Bueno...no parece eso justo ahora.,oigan no se supone que alfaro se las jactaba del gobernador mas responsable de mexico y era el mesias salvador del covid para jalisco bueno no parece eso justo ahora,NEGATIVO
3404,-#Distrito04 del @INECoahula 🖥️ Así funcionará la expedición de #ConstanciasDigitales,distrito del ordenador de sobremesa asi funcionara la expedicion de constancias digitales,NEUTRO


## Lemmatizacion

In [None]:
import spacy
from nltk.stem.snowball import SnowballStemmer

sp = spacy.load('es_core_news_sm')

def lemmatization(text):
    doc = sp(text)
    return ' '.join([word.lemma_ for word in doc]) 

#stemmer = SnowballStemmer('spanish')
#stemmed_spanish = [stemmer.stem(item) for item in spanish_words]

In [None]:
df['lem_tweet'] = df['normalized_tweet'].apply(lemmatization)
df['lemtokenized_tweet'] = df['lem_tweet'].apply(tokenize)
df[['Label', 'normalized_tweet','lem_tweet','tokenized_tweet','lemtokenized_tweet']][100:110]

Unnamed: 0,Label,norm_tweet,lem_tweet,tokenized_tweet,lemtokenized_tweet
100,NEGATIVO,esos hermanos negrete una bola de vividores,ese hermano negrete uno bola de vividor,"[esos, hermanos, negrete, una, bola, de, vividores]","[ese, hermano, negrete, uno, bola, de, vividor]"
101,NEUTRO,esta en crisis la crisis url,este en crisis el crisis url,"[esta, en, crisis, la, crisis, url]","[este, en, crisis, el, crisis, url]"
102,NEGATIVO,uff cara triste pero aliviada microbio microbio quedate en casa casa con jardin coahuila mexico cara llorando,uff cara triste pero aliviado microbio microbio quedatar en casa casa con jardin coahuila mexico cara llorar,"[uff, cara, triste, pero, aliviada, microbio, microbio, quedate, en, casa, casa, con, jardin, coahuila, mexico, cara, llorando]","[uff, cara, triste, pero, aliviado, microbio, microbio, quedatar, en, casa, casa, con, jardin, coahuila, mexico, cara, llorar]"
103,NEUTRO,no se que tanto maman con lady vacuna,no él que tanto mamar con lady vacuna,"[no, se, que, tanto, maman, con, lady, vacuna]","[no, él, que, tanto, mamar, con, lady, vacuna]"
104,NEUTRO,ojo,ojo,[ojo],[ojo]
105,NEUTRO,hoy en el programa versiones raras,hoy en el programa versión rara,"[hoy, en, el, programa, versiones, raras]","[hoy, en, el, programa, versión, rara]"
106,NEUTRO,hoy participamos nuevamente para el programa de opinion yo creo,hoy participar nuevamente para el programa de opinion yo creer,"[hoy, participamos, nuevamente, para, el, programa, de, opinion, yo, creo]","[hoy, participar, nuevamente, para, el, programa, de, opinion, yo, creer]"
107,NEUTRO,flexibilidad,flexibilidad,[flexibilidad],[flexibilidad]
108,POSITIVO,dios los cuide y les conceda una pronta recuperacion y que cuide y bendiga a aquellos que aun no han presentado sintomas de contagio,dios él cuidir y él conceder uno pronto recuperacion y que cuidir y bendiga a aquel que aun no haber presentar sintoma de contagio,"[dios, los, cuide, les, conceda, una, pronta, recuperacion, que, cuide, bendiga, aquellos, que, aun, no, han, presentado, sintomas, de, contagio]","[dios, él, cuidir, él, conceder, uno, pronto, recuperacion, que, cuidir, bendiga, aquel, que, aun, no, haber, presentar, sintoma, de, contagio]"
109,NEGATIVO,y las sanciones para y por el terrible manejo de la crisis que,y el sanción para y por el terrible manejo de el crisis que,"[las, sanciones, para, por, el, terrible, manejo, de, la, crisis, que]","[el, sanción, para, por, el, terrible, manejo, de, el, crisis, que]"


In [16]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


False

## Stopwords

In [17]:
from nltk.corpus import stopwords

# Obtención de listado de stopwords del español
stop_words_esp = list(stopwords.words('spanish'))

def remove_stopwords(text):
    text = [w for w in text if not w in stop_words_esp]
    return text

df['normsw_tweet'] = df['normalized_tweet'].apply(remove_stopwords)
#df['lemsw_tweet'] = df['lem_tweet'].apply(remove_stopwords)
df['normtoksw_tweet'] = df['tokenized_tweet'].apply(remove_stopwords)
#df['lemtoksw_tweet'] = df['lemtokenized_tweet'].apply(remove_stopwords)

print(stop_words_esp[:100])
#df[['Label', 'lemtokenized_tweet', 'lemtoksw_tweet']][100:110]


['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo']


## Train test 

In [18]:
from sklearn.model_selection import train_test_split

X1 = df['normalized_tweet']          #Tweets normalizados
#X2 = df['lem_tweet']           #Tweets lemmatizados
X3 = df['tokenized_tweet']     #Normalizados y tokenizados
#X4 = df['lemtokenized_tweet']  #Lemmatizados y tokenizados 
X5 = df['normtoksw_tweet']     #Normalizados, tokenizados y sin stopwords
#X6 = df['lemtoksw_tweet']      #Lemmatizados, tokenizados y sin stopwords 
y = df['Label']                #Etiquetas

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.25 ,random_state=37)

In [None]:
value, counts = np.unique(y_train, return_counts=True)
print(dict(zip(value, 100 * counts / sum(counts))))
value, counts = np.unique(y_test, return_counts=True)
print(dict(zip(value, 100 * counts / sum(counts))))

{'NEGATIVO': 33.64296081277213, 'NEUTRO': 44.93468795355588, 'POSITIVO': 21.42235123367199}
{'NEGATIVO': 34.89991296779809, 'NEUTRO': 44.38642297650131, 'POSITIVO': 20.713664055700608}


## Vectorizaciones

### Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf= TfidfVectorizer(min_df=3, ngram_range=(1,2), stop_words = stop_words_esp).fit(X_train)
                        
print('Numero de features: ' +str(len(tfidf.get_feature_names_out())))
tfidf.fit(X_train)

Numero de features: 2764


TfidfVectorizer(min_df=3, ngram_range=(1, 2),
                stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los',
                            'del', 'se', 'las', 'por', 'un', 'para', 'con',
                            'no', 'una', 'su', 'al', 'lo', 'como', 'más',
                            'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí',
                            'porque', ...])

In [None]:
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)
X_train_tfidf 

<3445x2764 sparse matrix of type '<class 'numpy.float64'>'
	with 30163 stored elements in Compressed Sparse Row format>

### Countvectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

countvect = CountVectorizer(min_df=3, ngram_range=(1,2)).fit(X_train)

print('Numero de features: ' +str(len(countvect.get_feature_names_out())))
countvect.fit(X_train)

Numero de features: 4435


CountVectorizer(min_df=3, ngram_range=(1, 2))

In [None]:
X_train_cv = countvect.transform(X_train)
X_test_cv  = countvect.transform(X_test)
X_train_cv

<3445x4435 sparse matrix of type '<class 'numpy.int64'>'
	with 55227 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  ### CountVectorizer de scikitlearn

tfidf = TfidfVectorizer()      # Instanciamos un objeto de la clase 
X = tfidf.fit_transform(data)  # Se ajustan a los datos y se transforman en una matriz
tfidf.get_feature_names_out()  # Obtencion del vocabulario de palabras 

### Word embedding

In [None]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.models import FastText

wordvectors_file = 'data/wiki.es.vec'
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file, limit=600000)

#embedding=200
#w2v = Word2Vec(X6, min_count=3, vector_size=embedding, window=5, sg=1 )
#w2v.train(X6, total_examples= len(df['lem_tweet']), epochs=20)

#wordvectors.most_similar('pozole')

In [None]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += wordvectors[word].reshape((1, size))
            count += 1.
        except KeyError:  
            continue
    if count != 0:
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(X6), 300)) 
for i in range(len(X6)):
    wordvec_arrays[i,:] = word_vector(X6[i], 300)
    
X_w2v = pd.DataFrame(wordvec_arrays)
X_w2v.shape

(4594, 300)

In [None]:
# vocabulario CountVectorizer:
print(f" Número de tokens creados: {len(countvect.get_feature_names_out())}")
countvect.get_feature_names_out()

# vocabuilario TFIDF:
#print(f" Número de tokens creados: {len(tfidf.get_feature_names_out())}")
#tfidf.get_feature_names()

 Número de tokens creados: 4587


array(['abajo', 'abajo flecha', 'abajo tono', ..., 'zeneca', 'zona',
       'zoom'], dtype=object)

## Bert

In [19]:
!pip3 install transformers
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json 
!tar -xzvf pytorch_weights.tar.gz
!mv config.json pytorch/.
!mv vocab.txt pytorch/.

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m/bin/bash: wget: command not found
/bin/bash: wget: command not found
/bin/bash: wget: command not found
tar: Error opening archive: Failed to open 'pytorch_weights.tar.gz'
mv: rename config.json to pytorch/.: No such file or directory
mv: rename vocab.txt to pytorch/.: No such file or directory


In [20]:
import torch
import transformers as ppb
from transformers import BertForMaskedLM, BertTokenizer

In [21]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading: 100%|██████████| 256M/256M [00:33<00:00, 8.03MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
tokenized = X1.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(4594, 418)

In [23]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)


In [24]:
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

KeyboardInterrupt: 

## **skl-bert wrapper**

In [None]:
from bert_sklearn import BertClassifier
from bert_sklearn import load_model

model = BertClassifier()         
model.fit(X_train, y_train)

#model.bert_model = 'bert-large-uncased'
#model.num_mlp_layers = 2
#model.max_seq_length = 196
#model.epochs = 3
#model.learning_rate = 4e-5
#model.gradient_accumulation_steps = 4

#savefile = 'bert_mod_test.bin'
#model.save(savefile)
#bert_mod = load_model(savefile)

In [None]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
scr = model.score(X_test, y_test)


Predicting:   0%|          | 0/144 [00:00<?, ?it/s]

Predicting:   0%|          | 0/144 [00:00<?, ?it/s]

Testing:   0%|          | 0/144 [00:00<?, ?it/s]


Loss: 0.9107, Accuracy: 60.23%


In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

print(classification_report(y_test, y_pred))

from sklearn.model_selection import GridSearchCV

params = {'epochs':[3, 4, 5], 'learning_rate':[2e-5, 3e-5, 5e-5]}

clf = GridSearchCV(BertClassifier(validation_fraction=0), 
                    params,
                    scoring='accuracy',
                    verbose=True)

clf.fit(X_train ,y_train)


In [None]:
!pip install jupyter_http_over_ws
!jupyter serverextension enable --py jupyter_http_over_ws

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter_http_over_ws
  Downloading jupyter_http_over_ws-0.0.8-py2.py3-none-any.whl (18 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 16.7 MB/s 
Installing collected packages: jedi, jupyter-http-over-ws
Successfully installed jedi-0.18.1 jupyter-http-over-ws-0.0.8
Enabling: jupyter_http_over_ws
- Writing config: /root/.jupyter
    - Validating...
      jupyter_http_over_ws 0.0.7 [32mOK[0m


In [None]:
!jupyter notebook \
  --NotebookApp.allow_origin='https://colab.research.google.com' \
  --port=2022 \
  --NotebookApp.port_retries=0

Traceback (most recent call last):
  File "/usr/local/bin/jupyter-notebook", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.7/dist-packages/jupyter_core/application.py", line 269, in launch_instance
    return super().launch_instance(argv=argv, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 845, in launch_instance
    app.initialize(argv)
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/application.py", line 88, in inner
    return method(app, *args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/notebook/notebookapp.py", line 1537, in initialize
    self.init_webapp()
  File "/usr/local/lib/python3.7/dist-packages/notebook/notebookapp.py", line 1321, in init_webapp
    self.http_server.listen(port, self.ip)
  File "/usr/local/lib/python3.7/dist-packages/tornado/tcpserver.py", line 143, in listen
    sockets = bind_sockets(port, address=address)
  File "/usr/local/lib/python3.7/dist-packa