# Preprocesamiento de texto

In [162]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
import re
import emoji
import emojis

## Base

In [163]:
os.chdir('/Users/valentinacastilla/Library/CloudStorage/OneDrive-UniversidaddeLosAndes/Trabajo Final')

In [164]:
df = pd.read_pickle('Data/Consolidated_V0.pkl')

In [165]:
df.columns

Index(['ID', 'Permalink', 'Author ID', 'Author Name', 'Author Location',
       'Author Description', 'Author Followers', 'Author Following',
       'Author Tweets', 'Author Profile Image', 'Author Verified', 'Date',
       'Text', 'Replies', 'Retweets', 'Favorites', 'Quotes', 'is Retweet?',
       'Reply To User Name', 'Mentions', 'Referenced Tweet', 'Reference Type',
       'Referenced Tweet Author ID', 'Media URLs', 'Media Keys'],
      dtype='object')

In [166]:
# variables de texto relevantes
df = df[['ID', 'Author ID', 'Author Tweets', 'Text']]
df

Unnamed: 0,ID,Author ID,Author Tweets,Text
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...
...,...,...,...,...
95,1525265946952310784,335665232,4616,@AlvaroUribeVel Claro es que todo esto son mon...
96,1525257787894276097,335665232,4616,RT @maclago12: Yo ya me vacuné contra las ment...
97,1525242880142483456,335665232,4616,@CARLOSFMEJIA El imbécil que crea en el cuento...
98,1525169648441884681,335665232,4616,@MariaFdaCabal Eso depende de la dictadura si ...


In [167]:
# arreglar texto - gracias ñungix y santix
def remove_url(text):
    # Lo \S todo lo que no es un espacio
    text = re.sub(r'https\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    return text

def remove_accents(text):
    dict_accents={'á':'a', 'é':'e', 'í':'i', 'ó':'o', 'ú':'u'}
    for acc in dict_accents.keys(): 
        text=text.replace(acc,dict_accents.get(acc))
    return text

# quitar rt del inicio (REVISAR)
def remove_rt(text):
    text = re.sub('RT ', '', text)
    return text

def clean_text(text):
    text = text.lower()
    remove_url(text)
    text = remove_accents(text)
    text = re.sub(r'@[\w]+', '', text)
    #text = re.sub(r'#[\w]+', '', text)
    text = re.sub(r'[^\w]+', ' ', text)
    text = re.sub(r'_+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip() 
    return text

def full_preprossesing(text):
    text=remove_rt(text)
    text=remove_accents(text)
    text=remove_url(text)
    text=clean_text(text)
    return text

In [168]:
# texto lindo - no tiene nada de caracteres especiales, inclyendo emojis
df['tweets limpios']=df['Text'].apply(lambda x: full_preprossesing(x))

In [169]:
df.head(10)

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...,no se lo de presencia pero me interesa el kit
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...,juntos con volvemos con una gran sesion del se...
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...,una nueva masacre con armas en un colegio adem...
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...,esto steve kerr
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...,si te habla de anticorrupcion pero privilegia ...
5,1529186552621981697,252912120,16033,@olademarf 👏👏👏👏,
6,1529139106688782339,252912120,16033,@silvia_otero85 Mucho para pensar con esta vis...,mucho para pensar con esta vision interesante
7,1529129297834803202,252912120,16033,@Santiagomezm @vanguardiacom @lasillavacia Y l...,y la faceta youtuber
8,1529092935605751811,252912120,16033,RT @periodistica: Lo de Fajardo es bastante tr...,lo de fajardo es bastante triste paso de const...
9,1528891703498883072,252912120,16033,RT @histoftech: Uber CEO’s Memo Admits That Ub...,uber ceo s memo admits that uber can t be prof...


## Palabras más frecuentes

### Stopwords

In [170]:
stop_words = stopwords.words('spanish')
stop_words = [clean_text(i) for i in stop_words]
print(stop_words)

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'mas', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'si', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'tambien', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mi', 'antes', 'algunos', 'que', 'unos', 'yo', 'otro', 'otras', 'otra', 'el', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tu', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mio', 'mia', 'mios', 'mias', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estas', 'esta', 'estamos', 'estais', 'estan', 'e

### En general - palabras "más" usadas

In [171]:
# bolsa de palabras
tweets_pp = [full_preprossesing(t) for t in df['tweets limpios']]

In [172]:
len(tweets_pp)

486300

In [173]:
vectorizer=CountVectorizer(min_df=0.015, stop_words=stop_words) #creo el modelo vacio y después lo ajusto a mis datos
#lo que me va a interesar es el vocabulario y que tome mis textos y los convierta en vectores
vectorizer.fit(tweets_pp)

CountVectorizer(min_df=0.015,
                stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los',
                            'del', 'se', 'las', 'por', 'un', 'para', 'con',
                            'no', 'una', 'su', 'al', 'lo', 'como', 'mas',
                            'pero', 'sus', 'le', 'ya', 'o', 'este', 'si',
                            'porque', ...])

In [174]:
features=vectorizer.get_feature_names_out() #un vector con todas las parablas que se están usando en mis textos
print(features.shape)
features
# estas palabras aparecen menos en menos de 1.5 veces en los tweets

(41,)


array(['ahora', 'aqui', 'asi', 'años', 'bien', 'bogota', 'cambio',
       'campaña', 'candidato', 'colombia', 'dia', 'dias', 'dice', 'duque',
       'elecciones', 'fico', 'gente', 'gobierno', 'gracias', 'hace',
       'hacer', 'hoy', 'mejor', 'pais', 'personas', 'petro', 'politica',
       'presidente', 'primera', 'puede', 'ser', 'siempre', 'solo', 'tan',
       'uribe', 'usted', 'va', 'vamos', 'ver', 'vida', 'votar'],
      dtype=object)

In [175]:
freq=vectorizer.fit_transform(tweets_pp).toarray()

In [176]:
freq.shape

(486300, 41)

### Conteo de palabras por tweets

In [177]:
# Contar las palabras por tweet
def count_words(sentence) :
    num_words=len(re.findall(r'\w+',sentence))
    return num_words

In [178]:
# numero de palabras
df['palabras tweets']=df['tweets limpios'].apply(lambda x: count_words(x))
# no se cuentan los @users o #, solo palabras extra

In [179]:
df.head()

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios,palabras tweets
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...,no se lo de presencia pero me interesa el kit,10
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...,juntos con volvemos con una gran sesion del se...,23
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...,una nueva masacre con armas en un colegio adem...,45
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...,esto steve kerr,3
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...,si te habla de anticorrupcion pero privilegia ...,21


### Proporción de tweets por usuario que usan las palabras más frecuentes

In [180]:
# Marcar si el texto tiene alguna de estas palabras
palabras = features.tolist()
politica = ['bogota','cambio', 'campaña', 'candidato', 'colombia', 
            'democracia','dios', 'dos', 'duque', 'elecciones', 
            'fajardo', 'fico', 'francia', 'gobierno', 'gustavo', 
            'gutierrez', 'historia', 'medellin', 'nacional', 'pais', 'paz', 
            'petro', 'politica', 'presidente', 'pueblo', 'rodolfo', 'trabajo', 
            'uribe', 'verdad', 'voto', 'votos', 'masacre', 'resistencia', 'paro']

In [181]:
def match_words_relevantes(text):
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in palabras]), re.I)
    matches = regex.findall(text)
    relevantes = len(matches)
    return relevantes

def match_words_politicas(text):
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in politica]), re.I)
    matches = regex.findall(text)
    relevantes = len(matches)
    return relevantes

def match_words_stopwords(text):
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in stop_words]), re.I)
    matches = regex.findall(text)
    relevantes = len(matches)
    return relevantes

In [182]:
# numero de palabras relevantes y politicas
df['palabras relevantes tweets']=df['tweets limpios'].apply(lambda x: match_words_relevantes(x))
df['palabras politicas tweets']=df['tweets limpios'].apply(lambda x: match_words_politicas(x))
df['stopwords tweets']=df['tweets limpios'].apply(lambda x: match_words_stopwords(x))
# no se cuentan los @users o #, solo palabras extra

In [183]:
# proporción de palabras relevantes por tweet
df['prop palabras relevantes tweets']=df['palabras relevantes tweets']/df['palabras tweets']
df['prop palabras politicas tweets']=df['palabras politicas tweets']/df['palabras tweets']
df['prop stopwords tweets']=df['stopwords tweets']/df['palabras tweets']

In [184]:
# proporción de palabras usadas por usuario
df_user=df.groupby(by='Author ID').sum()
df_user.columns

Index(['palabras tweets', 'palabras relevantes tweets',
       'palabras politicas tweets', 'stopwords tweets',
       'prop palabras relevantes tweets', 'prop palabras politicas tweets',
       'prop stopwords tweets'],
      dtype='object')

In [185]:
df_user['prop palabras relevantes por usuario']=df_user['palabras relevantes tweets']/df_user['palabras tweets']
df_user['prop palabras politicas por usuario']=df_user['palabras politicas tweets']/df_user['palabras tweets']
df_user['prop stopwords por usuario']=df_user['stopwords tweets']/df_user['palabras tweets']

In [186]:
df_user=df_user[['prop palabras relevantes por usuario', 'prop palabras politicas por usuario', 
                 'prop stopwords por usuario']]

In [187]:
df_user=df_user.reset_index()

In [188]:
df_merge=pd.merge(df, df_user, on='Author ID', how='left')

In [189]:
df_merge.sort_values(by='Author ID')

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios,palabras tweets,palabras relevantes tweets,palabras politicas tweets,stopwords tweets,prop palabras relevantes tweets,prop palabras politicas tweets,prop stopwords tweets,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario
56177,1529546268770435074,29283,677767,RT @danpfeiffer: There is going to be a lot of...,there is going to be a lot of debate on this w...,41,0,0,1,0.000000,0.000000,0.024390,0.000000,0.000322,0.033119
56163,1529559118326837248,29283,677767,RT @oneunderscore__: Some people are still pus...,some people are still pushing the lie that the...,52,0,0,2,0.000000,0.000000,0.038462,0.000000,0.000322,0.033119
56164,1529558996264587264,29283,677767,RT @GeauxGabrielle: University in California d...,university in california did a comprehensive s...,48,0,0,4,0.000000,0.000000,0.083333,0.000000,0.000322,0.033119
56165,1529558940987625473,29283,677767,RT @AOC: The last time leadership waded in to ...,the last time leadership waded in to save him ...,51,0,0,2,0.000000,0.000000,0.039216,0.000000,0.000322,0.033119
56166,1529558855663099904,29283,677767,RT @AOC: Why even be in Congress if you don’t ...,why even be in congress if you don t believe i...,55,0,0,3,0.000000,0.000000,0.054545,0.000000,0.000322,0.033119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300235,1507497026249973771,1389769251704147971,1008,RT @MissPoirot07: Si me muero de este virus qu...,si me muero de este virus que tengo diganle a ...,17,0,0,12,0.000000,0.000000,0.705882,0.066919,0.054924,0.448232
300208,1529565720752930816,1389769251704147971,1008,RT @ELTIEMPO: 🚨 Tumban fallo fiscal contra Gus...,tumban fallo fiscal contra gustavo petro por b...,17,1,2,6,0.058824,0.117647,0.352941,0.066919,0.054924,0.448232
300234,1507497049801076748,1389769251704147971,1008,RT @agmethescaf: Oficialmente inscritos quiene...,oficialmente inscritos quienes representan la ...,9,0,1,4,0.000000,0.111111,0.444444,0.066919,0.054924,0.448232
300242,1507495696303366148,1389769251704147971,1008,RT @soniabernalcas: Ridiculez de #ElGranDebate...,ridiculez de elgrandebate de rcn con pobres ca...,39,5,3,19,0.128205,0.076923,0.487179,0.066919,0.054924,0.448232


### Emojis

In [190]:
def extract_emojis(s):
  return ' '.join(c for c in s if c in emoji.UNICODE_EMOJI['en'])

In [191]:
df_merge['emojis']=df_merge['Text'].apply(lambda x: extract_emojis(x))

In [192]:
def q_emojis(s):
    lista=len(''.join(c for c in s if c in emoji.UNICODE_EMOJI['en']))
    return lista

In [193]:
df_merge['cantidad emojis']=df_merge['Text'].apply(lambda x: q_emojis(x))

In [194]:
df_merge['usa emojis']=(df_merge['cantidad emojis']>0).astype(int)

In [195]:
df_merge['emojis'].value_counts()

           394484
🧵            2084
👇            1974
❤            1344
😂            1049
            ...  
📚 🤓 💪 🏻         1
🚃 ✈             1
☺ 🤲             1
🎨 🖼             1
🧵 📸             1
Name: emojis, Length: 20778, dtype: int64

In [196]:
df_merge

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios,palabras tweets,palabras relevantes tweets,palabras politicas tweets,stopwords tweets,prop palabras relevantes tweets,prop palabras politicas tweets,prop stopwords tweets,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario,emojis,cantidad emojis,usa emojis
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...,no se lo de presencia pero me interesa el kit,10,0,0,7,0.000000,0.000000,0.700000,0.028879,0.019579,0.338718,,0,0
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...,juntos con volvemos con una gran sesion del se...,23,0,1,11,0.000000,0.043478,0.478261,0.028879,0.019579,0.338718,,0,0
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...,una nueva masacre con armas en un colegio adem...,45,4,1,20,0.088889,0.022222,0.444444,0.028879,0.019579,0.338718,,0,0
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...,esto steve kerr,3,0,0,1,0.000000,0.000000,0.333333,0.028879,0.019579,0.338718,✊ 🏼,2,1
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...,si te habla de anticorrupcion pero privilegia ...,21,0,0,9,0.000000,0.000000,0.428571,0.028879,0.019579,0.338718,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486295,1525265946952310784,335665232,4616,@AlvaroUribeVel Claro es que todo esto son mon...,claro es que todo esto son montajes,7,0,0,5,0.000000,0.000000,0.714286,0.049669,0.033113,0.520814,,0,0
486296,1525257787894276097,335665232,4616,RT @maclago12: Yo ya me vacuné contra las ment...,yo ya me vacune contra las mentiras que han di...,14,1,1,10,0.071429,0.071429,0.714286,0.049669,0.033113,0.520814,😝 💉 🍑,3,1
486297,1525242880142483456,335665232,4616,@CARLOSFMEJIA El imbécil que crea en el cuento...,el imbecil que crea en el cuento de que petro ...,21,1,1,12,0.047619,0.047619,0.571429,0.049669,0.033113,0.520814,,0,0
486298,1525169648441884681,335665232,4616,@MariaFdaCabal Eso depende de la dictadura si ...,eso depende de la dictadura si es de izquierda...,27,1,0,14,0.037037,0.000000,0.518519,0.049669,0.033113,0.520814,,0,0


### Análisis de sentimientos de emojis

In [197]:
def bye_emojis(text):
    emoticon=emoji.demojize(text)
    return emoticon

# quitar espacio entre emojis
def space_emojis(text):
    text = re.sub(':', ' ', text)
    return text

def pretty_lists(text):
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return text

def emojis_text(text):
    text=bye_emojis(text)
    text=space_emojis(text)
    text=pretty_lists(text)
    return text

In [198]:
df_merge['emojis texto']=df_merge['emojis'].apply(lambda x: emojis_text(x))

In [199]:
df_merge

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios,palabras tweets,palabras relevantes tweets,palabras politicas tweets,stopwords tweets,prop palabras relevantes tweets,prop palabras politicas tweets,prop stopwords tweets,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario,emojis,cantidad emojis,usa emojis,emojis texto
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...,no se lo de presencia pero me interesa el kit,10,0,0,7,0.000000,0.000000,0.700000,0.028879,0.019579,0.338718,,0,0,
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...,juntos con volvemos con una gran sesion del se...,23,0,1,11,0.000000,0.043478,0.478261,0.028879,0.019579,0.338718,,0,0,
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...,una nueva masacre con armas en un colegio adem...,45,4,1,20,0.088889,0.022222,0.444444,0.028879,0.019579,0.338718,,0,0,
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...,esto steve kerr,3,0,0,1,0.000000,0.000000,0.333333,0.028879,0.019579,0.338718,✊ 🏼,2,1,raised_fist medium-light_skin_tone
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...,si te habla de anticorrupcion pero privilegia ...,21,0,0,9,0.000000,0.000000,0.428571,0.028879,0.019579,0.338718,,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486295,1525265946952310784,335665232,4616,@AlvaroUribeVel Claro es que todo esto son mon...,claro es que todo esto son montajes,7,0,0,5,0.000000,0.000000,0.714286,0.049669,0.033113,0.520814,,0,0,
486296,1525257787894276097,335665232,4616,RT @maclago12: Yo ya me vacuné contra las ment...,yo ya me vacune contra las mentiras que han di...,14,1,1,10,0.071429,0.071429,0.714286,0.049669,0.033113,0.520814,😝 💉 🍑,3,1,squinting_face_with_tongue syringe peach
486297,1525242880142483456,335665232,4616,@CARLOSFMEJIA El imbécil que crea en el cuento...,el imbecil que crea en el cuento de que petro ...,21,1,1,12,0.047619,0.047619,0.571429,0.049669,0.033113,0.520814,,0,0,
486298,1525169648441884681,335665232,4616,@MariaFdaCabal Eso depende de la dictadura si ...,eso depende de la dictadura si es de izquierda...,27,1,0,14,0.037037,0.000000,0.518519,0.049669,0.033113,0.520814,,0,0,


In [200]:
Positive=['😀','😃','😄','😁','☺️','🤠','😺','😸','😍','🥰','😘',
          '❤️','💕','♥️','💖','💗','💙','❣️','💜','😻','💓','💛','💞','💚','💘','😂',
          '👍','👍🏻','👍🏼','👍🏽','👍🏾','👍🏿','👊','👊🏼','👊🏽','👊🏾','👊🏿',
          '👊🏻','👌🏻','👌🏼','👌🏽','👌🏾','👌🏿','👌','🙏🏻','🙏','🙏🏼','🙏🏾','🙏🏽','🙏🏿',
          '🙌','🙌🏻','🙌🏼','🙌🏽','🙌🏾','🙌🏿','💪','💪🏻','💪🏼','💪🏽','💪🏾','💪🏿',
          '💃🏻','💃','💃🏼','💃🏽','💃🏾','💃🏿','🎉','🍾','🌸','✨','☀️','💋','🎶',
          '👏','👏🏻','👏🏼','👏🏽','👏🏾','👏🏿','😋','🙈','💁🏻‍♀️','💁‍♀️','💁🏼‍♀️','💁🏽‍♀️','💁🏾‍♀️','💁🏿‍♀️']
Negative=['😢','😓','😭','😩','😫','😥','😰','😪','😿','😞','☹️','😣',
          '😤','😠','😡','🤬','💢','😝','😱','😷','😒','😵','💩',
          '😑','🫤','🙄','😎','🙂','🙃','🙁','😶','💔','👎','👎🏼',
          '👎🏻','👎🏽','👎🏾','👎🏿','🖕','🖕🏼','🖕🏽','🖕🏾','🖕🏿','🖕🏻','🔫','💀']
Neutral=['🤨','😉','😌','😐','😳','😯','😦','😧','😮','😲','☯️','🔥','😴','💯','👀','☕️']

In [201]:
Positive = [emojis_text(i) for i in Positive]
Negative = [emojis_text(i) for i in Negative]
Neutral = [emojis_text(i) for i in Neutral]

In [202]:
Positive

['grinning_face',
 'grinning_face_with_big_eyes',
 'grinning_face_with_smiling_eyes',
 'beaming_face_with_smiling_eyes',
 'smiling_face',
 'cowboy_hat_face',
 'grinning_cat',
 'grinning_cat_with_smiling_eyes',
 'smiling_face_with_heart-eyes',
 'smiling_face_with_hearts',
 'face_blowing_a_kiss',
 'red_heart',
 'two_hearts',
 'heart_suit',
 'sparkling_heart',
 'growing_heart',
 'blue_heart',
 'heart_exclamation',
 'purple_heart',
 'smiling_cat_with_heart-eyes',
 'beating_heart',
 'yellow_heart',
 'revolving_hearts',
 'green_heart',
 'heart_with_arrow',
 'face_with_tears_of_joy',
 'thumbs_up',
 'thumbs_up_light_skin_tone',
 'thumbs_up_medium-light_skin_tone',
 'thumbs_up_medium_skin_tone',
 'thumbs_up_medium-dark_skin_tone',
 'thumbs_up_dark_skin_tone',
 'oncoming_fist',
 'oncoming_fist_medium-light_skin_tone',
 'oncoming_fist_medium_skin_tone',
 'oncoming_fist_medium-dark_skin_tone',
 'oncoming_fist_dark_skin_tone',
 'oncoming_fist_light_skin_tone',
 'OK_hand_light_skin_tone',
 'OK_hand_

In [203]:
def match_emojis_positive(text):
    pos = 0
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in Positive]), re.I)
    matches = regex.findall(text)
    pos = (len(matches)>0)
    return pos

def match_emojis_negative(text):
    neg = 0
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in Negative]), re.I)
    matches = regex.findall(text)
    neg = (len(matches)>0)
    return neg

def match_emojis_neutral(text):
    neut = 0
    regex=re.compile('|'.join([r'\b' + w[:-1] if w.endswith('*') else r'\b' + w + r'\b' for w in Neutral]), re.I)
    matches = regex.findall(text)
    neut = (len(matches)>0)
    return neut

In [204]:
# separar palabras por espacio
def split_words(text):
    x = text.split(" ")
    return x

In [205]:
df_merge['emoji positivo']=df_merge['emojis texto'].apply(lambda x: match_emojis_positive(x))
df_merge['emoji negativo']=df_merge['emojis texto'].apply(lambda x: match_emojis_negative(x))
df_merge['emoji neutral']=df_merge['emojis texto'].apply(lambda x: match_emojis_neutral(x))

In [206]:
df_merge['emoji neutral'].value_counts()

False    478308
True       7992
Name: emoji neutral, dtype: int64

In [207]:
df_merge['emoji positivo']=df_merge['emoji positivo'].astype(int)
df_merge['emoji negativo']=df_merge['emoji negativo'].astype(int)
df_merge['emoji neutral']=df_merge['emoji neutral'].astype(int)

In [211]:
df_merge

Unnamed: 0,ID,Author ID,Author Tweets,Text,tweets limpios,palabras tweets,palabras relevantes tweets,palabras politicas tweets,stopwords tweets,prop palabras relevantes tweets,...,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario,emojis,cantidad emojis,usa emojis,emojis texto,emoji positivo,emoji negativo,emoji neutral
0,1529541764843028484,252912120,16033,@roots360co @AdrianaRudling No sé lo de presen...,no se lo de presencia pero me interesa el kit,10,0,0,7,0.000000,...,0.028879,0.019579,0.338718,,0,0,,0,0,0
1,1529453075819282433,252912120,16033,RT @PettinaVanni: Juntos con @CEHColmex @UNAM_...,juntos con volvemos con una gran sesion del se...,23,0,1,11,0.000000,...,0.028879,0.019579,0.338718,,0,0,,0,0,0
2,1529452371662852097,252912120,16033,RT @laurarovi1: Una nueva masacre con armas en...,una nueva masacre con armas en un colegio adem...,45,4,1,20,0.088889,...,0.028879,0.019579,0.338718,,0,0,,0,0,0
3,1529448991611658240,252912120,16033,RT @Toni_Padilla: Esto. Steve Kerr. ✊🏼 https:/...,esto steve kerr,3,0,0,1,0.000000,...,0.028879,0.019579,0.338718,✊ 🏼,2,1,raised_fist medium-light_skin_tone,0,0,0
4,1529302053125758977,252912120,16033,RT @laquintana2015: Si te habla de anticorrupc...,si te habla de anticorrupcion pero privilegia ...,21,0,0,9,0.000000,...,0.028879,0.019579,0.338718,,0,0,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486295,1525265946952310784,335665232,4616,@AlvaroUribeVel Claro es que todo esto son mon...,claro es que todo esto son montajes,7,0,0,5,0.000000,...,0.049669,0.033113,0.520814,,0,0,,0,0,0
486296,1525257787894276097,335665232,4616,RT @maclago12: Yo ya me vacuné contra las ment...,yo ya me vacune contra las mentiras que han di...,14,1,1,10,0.071429,...,0.049669,0.033113,0.520814,😝 💉 🍑,3,1,squinting_face_with_tongue syringe peach,0,1,0
486297,1525242880142483456,335665232,4616,@CARLOSFMEJIA El imbécil que crea en el cuento...,el imbecil que crea en el cuento de que petro ...,21,1,1,12,0.047619,...,0.049669,0.033113,0.520814,,0,0,,0,0,0
486298,1525169648441884681,335665232,4616,@MariaFdaCabal Eso depende de la dictadura si ...,eso depende de la dictadura si es de izquierda...,27,1,0,14,0.037037,...,0.049669,0.033113,0.520814,,0,0,,0,0,0


In [212]:
df_merge['prop stopwords por usuario'].type()


AttributeError: 'Series' object has no attribute 'type'

In [219]:
df_final=df_merge.groupby(by='Author ID').mean()

In [220]:
df_final.head()

Unnamed: 0_level_0,palabras tweets,palabras relevantes tweets,palabras politicas tweets,stopwords tweets,prop palabras relevantes tweets,prop palabras politicas tweets,prop stopwords tweets,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario,cantidad emojis,usa emojis,emoji positivo,emoji negativo,emoji neutral
Author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
29283,31.1,0.0,0.01,1.03,0.0,0.000286,0.029032,0.0,0.000322,0.033119,0.04,0.03,0.0,0.01,0.01
1061601,19.37,1.06,0.88,9.23,0.055742,0.041005,0.4554,0.054724,0.045431,0.47651,0.78,0.33,0.08,0.03,0.01
1357911,27.43,1.07,0.53,13.37,0.042637,0.018037,0.474188,0.039008,0.019322,0.487423,0.52,0.26,0.03,0.01,0.0
2984721,17.14,0.85,0.55,7.37,0.051783,0.03392,0.415015,0.049592,0.032089,0.429988,0.76,0.27,0.09,0.03,0.03
7996082,31.28,0.94,0.28,14.77,0.028305,0.008146,0.464385,0.030051,0.008951,0.472187,0.17,0.15,0.0,0.0,0.0


In [221]:
df_final=df_final[['prop palabras relevantes por usuario', 'prop palabras politicas por usuario',
         'prop stopwords por usuario', 'cantidad emojis', 'usa emojis', 'emoji positivo',
         'emoji negativo', 'emoji neutral']]

In [222]:
df_final=df_final.reset_index()

In [223]:
df_final

Unnamed: 0,Author ID,prop palabras relevantes por usuario,prop palabras politicas por usuario,prop stopwords por usuario,cantidad emojis,usa emojis,emoji positivo,emoji negativo,emoji neutral
0,29283,0.000000,0.000322,0.033119,0.040000,0.030000,0.000000,0.010000,0.010000
1,1061601,0.054724,0.045431,0.476510,0.780000,0.330000,0.080000,0.030000,0.010000
2,1357911,0.039008,0.019322,0.487423,0.520000,0.260000,0.030000,0.010000,0.000000
3,2984721,0.049592,0.032089,0.429988,0.760000,0.270000,0.090000,0.030000,0.030000
4,7996082,0.030051,0.008951,0.472187,0.170000,0.150000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
4894,1389637694075162628,0.055627,0.013583,0.432730,0.890000,0.400000,0.180000,0.040000,0.170000
4895,1389737202742071304,0.038192,0.014640,0.504137,2.897959,0.673469,0.071429,0.040816,0.030612
4896,1389745371690283014,0.048007,0.030004,0.487784,0.360000,0.150000,0.070000,0.000000,0.030000
4897,1389753654849286145,0.058205,0.014551,0.496362,0.139535,0.104651,0.034884,0.011628,0.000000


In [224]:
df_final.to_pickle('user_model_texto.pkl') # save

### pruebas

In [39]:
# numero de veces que usas emojis
# clasificacion de emojis - sentimientos - 
# el paper de jancho - analisis


In [42]:
print(emoji.demojize('Python is 👍'))

Python is :thumbs_up:


In [43]:
print(emoji.demojize('😵‍💫,🥰,☺️'))

:face_with_spiral_eyes:,:smiling_face_with_hearts:,:smiling_face:


In [44]:
#Importing libraries
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

ImportError: cannot import name 'UNICODE_EMO' from 'emot.emo_unicode' (/Users/valentinacastilla/opt/anaconda3/lib/python3.9/site-packages/emot/emo_unicode.py)

In [None]:
print(emoji.demojize(df.Text[1]))