In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import preprocessor as p
from nltk.corpus import stopwords
import nltk
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import string
import spacy
import es_core_news_sm
import re

from import_data import read_data


In [16]:
def preprocessing_text(text, lemmatize = False):
    '''
    Clean and tokenize a single tweet.

    Steps, in order: drop standalone laughter words, lemmatize (spaCy
    es_core_news_sm) or stem (Spanish Snowball stemmer), strip emojis, urls,
    numbers and reserved words with the tweet preprocessor, then drop
    stopwords, bare punctuation and words of two characters or fewer.

    INPUT:
        text: string tweet
        lemmatize: when truthy lemmatize the tweet, otherwise stem it
    OUTPUT:
        list of lowercase string tokens. Tokens starting with '@' keep their
        punctuation; every other token has punctuation characters removed.
    '''
    def remove_symbols(word, symbol_set):
        '''Return word without any character contained in symbol_set.'''
        return ''.join(char for char in word
                       if char not in symbol_set)

    def fix_lemmatized_hashtags(tweet):
        '''
        Lemmatizing function separates # and word.
        This function returns string that rejoins hashtags.
        A trailing '#' with no word after it is kept as-is instead of
        raising IndexError.
        '''
        # Split once; the original re-split the tweet on every iteration.
        words = tweet.split()
        tokens = []
        for idx, word in enumerate(words):
            if word == '#':
                if idx + 1 < len(words):
                    tokens.append(word + words[idx + 1])
                else:
                    # dangling '#': nothing to glue it to
                    tokens.append(word)
                continue
            # Skip the word already glued to the preceding '#'. The idx > 0
            # guard stops index -1 from wrapping round to the last token.
            if idx > 0 and words[idx - 1] == '#':
                continue
            tokens.append(word)

        return ' '.join(tokens)

    # define stopwords (a set gives constant-time membership tests)
    stop_words = set(stopwords.words('spanish'))
    stop_words.update(stopwords.words('english'))
    stop_words.add(' ')

    # define punctuation
    punct = set('!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~¿… °¡')

    # remove laughter: collect every "ja", "jaja", ... run found in the text
    # plus the common "lol" spellings, then drop the whitespace-delimited
    # words that are exactly one of those strings
    matcher = re.compile(r'(ja)\1*')
    jaja = {match.group() for match in matcher.finditer(text)}
    jaja.update(['lol', 'LOL', 'Lol', 'LoL'])

    text = ' '.join(word for word in text.split() if word not in jaja)

    if lemmatize:
        # Lemmatize and rejoin. The spaCy model is cached on the function
        # object so it is loaded once rather than once per tweet.
        nlp = getattr(preprocessing_text, '_nlp', None)
        if nlp is None:
            nlp = es_core_news_sm.load()
            preprocessing_text._nlp = nlp
        nlp_text = nlp(text)
        text = ' '.join(token.lemma_ for token in nlp_text)
        text = fix_lemmatized_hashtags(text)

    else:
        # Stem and rejoin
        stemmer = SnowballStemmer('spanish')
        text = ' '.join(stemmer.stem(token) for token in text.split())

    # remove emojis, urls, numbers, and reserved words
    p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED)
    clean_text = p.clean(text)

    # split tweet, remove stopwords, and len(words) <= 2
    tokens = []
    for word in clean_text.split():
        # strip punctuation once and reuse it for every check below
        bare = remove_symbols(word, punct)
        if bare.lower() in stop_words:
            continue
        if word in punct:
            continue
        if len(bare) <= 2:
            continue
        # drop words that the tweet preprocessor reduces to nothing
        if p.clean(bare) == '':
            continue
        # mentions keep their punctuation, everything else is stripped
        tokens.append(word.lower() if word.startswith('@') else bare.lower())

    return tokens

In [18]:
def grab_user_tweets(tweets_vec):
    '''
    Flatten the tokens of several tweets into one list.

    INPUT: iterable of string tweets
    OUTPUT: list with the tokens returned by preprocessing_text for every
            tweet, concatenated in iteration order
    '''
    return [token
            for tweet in tweets_vec
            for token in preprocessing_text(tweet)]

In [7]:
# Load the tweets with the project helper read_data from the '../data' folder.
# NOTE(review): the meaning of (2012, 2012, 1) is defined in import_data —
# presumably start year, end year and a further selector; TODO confirm.
data = read_data('../data', 2012, 2012, 1)

In [8]:
# Group the rows by 'userid' so each account's tweets can be handled together.
users_df_grp = data.groupby('userid')

In [9]:
# Preview the grouped data: GroupBy.head returns the first rows of every
# group, not just the first rows of the whole frame.
users_df_grp.head()

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,...,retweet_tweetid,hashtags,urls,user_mentions,reply_count,like_count,retweet_count,year,month,num_tweets
0,225611245326303232,239471570,ForoCandanga,ForoCandanga,"Caracas, Venezuela",Fundación ForoCandanga | Especialistas en Medi...,518308,411928,2011-01-17,Forocandanga y periodismo necesario en la fisc...,...,,,[http://twitpic.com/a90yb0],,1.0,0.0,3.0,2012,7,119526
1,193773520227012611,N89KLvhi6BHTrhOb+WPn+QCiFhfoLEHL5A4uRUEpA4U=,N89KLvhi6BHTrhOb+WPn+QCiFhfoLEHL5A4uRUEpA4U=,N89KLvhi6BHTrhOb+WPn+QCiFhfoLEHL5A4uRUEpA4U=,,vinotinto 100% me encanta oir musica y q les p...,69,285,2010-11-08,Hala madrid,...,,,,,0.0,0.0,0.0,2012,4,161
2,235571262724902913,138503898,Pepitona Negra,PepitonaNegra,Venezuela,Trabajando,12976,13051,2010-04-29,"Uribe te faltaron cojones, no tiempo.",...,,,,,0.0,0.0,0.0,2012,8,540
3,155773172376018944,239471570,ForoCandanga,ForoCandanga,"Caracas, Venezuela",Fundación ForoCandanga | Especialistas en Medi...,518308,411928,2011-01-17,RT @temasdebates: LA MANO DE PABLO PEREZ ES AU...,...,1.557731e+17,,,"[temasdebates, ForoCandanga]",0.0,0.0,0.0,2012,1,119526
4,200629678682288130,239471570,ForoCandanga,ForoCandanga,"Caracas, Venezuela",Fundación ForoCandanga | Especialistas en Medi...,518308,411928,2011-01-17,RT @Laiguana_tv: VENEZUELA SUENA PARA LA CUMBR...,...,2.006144e+17,,[http://laiguana.tv/noticias/noticia_32.html],"[Laiguana_tv, Laiguana_tv]",0.0,0.0,0.0,2012,5,119526
5,246807850154876928,239471570,ForoCandanga,ForoCandanga,"Caracas, Venezuela",Fundación ForoCandanga | Especialistas en Medi...,518308,411928,2011-01-17,RT @paris_jm: @ForoCandanga Estas Son las nuev...,...,2.468074e+17,,,"[paris_jm, ForoCandanga]",0.0,0.0,0.0,2012,9,119526
6,284867430306680832,239471570,ForoCandanga,ForoCandanga,"Caracas, Venezuela",Fundación ForoCandanga | Especialistas en Medi...,518308,411928,2011-01-17,#SolidaridadCANDANGA http://t.co/UTPdM7kT,...,,[SolidaridadCANDANGA],[http://twitpic.com/9s237g],,0.0,1.0,0.0,2012,12,119526
601,269431812604952578,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,En Jumanji,Dos palabras Optimista y ambicioso. Contador d...,995,728,2010-04-15,Yo teamo mas. RT @soyanamalave: yeampiere vist...,...,,,,[soyanamalave],0.0,0.0,0.0,2012,11,15894
602,265965250589904896,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,Z9SEZKAkYBpJZP34CLNfeKxThol17qJ418cPNdSe5ao=,En Jumanji,Dos palabras Optimista y ambicioso. Contador d...,995,728,2010-04-15,"RT @_Nymphaa: Yo soy de las que te dá alas, te...",...,2.659651e+17,,,[_Nymphaa],0.0,0.0,0.0,2012,11,15894
603,280645946088501248,292401414,Yennefer,Yennefer_Ve,,Tengamos una conducta recta y dejemos al tiemp...,32491,14018,2011-05-03,RT @karinbel1985: #MapaRojoRojito que alegria ...,...,2.806430e+17,[MapaRojoRojito],,[karinbel1985],0.0,0.0,0.0,2012,12,47285


In [13]:
# Inspect up to the first 30 tweet texts of one account (userid '239471570').
data[data.userid == '239471570']['tweet_text'].head(30)

0     Forocandanga y periodismo necesario en la fisc...
3     RT @temasdebates: LA MANO DE PABLO PEREZ ES AU...
4     RT @Laiguana_tv: VENEZUELA SUENA PARA LA CUMBR...
5     RT @paris_jm: @ForoCandanga Estas Son las nuev...
6             #SolidaridadCANDANGA http://t.co/UTPdM7kT
7     RT @ElespinitoSilva: EL FILO: Venezuela, bajo ...
8     #ConstruyendoSOBERANIA ...  http://t.co/CGRMZF...
9     RT @Ramon_ant: BUENOS DIAS HOY MARTES INDEPEND...
10    RT @Rennjparra: @ForoCandanga felicitaciones c...
11    RT @victoriadali53: Mas buenas..buenas..pa mas...
12    RT @henriquezmer04: @forocandanga Más de 2mill...
13    RT @nelson2903: Aló, Presidente regresa el pró...
14       #INVITACIÓN  http://t.co/HHoTyOhg vía @TwitPic
15    RT @zapataruben: @paloalza @ForoCandanga TOD@S...
16    RT @ForoC_Anzo: @ForoCandanga Operativo Mercal...
17    @alagrape LO MIDEN ES POR LA CANTIDAD DE VECES...
18    Lo que niega RICHARD MARDO fue transmitido en ...
19    RT @jorgburrundanga: @solylunaa @ForoCanda

In [19]:
# Build one flat token list per user by running grab_user_tweets over each
# group's 'tweet_text' values.
user_doc = users_df_grp['tweet_text'].apply(grab_user_tweets)

In [20]:
# Turn the per-user result into a DataFrame with 'userid' as a regular column
# and the token lists under 'tweet_document'; index=str converts the row
# labels to strings.
# NOTE(review): this rebinds user_doc to a different kind of object, so
# re-running only this cell will not reproduce the same frame — re-run the
# cell that creates user_doc first.
user_doc = pd.DataFrame(user_doc).reset_index().rename(index=str, columns={"tweet_text": "tweet_document"})

In [21]:
# Preview the per-user token documents.
user_doc.head()

Unnamed: 0,userid,tweet_document
0,+dQ+QlvsYdF8Lald5LNFZRmGAJVayY9jpoeDm4mvMko=,"[perd, opcion, @female_bitch1, jajajaj, casi, ..."
1,+kKXxTxO9bdLP3qG1Pg8duf8g3LJk2ywVN6+Vr551M=,"[@axel_adri, nadaa, olvidalo, tom, mas, doming..."
2,138503898,"[urib, falt, cojones, tiempo, @gobiernodezuli,..."
3,138582229,"[@la_iguanatv:, @vtvcanal8:, chavez, favor, di..."
4,141736595,"[@chavezcandang, excelente, orgull, comand, pa..."


In [22]:
import gensim
# NOTE(review): simple_preprocess and STOPWORDS are not used in the cells
# visible here — confirm before removing.
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Build the token -> id vocabulary from every user's token list.
dictionary = gensim.corpora.Dictionary(user_doc['tweet_document'])
# Prune the vocabulary in place: drop tokens that occur in fewer than 5
# documents or in more than 50% of them, then keep at most the 100000 most
# frequent remaining tokens.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [24]:
from gensim import corpora, models
from pprint import pprint

# Encode each user's token list as a bag-of-words vector over the dictionary.
bow_corpus = [dictionary.doc2bow(doc) for doc in user_doc['tweet_document']]
# Fit a TF-IDF model on the bag-of-words corpus, then re-weight that same
# corpus with it.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [25]:
# LDA on TF-IDF
# random_state pins the model's random initialisation so the topics can be
# reproduced after a kernel restart; without it every run yields different
# topics. NOTE(review): with several workers the training order may still
# introduce small run-to-run differences — confirm if exact repeatability
# matters.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)
# Print the top words of every topic (-1 asks for all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.001*"liv" + 0.001*"separ" + 0.001*"lead" + 0.000*"vote" + 0.000*"onechot" + 0.000*"#hoyganachavez" + 0.000*"quiero" + 0.000*"check" + 0.000*"mtv" + 0.000*"see"
Topic: 1 
Words: 0.001*"detal" + 0.000*"@maperezpirel" + 0.000*"greivis" + 0.000*"amp" + 0.000*"fanat" + 0.000*"@vtvcanal8" + 0.000*"kier" + 0.000*"ideales" + 0.000*"noc" + 0.000*"desacuerd"
Topic: 2 
Words: 0.001*"#siguemeytesig" + 0.001*"algui" + 0.001*"amo" + 0.000*"amp" + 0.000*"envidi" + 0.000*"@globovision:" + 0.000*"soluciones" + 0.000*"tiemp" + 0.000*"@eliasgobern" + 0.000*"demasi"
Topic: 3 
Words: 0.001*"@chavezcandanga:" + 0.001*"@forocandang" + 0.001*"caribi" + 0.001*"#batalladoresinternet" + 0.000*"honra" + 0.000*"maravilloso" + 0.000*"tachir" + 0.000*"@frentebinternet" + 0.000*"glori" + 0.000*"@chavezcandang"
Topic: 4 
Words: 0.001*"photo" + 0.001*"pipi" + 0.001*"@victordrij" + 0.001*"hol" + 0.000*"tremend" + 0.000*"haciend" + 0.000*"caricua" + 0.000*"capril" + 0.000*"vot" + 0.000*"capitul"
Topic:

In [26]:
import pyLDAvis.gensim
# Prepare the pyLDAvis data from the fitted LDA model, the TF-IDF corpus and
# the dictionary, then render it inline.
tw_data =  pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
pyLDAvis.display(tw_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [2]:
# Look up the tweet text stored under index label 5 for userid '239471570'.
# NOTE(review): this cell needs `data` from the loading cell; the recorded
# NameError shows it was executed in a kernel where that cell had not run.
data[data.userid == '239471570']['tweet_text'][5]

NameError: name 'data' is not defined