In [3]:
import numpy as np
import pandas as pd
import os
import json

<h3>Create DataFrame based on individual tweets files</h3>

In [73]:
tweets = []

In [74]:
for entry in os.scandir('C:/Users/marti_000/Documents/NLP Project/Data/Tweets/Paro'):
    if entry.is_file():
        tweet_file = open('C:/Users/marti_000/Documents/NLP Project/Data/Tweets/Paro/' + entry.name, encoding='utf-8')
        tweets.append(json.loads(tweet_file.read()))

In [75]:
len(tweets)

4187

In [68]:
del tweets[278]

In [76]:
tweets_df = pd.DataFrame(tweets, index=[tweet['ID'] for tweet in tweets])
tweets_df = tweets_df.drop('ID', axis=1)
tweets_df.index.name = 'ID'

In [77]:
tweets_df.shape

(4187, 12)

In [9]:
tweets_df.head()

Unnamed: 0_level_0,datetime,has_media,is_reply,is_retweet,medias,nbr_favorite,nbr_reply,nbr_retweet,text,url,user_id,usernameTweet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
884200730600644610,2017-07-09 21:00:56,,False,False,,1,0,1,Concluyó la toma y paro de estudiantes de ...,/OvejeroNoticias/status/884200730600644610,3199468276,OvejeroNoticias
884202300612521984,2017-07-09 21:07:10,True,False,False,[https://t.co/Ovjaub1fv1],0,0,0,Docentes de Santa Cruz continuarán el paro : ...,/ElComunalDiario/status/884202300612521984,771060664685174784,ElComunalDiario
884202625998221312,2017-07-09 21:08:28,,False,False,,0,0,0,De trancazo en trancazo el gobierno monta su...,/cabox86/status/884202625998221312,44535962,cabox86
884202674660442112,2017-07-09 21:08:39,,False,False,,0,0,0,"RT unificaluchas ""RT salariojustotsa: Los com...",/RebelinAluminio/status/884202674660442112,840628267,RebelinAluminio
884203271711322119,2017-07-09 21:11:02,True,False,False,[],0,0,0,El Gobierno porteño descontará el día de pa...,/gianluca_drogo/status/884203271711322119,825729469,gianluca_drogo


<h3>Cleaning tweets</h3>

In [10]:
import re

In [11]:
def remove_by_regex(tweets, regexp):
        tweets.loc[:, 'text'].replace(regexp, '', inplace=True)
        return tweets

In [12]:
def remove_urls(tweets): 
    return remove_by_regex(tweets_df, re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+'))

In [13]:
def remove_usernames(tweets):
        return remove_by_regex(tweets, re.compile(r"@[^\s]+[\s]?"))

In [14]:
def remove_special_chars(tweets):  # it unrolls the hashtags to normal words
        for remove in map(lambda r: re.compile(re.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$",
                                                                     "@", "%", "^", "*", "(", ")", "{", "}",
                                                                     "[", "]", "|", "/", "\\", ">", "<", "-",
                                                                     "!", "?", ".", "'",
                                                                     "--", "---", "#"]):
            tweets.loc[:, 'text'].replace(remove, '', inplace=True)
        return tweets

In [15]:
def remove_numbers(tweets):
        return remove_by_regex(tweets, re.compile(r"\s?[0-9]+\.?[0-9]*"))

In [78]:
tweets_df = remove_urls(tweets_df)
tweets_df = remove_usernames(tweets_df)
tweets_df = remove_special_chars(tweets_df)
tweets_df = remove_numbers(tweets_df)

<h3>Tokenization</h3>

In [17]:
from nltk.tokenize import word_tokenize

In [18]:
def tokenize_row(row):
            row['text'] = word_tokenize(row['text'])
            row['tokenized_text'] = [] + row['text']
            return row

In [79]:
tweets_df = tweets_df.apply(tokenize_row, axis=1)

In [21]:
def to_lowercase(row):
    row['text'] = list(map(lambda str: str.lower(), row['text']))
    return row

In [80]:
tweets_df = tweets_df.apply(to_lowercase, axis=1)

<h3>Remove stopwords</h3>

In [23]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords

In [81]:
words = Counter()
for idx in tweets_df.index:
    words.update(tweets_df.loc[idx, 'text'])
words.most_common(5)

[('paro', 4057), ('de', 3984), ('la', 2337), ('el', 2268), ('y', 1915)]

In [82]:
stop = stopwords.words('spanish')

In [84]:
stop_domain = ['paro',
               'http', 'https',
               'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z',
               'ah', 'ahi', 'ahí', 'alguien', 'ante', 'aquí', 'asi', 'así', 'che', 'cob', 'cómo', 'cn', 'cosa', 'da', 'después', 'eh', 'estan', 
               'etc', 'fue', 'hola', 'había', 'hace', 'hacia', 'he', 'hs', 'ir', 'jul', 'lt', 'mitma', 'os', 'puede', 'quién', 
               're', 'retweeted', 'rt', 'sé', 'si', 'sino', 'tal', 'tb', 'tatus', 'tras', 'unas', 'ud', 'us', 'va', 'vas', 'vamos', 'ver', 'ves', 
               'via', 'vía', 'vos', 
               'vs', 'xa', 'xd', 'xq',
               '¡', '+', '…', '¿',
               '+video+cabeza', 'eldestapewebcomc', 'newsgalapagar', 'lavozdelaa', 'borjalujanlago', '¡perú', '¡con', '"más',
               'poorgarprensaobrerao', '¿qué', 'gobiernogenerarparoprecariedadlaboral', 'diariocompsoeacusaal', 'owlylynedcus',
               'jajaja', 'jajajaja', 'jajajajaj',
               'bitlyumdwrq', 'eldiario', '"', 'owlyqpdsred', 
               'twittercomzetaorlandost', 'owlyegpadcawl', 'wpmepjxjrwtp', '¿la', 
               'agencianovacomnotaaspn__idid_tiponota']

In [85]:
stop = stop + stop_domain

In [86]:
stop_whitelist = []

In [29]:
def remove_stopwords(row, stopwords, stopwords_whitelist):
    row['text'] = [word for word in row['text'] if word not in stopwords or word in stopwords_whitelist]
    return row

In [87]:
tweets_df = tweets_df.apply(lambda x: remove_stopwords(x, stop, stop_whitelist), axis=1)

Count all words

In [88]:
words_frequency = defaultdict(int)

In [56]:
def count_words(row, words_frequency):
    for word in row['text']:
        words_frequency[word] +=1

In [89]:
tweets_df.apply(lambda x: count_words(x, words_frequency), axis=1)

ID
884200730600644610    None
884202300612521984    None
884202625998221312    None
884202674660442112    None
884203271711322119    None
884204272082198528    None
884204486167810048    None
884205348168577025    None
884205922586738688    None
884206478189629440    None
884206816250523649    None
884207251451518976    None
884207452878667776    None
884207530632552450    None
884208048700440576    None
884208792870805504    None
884208833148715008    None
884209376202027009    None
884209635300970497    None
884210693729374208    None
884211537312972801    None
884212473401946113    None
884213648729702400    None
884214201987637248    None
884214779279224832    None
884214853128290304    None
884215557897060352    None
884215999624601604    None
884216882181672960    None
884217366925774848    None
                      ... 
885644325362819072    None
885644473224634368    None
885644614434263040    None
885644982350204929    None
885645143323365376    None
885645262747824128    Non

Remove words that only ocurr once

In [34]:
def remove_word(row, words_frequency, min_ocurrence=1, max_ocurrence=1000):
    row['text'] = [word for word in row['text'] if words_frequency[word] > min_ocurrence and words_frequency[word] < max_ocurrence] 
    return row

In [90]:
tweets_df = tweets_df.apply(lambda x: remove_word(x, words_frequency), axis=1)

In [60]:
tweets_df.to_csv('Data/Paro_Tweets_Cleaned_Version.csv')

Sort words in tweets

In [91]:
tweets_df.apply(lambda x: x['text'].sort(), axis=1)

ID
884200730600644610    None
884202300612521984    None
884202625998221312    None
884202674660442112    None
884203271711322119    None
884204272082198528    None
884204486167810048    None
884205348168577025    None
884205922586738688    None
884206478189629440    None
884206816250523649    None
884207251451518976    None
884207452878667776    None
884207530632552450    None
884208048700440576    None
884208792870805504    None
884208833148715008    None
884209376202027009    None
884209635300970497    None
884210693729374208    None
884211537312972801    None
884212473401946113    None
884213648729702400    None
884214201987637248    None
884214779279224832    None
884214853128290304    None
884215557897060352    None
884215999624601604    None
884216882181672960    None
884217366925774848    None
                      ... 
885644325362819072    None
885644473224634368    None
885644614434263040    None
885644982350204929    None
885645143323365376    None
885645262747824128    Non

<h3>Create dictionary and corpus</h3>

In [38]:
from gensim import corpora



In [92]:
dictionary = corpora.Dictionary(tweets_df['text'])
dictionary.compactify()
dictionary.save('paro_tweets.dict')

In [94]:
print(dictionary)

Dictionary(3508 unique tokens: ['estudiantes', 'toma', 'universidad', 'conflicto', 'continuarán']...)


In [93]:
corpus = []

In [95]:
for i,row in tweets_df.text.iteritems():
    corpus.append(dictionary.doc2bow(row))

In [96]:
corpora.MmCorpus.serialize('paro_tweets.mm', corpus)

<h3>LDA Modeling</h3>

In [68]:
from gensim import models

In [97]:
lda_params = {'num_topics': 10, 'passes': 100, 'alpha': 0.001}

In [98]:
lda = models.LdaModel(corpus, id2word=dictionary,
                        num_topics=lda_params['num_topics'],
                        passes=lda_params['passes'],
                        alpha = lda_params['alpha'])

<h3>Visualization</h3>

In [71]:
import pyLDAvis.gensim

In [72]:
tweets_data =  pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(tweets_data)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
