In [121]:
from pymongo import MongoClient
from nltk.tokenize import word_tokenize
import pandas as pd
import re
import operator 
import json
from collections import Counter
from nltk.corpus import stopwords
import string
from collections import defaultdict
import vincent
import matplotlib.pyplot as plt
import datetime
import csv
from sklearn.cluster import KMeans
import numpy as np

# FEATURES:

### - PREPROCESSING: 
    - FILTER STOPWORDS+COMMON WORDS
    - RETRIEVE THE MOST COMMON TERMS AND HASHTAGS (WITH THEIR ASSOCIATED COUNTS)
    - RETRIEVE CO-OCCURRING TERMS
### - TIME SERIES ANALYSIS:
    - REPRESENT HASHTAGS/TERMS TIMESERIES (WITH POSSIBILITY TO CHOOSE THE GRANULARITY)
    - CLUSTER HASHTAGS/TERMS TIMESERIES USING K-MEANS
### - VISUALIZATION:
    - NATIVE VISUALIZATION OF TIME SERIES CLUSTERING
    - EXPORT DATA TO PLOT GEOLOCATED GRAPHS AND TIME SERIES CHARTS ON JS LIBRARIES (LINKED IN THE COMMENTS)

# TODO:
### - IMPROVE PREPROCESSING 
    - FILTER MORE COMMON WORDS
    - AUTOMATICALLY EXTEND THE LIST OF UNINTERESTING WORDS USING THE RESULTS OF PREVIOUS ITERATIONS
    - ADD STUDY OF BIGRAMS
### - EXPLORE DIFFERENT MACHINE LEARNING ALGORITHMS FOR TIME SERIES
    - STUDY MORE APPROPRIATE ALGORITHMS LIKE DTW
### - STUDY GEOLOCATIONAL FEATURES 

In [144]:
################################DATABASE VARIABLES##################################
# Database and collection happen to share the same name.
db_name = collection = "immigration"

# TODO: automatically grow the lists of excluded words and hashtags
personalized_excluded_words = ['example1', 'example2']
personalized_excluded_hashtags = ['#example1', '#example2']

# When the dataset is too large, only the documents between these two
# positions (start inclusive, end exclusive) are loaded.
dataset_start_index, dataset_end_index = 200000, 300000

################################TIME CLUSTERING VARIABLES##################################

# Kinds of object that can be studied, and the one selected for this run.
data_types = ['hashtag', 'words', 'accounts']
chosen_type = data_types[0]

# How many hashtags/terms/other objects to study.
objects_size = 30

# Cluster label -> matplotlib colour code.
# TODO: expand the colour dictionary to support more clusters.
plot_colors = dict(enumerate("bgrcmykw"))

# Date window applied to the time series.
from_filter, to_filter = '2015-01-01', '2016-10-01'

# Width of one time bucket: tweets are counted per `time_span` interval.
time_span = '10000Min'

num_clusters = 5

################################OUTPUT FILE NAMES##################################


In [123]:
# Open a MongoDB connection with MongoClient's default arguments, then select
# the database and collection named in the configuration cell.
client = MongoClient()
db = client[db_name]
coll= db[collection]

In [124]:
# Show which collection the notebook is bound to.
print(coll)

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'immigration'), u'immigration')


In [125]:
# Query every document of the collection and slice the cursor down to the
# configured window [dataset_start_index, dataset_end_index).
tweets = coll.find()[dataset_start_index:dataset_end_index]


In [126]:
# Number of documents in the sliced cursor (the slice bounds are honoured).
print(tweets.count(with_limit_and_skip=True))

100000


In [127]:
# Drain the cursor into a list and build a DataFrame from the fetched documents.
# NOTE(review): the whole slice is held in memory at once; the dataset_*_index
# settings in the configuration cell are what bound its size.
tweetsDf = pd.DataFrame(list(tweets)) 

In [128]:
# Sanity check: number of rows loaded into the DataFrame.
print(len(tweetsDf))

100000


In [129]:
# Keep only the text column and display one entry as an example.
tweets = tweetsDf["text"]
print("Example tweet:")
tweets[1]

Example tweet:


u'RNC makes major hire to woo Latinos https://t.co/OiF8lfSf9O @GOP'

In [130]:
##################################PREPROCESSING UTILITIES####################

# Verbose-mode pattern for simple western emoticons: eyes, optional nose, mouth.
# The mouth class originally listed "\]" twice and never "\[", so ":[" was split
# into two tokens; the duplicate is replaced by the missing opening bracket.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried in this order, so the more specific patterns come first.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# tokens_re finds any token anywhere in a string; emoticon_re matches a string
# that is an emoticon and nothing else.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    """Split the string `s` into a list of tokens.

    Tokens are emoticons, HTML tags, @-mentions, hashtags, URLs, numbers,
    words, or any other single non-whitespace character; whitespace is dropped.
    """
    return tokens_re.findall(s)


def preprocess(s, lowercase=False):
    """Tokenize `s`, optionally lower-casing every token that is not an emoticon.

    Emoticons keep their case when `lowercase` is true, so that e.g. ":D"
    is not turned into ":d".
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens


tweet = "@gioazzi: example!  http://gioazzi.com #democrats #republicans "
print(preprocess(tweet))


['@gioazzi', ':', 'example', '!', 'http://gioazzi.com', '#democrats', '#republicans']


In [131]:
#TODO: escape emoticons errors
# Co-occurrence table (com[w1][w2] with w1 < w2) and the frequency counters
# that the loop below fills in.
com = defaultdict(lambda : defaultdict(int))
count_all = Counter()
count_all_single = Counter()
count_all_hash = Counter()
count_all_terms = Counter()
punctuation = list(string.punctuation)

stop = stopwords.words('english') + punctuation + personalized_excluded_words + personalized_excluded_hashtags #+com
# Membership is tested once per token, so use a set instead of scanning the list.
stop_set = set(stop)

for index, row in tweetsDf.iterrows():
    # Tokenize each tweet once; every list below is a filter over these terms
    # (previously the tweet was tokenized and stop-filtered four times).
    terms_all = [term for term in preprocess(row["text"]) if term not in stop_set]
    # Kept under its original name as well: the bigram placeholder refers to it.
    terms_stop = list(terms_all)
    # Update the counter
    count_all.update(terms_all)
    # Count terms only once, equivalent to Document Frequency
    terms_single = set(terms_all)
    count_all_single.update(terms_single)
    # Count hashtags only
    terms_hash = [term for term in terms_all if term.startswith('#')]
    count_all_hash.update(terms_hash)
    # Count terms only (no hashtags, no mentions).
    # startswith() needs a tuple (not a list) when given several prefixes.
    terms_only = [term for term in terms_all if not term.startswith(('#', '@'))]
    count_all_terms.update(terms_only)
    # Build co-occurrence matrix: every unordered pair of distinct terms in the
    # tweet, stored under its alphabetically sorted key so (a, b) == (b, a).
    for i in range(len(terms_only)-1):
        for j in range(i+1, len(terms_only)):
            w1, w2 = sorted([terms_only[i], terms_only[j]])
            if w1 != w2:
                com[w1][w2] += 1

com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
print(terms_max[:5])
                

[((u'Republican', u'immigration'), 10749), ((u'Republican', u'Trump'), 9496), ((u'Donald', u'Trump'), 6274), ((u'Immigration', u'Republican'), 5760), ((u'GOP', u'undocumented'), 5680)]


In [132]:
# Print the 15 most common entries of each counter under its own heading.
print("*************DOCUMENT FREQUENCY****************")
for title, counter in (("All", count_all_single),
                       ("Only hashtags", count_all_hash),
                       ("Only terms", count_all_terms)):
    print(title)
    print(counter.most_common(15))

*************DOCUMENT FREQUENCY****************
All
[(u'Republican', 35502), (u'GOP', 31822), (u'immigration', 19439), (u'Trump', 16032), (u'immigrants', 10813), (u'undocumented', 9835), (u'Immigration', 9371), (u'The', 8817), (u'Latino', 8230), (u'via', 7781), (u'\u2026', 6976), (u'Undocumented', 6545), (u'Latinos', 6510), (u'#GOP', 6400), (u'amp', 6090)]
Only hashtags
[(u'#GOP', 6429), (u'#tcot', 3961), (u'#immigration', 3269), (u'#gop', 2160), (u'#Republican', 1521), (u'#p2', 1426), (u'#teaparty', 1222), (u'#UniteBlue', 1152), (u'#ocra', 1077), (u'#Immigration', 1055), (u'#Trump', 1029), (u'#sgp', 1018), (u'#GOPDebate', 922), (u'#Election2016', 876), (u'#ImmigrationReform', 845)]
Only terms
[(u'Republican', 35949), (u'GOP', 32509), (u'immigration', 19657), (u'Trump', 17491), (u'immigrants', 10944), (u'undocumented', 9912), (u'The', 9860), (u'Immigration', 9495), (u'Latino', 8464), (u'via', 7788), (u'amp', 7446), (u'\u2026', 7226), (u'\u2019', 6841), (u'Latinos', 6757), (u'Undocument

In [133]:
# Build the frequency table from the counter that matches the configured
# chosen_type. The time-series cell filters tokens by chosen_type, so always
# using the hashtag counter here made the 'words' and 'accounts' modes track
# objects that could never match.
if chosen_type == 'hashtag':
    count_all_dict = dict(count_all_hash)
elif chosen_type == 'words':
    count_all_dict = dict(count_all_terms)
elif chosen_type == 'accounts':
    # There is no dedicated counter for accounts; @-mentions are part of count_all.
    count_all_dict = dict((term, n) for term, n in count_all.items() if term.startswith('@'))
else:
    raise ValueError("unknown chosen_type %r, expected one of %r" % (chosen_type, data_types))

df = pd.DataFrame.from_dict(count_all_dict, orient='index').reset_index()
df.columns = ["word", "count"]
# DataFrame.sort() was superseded by sort_values() and later removed from
# pandas; prefer the new name and fall back only on installations without it.
if hasattr(df, 'sort_values'):
    df = df.sort_values('count', ascending=False)
else:
    df = df.sort(['count'], ascending=False)
# Keep only the objects_size most frequent objects and export them.
df = df[:objects_size]
df.to_json("Frequency.json")
df.head(5)



Unnamed: 0,word,count
7756,#GOP,6429
9260,#tcot,3961
3670,#immigration,3269
8262,#gop,2160
8645,#Republican,1521


In [134]:
# The string below is a no-op expression that serves as usage notes for
# viewing the exported chart in a browser.
'''
TO PLOT: 
Run the server:
python -m http.server 8888 # Python 3
python -m SimpleHTTPServer 8888 # Python 2

in chart.html uncomment this line: parse("term_freq.json");

open
http://localhost:8888/chart.html
'''

# Prepare data to plot a bar chart of the most frequent terms:
# take the 20 highest-count (term, count) pairs ...
word_freq = count_all_terms.most_common(20)
# ... and split them into one tuple of terms and one tuple of counts.
# NOTE: `labels` is reassigned by the clustering cell further down (kmeans.labels_).
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
# presumably iter_idx names the key that holds the x-axis values - confirm against the vincent docs
bar = vincent.Bar(data, iter_idx='x')
# Write the chart specification to the file that chart.html is told to parse.
bar.to_json('term_freq.json')

In [135]:
#prepare data to plot timeseries of some terms/hashtags in the dates_dict
print(df.head(5))
'''
TO PLOT: 
Run the server:
python -m http.server 8000 # Python 3
python -m SimpleHTTPServer 8000 # Python 2

in chart.html uncomment this line: parse("time_chart.json");

open
http://localhost:8000/chart.html
'''
#to study custom hashtags, initialize the dates_dict as in the comment below:
#dates_dict = {'#immigration':[], '#GOP':[] } 
# One empty timestamp list per tracked object. The number of objects follows
# objects_size from the configuration instead of a hard-coded 30, which
# silently capped the study whenever objects_size was set higher.
dates_dict = dict((word, []) for word in df["word"][:objects_size])
#dates_dict = take(200, dates_dict.iteritems())
#print dates_dict


              word  count
7756          #GOP   6429
9260         #tcot   3961
3670  #immigration   3269
8262          #gop   2160
8645   #Republican   1521


In [136]:
# Show the tracked objects, each with its (still empty) list of timestamps.
print(dates_dict)

{u'#ImmigrationReform': [], u'#sgp': [], u'#GOPDebate': [], u'#teaparty': [], u'#GOP': [], u'#Conservative': [], u'#US': [], u'#Latinos': [], u'#tlot': [], u'#Election2016': [], u'#TNTweeters': [], u'#ccot': [], u'#gop': [], u'#Trump2016': [], u'#Trump': [], u'#news': [], u'#ocra': [], u'#Latino': [], u'#republican': [], u'#Immigration': [], u'#immigrants': [], u'#Not1More': [], u'#tcot': [], u'#UniteBlue': [], u'#immigration': [], u'#Republican': [], u'#CIR': [], u'#politics': [], u'#p2': [], u'#undocumented': []}


In [145]:
# Choose the token filter for the configured object type once, rather than
# re-testing chosen_type for every tweet. An unknown type is rejected up
# front; before, it left `terms` undefined (or stale from an earlier run).
if chosen_type == 'hashtag':
    def is_tracked(term):
        return term.startswith('#')
elif chosen_type == 'words':
    def is_tracked(term):
        return not term.startswith(('#', '@'))
elif chosen_type == 'accounts':
    def is_tracked(term):
        return term.startswith('@')
else:
    raise ValueError("unknown chosen_type %r, expected one of %r" % (chosen_type, data_types))

# dates_dict is created in another cell and only appended to here, so running
# this cell a second time would record every timestamp twice and double all
# counts. Empty the lists first to make the cell safe to re-run.
for key in dates_dict:
    dates_dict[key] = []

for index, row in tweetsDf.iterrows():
    terms = [term for term in preprocess(row['text']) if is_tracked(term)]

    # track when each studied object is mentioned (at most once per tweet);
    # looping over the tweet's own terms avoids scanning every tracked key.
    for key in set(terms):
        if key in dates_dict:
            dates_dict[key].append(row['raw']['created_at'])

# a list of "1" to count the hashtags
ones = {}
idx = {}
dataframes = {}
per_minute = {}
for key in dates_dict:

    ones[key] = [1]*len(dates_dict[key])
    # the index of the series
    idx[key] = pd.DatetimeIndex(dates_dict[key])
    # the actual series (a series of 1s for the moment)
    dataframes[key] = pd.Series(ones[key], index=idx[key])

    # Resampling / bucketing: sum the 1s falling into each time_span interval.
    # NOTE(review): resample(how=...) is the legacy pandas call style used throughout this notebook.
    per_minute[key] = dataframes[key].resample(time_span, how='sum').fillna(0)
print("done")
#print per_minute

done


In [146]:
# all the data together
match_data = per_minute
# we need a DataFrame, to accommodate multiple series
all_matches = pd.DataFrame(data=match_data) #,index=per_minute['#'].index
# Resampling: put every series on one common grid of time_span buckets
all_matches = all_matches.resample(time_span, how='sum').fillna(0)
# Select the interval to study. The bounds are the from_filter/to_filter dates
# of the configuration cell; the names used here before (from_id/to_id) are
# not defined anywhere in the notebook and raised a NameError on a fresh kernel.
all_matches = all_matches.loc[from_filter:to_filter]

#prepare data to plot on external libraries
time_chart = vincent.Line(all_matches[list(match_data.keys())])
time_chart.axis_titles(x='Time', y='Freq')
time_chart.legend(title='Matches')
time_chart.to_json('time_chart.json')
all_matches.to_csv('time_chart.csv')
print(all_matches.head(5))
print("done")

                     #CIR  #Conservative  #Election2016  #GOP  #GOPDebate  \
2016-03-06 08:00:00     0             22              4    72         156   
2016-03-13 06:40:00    24             38             14   224           2   
2016-03-20 05:20:00    12             64             14   472           0   
2016-03-27 04:00:00    60             54             32   510           4   
2016-04-03 02:40:00    10             40              8   302           0   

                     #Immigration  #ImmigrationReform  #Latino  #Latinos  \
2016-03-06 08:00:00            18                   2        6        12   
2016-03-13 06:40:00            66                   2       20        40   
2016-03-20 05:20:00            88                  24       30        36   
2016-03-27 04:00:00            88                   6       52        56   
2016-04-03 02:40:00            54                   2       32        40   

                     #Not1More      ...        #news  #ocra  #p2  #politics  \
2

In [147]:
###########################CLUSTERING###################
# Transpose so that each row is the time series of one tracked object
# (one column of all_matches) and each column is a time bucket.
matches_np = np.array(all_matches).transpose()
# K-means starts from random centroids; fixing random_state makes the cluster
# assignment reproducible from one run of the notebook to the next.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(matches_np)
#centroids= kmeans.cluster_centers_
# labels[i] is the cluster assigned to row i of matches_np,
# i.e. to the i-th column of all_matches.
labels = kmeans.labels_
#print centroids
print(labels)


[2 0 2 1 2 0 2 0 0 2 0 2 0 0 2 2 2 0 2 3 2 2 2 2 2 2 4 2 2 2]


In [148]:
# One list per cluster: a "cluster N" header, the colour used for it in the
# plot, then the objects assigned to it.
clusters = []
for i in range(num_clusters):
    # Wrap around the colour table so that asking for more clusters than
    # there are colours reuses colours instead of raising a KeyError.
    clusters.append(["cluster " + str(i), " color " + plot_colors[i % len(plot_colors)]])

# labels follows the column order of all_matches, so pair them up directly.
for el, label in zip(all_matches.columns.values, labels):
    clusters[label].append(el)

for el in clusters:
    print(el)


['cluster 0', ' color b', u'#Conservative', u'#Immigration', u'#Latino', u'#Latinos', u'#Republican', u'#Trump', u'#Trump2016', u'#gop']
['cluster 1', ' color g', u'#GOP']
['cluster 2', ' color r', u'#CIR', u'#Election2016', u'#GOPDebate', u'#ImmigrationReform', u'#Not1More', u'#TNTweeters', u'#US', u'#UniteBlue', u'#ccot', u'#immigrants', u'#news', u'#ocra', u'#p2', u'#politics', u'#republican', u'#sgp', u'#teaparty', u'#tlot', u'#undocumented']
['cluster 3', ' color c', u'#immigration']
['cluster 4', ' color m', u'#tcot']


In [149]:
# Plot every series, coloured by the cluster it was assigned to.
# The time axis is the same for all series, so compute it once.
x = np.array(all_matches.index)
# Start at 0: the loop used to start at 1 and never drew the first series.
for index in range(len(matches_np)):
    y = matches_np[index]
    # Wrap around the colour table when there are more clusters than colours.
    plt.plot(x, y, color=plot_colors[labels[index] % len(plot_colors)])
# Same axis titles as the exported time chart.
plt.xlabel('Time')
plt.ylabel('Freq')
plt.show()

In [142]:
# Export every tweet that carries coordinates as a GeoJSON FeatureCollection.

#TO PLOT: http://geojson.io/#map=2/20.0/0.0

geo_features = []
for _, tweet_row in tweetsDf.iterrows():
    raw = tweet_row['raw']
    # Tweets without coordinates cannot be placed on the map.
    if not raw['coordinates']:
        continue
    geo_features.append({
        "type": "Feature",
        "geometry": raw['coordinates'],
        "properties": {
            "text": tweet_row['text'],
            "created_at": raw['created_at']
        }
    })

geo_data = {
    "type": "FeatureCollection",
    "features": geo_features
}

# Save geo data
serialized = json.dumps(geo_data, indent=4)
with open('geo_data.json', 'w') as fout:
    fout.write(serialized)

In [143]:
# Placeholder for the planned bigram study (see the TODO list at the top).
# The code sits inside a string literal, so it is never executed; as the last
# expression of the cell, the string is only echoed back as the cell's output.
'''
Bigrams: not used yet
from nltk import bigrams 
terms_bigram = bigrams(terms_stop)
'''

'\nBigrams: not used yet\nfrom nltk import bigrams \nterms_bigram = bigrams(terms_stop)\n'