# First, we need to redefine everything we set up in the previous lab:

In [1]:
import json
# Load the Twitter API credentials from the local key file into `keys`.
# json.load parses the open file object directly, so there is no need to read
# the text and strip newlines by hand; parse errors also keep their real
# line/column position this way.
with open("TwitterKeys.json", 'r') as inputFile:
    keys = json.load(inputFile)

In [2]:
import tweepy
from tweepy import OAuthHandler
 
# Pull the four OAuth credentials out of the `keys` dictionary.
consumer_key, consumer_secret = keys["consumer_key"], keys["consumer_secret"]
access_token, access_secret = keys["access_token"], keys["access_secret"]

# Build the OAuth handler from the consumer pair, then attach the access pair.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client constructed from the configured handler.
api = tweepy.API(auth)

In [3]:
from tweepy import Stream
from tweepy.streaming import StreamListener

# The topic is the single knob of this notebook: change only this value.
topic = "Manchester"
# The data file for a topic is named after it: "<topic>.json".
fileName = "{}.json".format(topic)

In [4]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available on this machine.
nltk.download("stopwords")
import string

# Each character of string.punctuation becomes its own entry.
punctuation = [symbol for symbol in string.punctuation]

# Terms to ignore when counting: English stop-words, punctuation marks and
# the retweet / attribution markers.
stop = list(stopwords.words('english'))
stop.extend(punctuation)
stop.extend(['rt', 'via', 'RT'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import re
 
# Emoticon pattern, written for re.VERBOSE (whitespace and "#" comments inside
# the pattern are ignored by the regex engine).
# The mouth class lists both closing and opening brackets; the original listed
# "\]" twice and therefore never recognised ":[".
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried left to right, so the more specific patterns must
# come before the generic "word" and "anything else" fallbacks.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    # Hash-tags: the tail is optional so that a one-character tag such as "#a"
    # is kept whole instead of being split into "#" and "a".
    r"(?:\#+[\w_]+(?:[\w\'_\-]*[\w_]+)?)", # hash-tags
    # Note: "$-_" inside the character class is a character range, not three literals.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One capturing group around the whole alternation, so findall() yields plain
# strings (one per token) rather than tuples.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: matches only when the entire string is a single emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    """Split the string ``s`` into a list of token strings.

    Tokens are the successive non-overlapping matches of ``tokens_re``;
    text that the pattern does not match (e.g. whitespace) is dropped.
    """
    tokens = tokens_re.findall(s)
    return tokens
 
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lower-case the resulting tokens.

    Parameters:
        s: the text to tokenize.
        lowercase: when true, every token is lower-cased except those that
            are entirely an emoticon (so ":D" keeps its capital letter).

    Returns:
        The list of tokens, in order of appearance.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalised = []
    for token in tokens:
        if emoticon_re.search(token):
            # Emoticons are case-sensitive; leave them untouched.
            normalised.append(token)
        else:
            normalised.append(token.lower())
    return normalised

In [6]:
import operator 
import json
from collections import Counter
 
# Term frequencies over the whole file, in three flavours:
count_all_stop = Counter()  # every term that is not a stop-word
count_all_hash = Counter()  # hash-tags only
count_all_only = Counter()  # non-stop-word terms that are neither hash-tags nor @-mentions

# Set membership is O(1); the `stop` list itself is left untouched.
stop_set = set(stop)

with open(fileName, 'r') as f:
    # The file is read as one JSON document per line.
    for line in f:
        if not line.strip():
            continue  # a blank line is not a JSON document; json.loads would raise
        tweet = json.loads(line)
        if "text" not in tweet:
            continue  # records without a "text" field have nothing to count

        # Tokenize once and reuse the result for all three counters.
        terms = preprocess(tweet['text'])

        # The stop-word list is lower-case while tokens keep their original
        # case, so compare case-insensitively ("The" is a stop-word too).
        terms_stop = [term for term in terms if term.lower() not in stop_set]
        terms_hash = [term for term in terms if term.startswith('#')]
        terms_only = [term for term in terms_stop if not term.startswith(('#', '@'))]

        count_all_stop.update(terms_stop)
        count_all_hash.update(terms_hash)
        count_all_only.update(terms_only)

# Now we can use one of the counters created above to generate the JSON file that OurService.html will read. That HTML page will, in turn, be served by Server.ipynb.

In [7]:
import vincent

# Take the 15 most frequent non-stop-word terms as (term, count) pairs.
word_freq = count_all_stop.most_common(15)

# Transpose the pairs into one tuple of terms and one tuple of counts.
labels, freq = zip(*word_freq)

# 'x' carries the bar labels and is named as the index via iter_idx.
data = dict(data=freq, x=labels)
bar = vincent.Bar(data, iter_idx='x')

# Write the chart to the JSON file.
bar.to_json('term_freq.json')

# Now we can run Server.ipynb and open http://localhost:8954, which redirects to http://localhost:8954/OurService.html and shows the bar plot built from the JSON file term_freq.json.