# GeenPeil Monitor: Exploratory Data Analysis

In [74]:
import json
import time
from collections import Counter

In [75]:
# Read the tweets into a list of dictionaries (one dictionary per tweet).
# The context manager closes the file handle even if reading fails, and the
# encoding is stated explicitly: JSON text is UTF-8, and the tweets contain
# non-ASCII characters that a platform-default codec may not decode.
with open('data-17_03_2016.json', encoding='utf-8') as tweetsFile:
    tweetsString = tweetsFile.read()
tweets = json.loads(tweetsString)

# How does a tweet look?
print(tweets[0])

{'favorited': False, 'in_reply_to_screen_name': 'AdriaanBeenen', 'id_str': '702799069832466432', 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'in_reply_to_status_id': 702525968070610945, 'timestamp_ms': '1456395330280', 'retweeted': False, 'user': {'description': 'Art Music Science Maths Physics QM Tech HR Humanity Nature. T mostly Dutch. ★ Blackstar, Shining, UFO collector(7). Quarky. RT/... not endorsement', 'profile_sidebar_fill_color': '000000', 'location': 'Villa of Ormen, Netherlands', 'screen_name': 'Backenricker', 'geo_enabled': False, 'favourites_count': 4824, 'id_str': '3576975202', 'follow_request_sent': None, 'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/657929087252627456/kPGzL3vR.jpg', 'utc_offset': -28800, 'is_translator': False, 'default_profile': False, 'listed_count': 335, 'name': 'Valentino-v.Gogh', 'default_profile_image': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/68662829421

In [76]:
# Which top-level fields does a tweet carry? Inspect the first one.
firstTweet = tweets[0]
print(firstTweet.keys())

dict_keys(['favorited', 'in_reply_to_screen_name', 'id_str', 'source', 'in_reply_to_status_id', 'timestamp_ms', 'retweeted', 'user', 'in_reply_to_user_id_str', 'contributors', 'geo', 'favorite_count', 'entities', 'is_quote_status', 'possibly_sensitive', 'truncated', 'created_at', 'id', 'in_reply_to_status_id_str', 'place', 'filter_level', 'extended_entities', 'in_reply_to_user_id', 'retweet_count', 'coordinates', 'text', 'lang'])


In [77]:
# Look at the nested user object of the first tweet.
firstUser = tweets[0]['user']

# Which headers are in the user object?
print(firstUser.keys())

# What is in a user?
print(firstUser)

dict_keys(['description', 'profile_sidebar_fill_color', 'location', 'screen_name', 'geo_enabled', 'favourites_count', 'id_str', 'follow_request_sent', 'profile_background_image_url', 'utc_offset', 'is_translator', 'default_profile', 'listed_count', 'name', 'default_profile_image', 'profile_image_url', 'url', 'protected', 'statuses_count', 'profile_background_image_url_https', 'profile_image_url_https', 'created_at', 'notifications', 'id', 'verified', 'profile_banner_url', 'contributors_enabled', 'profile_background_color', 'profile_sidebar_border_color', 'profile_link_color', 'profile_text_color', 'friends_count', 'following', 'profile_background_tile', 'followers_count', 'profile_use_background_image', 'time_zone', 'lang'])
{'description': 'Art Music Science Maths Physics QM Tech HR Humanity Nature. T mostly Dutch. ★ Blackstar, Shining, UFO collector(7). Quarky. RT/... not endorsement', 'profile_sidebar_fill_color': '000000', 'location': 'Villa of Ormen, Netherlands', 'screen_name': '

In [78]:
# Read Dutch noise words file: one word per line, trailing whitespace
# (including the newline) stripped. The context manager guarantees the file
# handle is closed once the list has been built.
# NOTE(review): the file's encoding is not stated anywhere visible; the
# platform default is used, as before -- confirm and pass encoding= if needed.
with open('stopwoordenlijst.txt', 'r') as f:
    noiseWords = [line.rstrip() for line in f]
print(noiseWords)

['aangaande', 'aangezien', 'achter', 'achterna', 'afgelopen', 'al', 'aldaar', 'aldus', 'alhoewel', 'alias', 'alle', 'allebei', 'alleen', 'alsnog', 'altijd', 'altoos', 'ander', 'andere', 'anders', 'anderszins', 'behalve', 'behoudens', 'beide', 'beiden', 'ben', 'beneden', 'bent', 'bepaald', 'betreffende', 'bij', 'binnen', 'binnenin', 'boven', 'bovenal', 'bovendien', 'bovengenoemd', 'bovenstaand', 'bovenvermeld', 'buiten', 'daar', 'daarheen', 'daarin', 'daarna', 'daarnet', 'daarom', 'daarop', 'daarvanlangs', 'dan', 'dat', 'de', 'die', 'dikwijls', 'dit', 'door', 'doorgaand', 'dus', 'echter', 'eer', 'eerdat', 'eerder', 'eerlang', 'eerst', 'elk', 'elke', 'en', 'enig', 'enigszins', 'enkel', 'er', 'erdoor', 'even', 'eveneens', 'evenwel', 'gauw', 'gedurende', 'geen', 'gehad', 'gekund', 'geleden', 'gelijk', 'gemoeten', 'gemogen', 'geweest', 'gewoon', 'gewoonweg', 'haar', 'had', 'hadden', 'hare', 'heb', 'hebben', 'hebt', 'heeft', 'hem', 'hen', 'het', 'hierbeneden', 'hierboven', 'hij', 'hoe', 'hoe

In [79]:
# filter uninteresting words
# Each tweet's text is split on whitespace, noise words are dropped, and the
# remaining words are re-joined with single spaces (one string per tweet, in
# the same order as `tweets`).
# Membership is tested against a set built once up front: looking every token
# up in the `noiseWords` list would rescan the whole list for each word.
# The comparison is exact (case-sensitive), so only tokens that match a noise
# word verbatim are removed.
noiseWordSet = set(noiseWords)

textFromTweets = [
    ' '.join(word for word in tweet['text'].split() if word not in noiseWordSet)
    for tweet in tweets
]

In [80]:
# count occurrences per word
from collections import Counter

# Feed every whitespace-separated token of every filtered tweet straight into
# the Counter instead of incrementing word by word.
wordCounts = Counter(
    token
    for filteredText in textFromTweets
    for token in filteredText.split()
)

# Persist the word -> count mapping as JSON.
with open("wordcounts.json", 'w') as outfile:
    json.dump(wordCounts, outfile)

In [81]:
# count occurrences per user and daily occurrences per user
# Requires `time`, `json` and `Counter` to be imported before this cell runs
# (a fresh-kernel Run All fails with NameError otherwise).
#
# userCounts:      screen_name -> number of tweets over the whole data set
# dailyUserCounts: 'YYYY-MM-DD' -> {screen_name -> number of tweets that day}
userCounts = Counter()
dailyUserCounts = {}

for tweet in tweets:
    user = tweet['user']['screen_name']
    # created_at is parsed with a literal '+0000' offset and reduced to its
    # calendar day, which serves as the grouping key.
    ts = time.strftime('%Y-%m-%d', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
    userCounts[user] += 1
    # Plain dicts (not Counters) are kept for the per-day level so the dumped
    # structure stays exactly as before.
    usersForDay = dailyUserCounts.setdefault(ts, {})
    usersForDay[user] = usersForDay.get(user, 0) + 1

with open("usercounts.json", 'w') as outfile:
    json.dump(userCounts, outfile)

with open("dailyusercounts.json", 'w') as outfile:
    json.dump(dailyUserCounts, outfile)

In [82]:
import time
# number of tweets per day
# Every tweet's created_at stamp is reduced to a 'YYYY-MM-DD' key and the
# Counter tallies how often each key occurs.
dailyCounts = Counter(
    time.strftime('%Y-%m-%d', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
    for tweet in tweets
)

# Persist the day -> tweet count mapping as JSON.
with open("dailycounts.json", 'w') as outfile:
    json.dump(dailyCounts, outfile)

In [83]:
# Basis for occurrences of words per day
# textFromTweetsPerDay: 'YYYY-MM-DD' -> list of noise-filtered tweet texts.
# A set of the noise words is built once so that each token lookup does not
# rescan the whole `noiseWords` list.
noiseWordSet = set(noiseWords)
textFromTweetsPerDay = {}

for tweet in tweets:
    ts = time.strftime('%Y-%m-%d', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
    keptWords = [word for word in tweet['text'].split() if word not in noiseWordSet]
    textFromTweetsPerDay.setdefault(ts, []).append(' '.join(keptWords))

# wordCountsFromTweetsPerDay: 'YYYY-MM-DD' -> [Counter of words for that day].
# NOTE(review): each day's Counter is wrapped in a one-element list, exactly
# as before; consumers of wordcountsperday.json presumably expect that shape
# -- confirm before flattening it.
wordCountsFromTweetsPerDay = {}
for day, dayTexts in textFromTweetsPerDay.items():
    dailyWordCounts = Counter()
    for text in dayTexts:
        dailyWordCounts.update(text.split())
    wordCountsFromTweetsPerDay[day] = [dailyWordCounts]

with open("wordcountsperday.json", 'w') as outfile:
    json.dump(wordCountsFromTweetsPerDay, outfile)

In [84]:
# Reshape json for convenience
# Combine the three per-day aggregates into one record per day, keyed by the
# days present in the per-day word counts.
keys = wordCountsFromTweetsPerDay.keys()

totalData = {
    day: {
        'count': dailyCounts[day],
        'user_counts': dailyUserCounts[day],
        'word_counts': wordCountsFromTweetsPerDay[day],
    }
    for day in keys
}

with open("aggregatedData.json", 'w') as outfile:
    json.dump(totalData, outfile)