# GeenPeil Monitor: Exploratory Data Analysis

In [4]:
import json
import locale
import re
import string
import time
from collections import Counter
# Switch every locale category to the user's default locale (the '' argument).
# NOTE(review): later cells parse 'created_at' with time.strptime using the
# %a/%b directives, which match the *locale's* day and month abbreviations.
# Under a non-English default locale the English timestamps would presumably
# fail to parse -- confirm, or pin LC_TIME to 'C'.
locale.setlocale(locale.LC_ALL, '')


'en_US.UTF-8'

In [5]:
# Read the tweet dump into `tweets`; the following cells index it by position
# (tweets[0], tweets[i]['user'], ...).
# The with-block closes the file handle as soon as parsing is done, json.load
# parses straight from the handle instead of keeping a second full copy of the
# file as a string, and the explicit encoding makes the read independent of
# the machine's locale (JSON text is UTF-8).
with open('data-17_03_2016.json', encoding='utf-8') as tweetsFile:
    tweets = json.load(tweetsFile)

In [6]:
# How does a tweet look? Show the first record of the dump in full.
firstTweet = tweets[0]
print(firstTweet)

{'favorite_count': 0, 'created_at': 'Thu Feb 25 10:15:30 +0000 2016', 'is_quote_status': False, 'contributors': None, 'in_reply_to_user_id_str': '371932423', 'retweeted': False, 'user': {'id': 3576975202, 'time_zone': 'Pacific Time (US & Canada)', 'name': 'Valentino-v.Gogh', 'following': None, 'notifications': None, 'screen_name': 'Backenricker', 'profile_sidebar_fill_color': '000000', 'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/657929087252627456/kPGzL3vR.jpg', 'profile_use_background_image': True, 'profile_text_color': '000000', 'verified': False, 'created_at': 'Mon Sep 07 12:39:34 +0000 2015', 'followers_count': 1389, 'listed_count': 335, 'statuses_count': 25393, 'description': 'Art Music Science Maths Physics QM Tech HR Humanity Nature. T mostly Dutch. ★ Blackstar, Shining, UFO collector(7). Quarky. RT/... not endorsement', 'follow_request_sent': None, 'default_profile': False, 'geo_enabled': False, 'profile_sidebar_border_color': '000000'

In [36]:
# Tally how often each expanded URL occurs over the url entities of all tweets
linksCounter = Counter(
    urlEntity['expanded_url']
    for tweet in tweets
    for urlEntity in tweet['entities']['urls']
)

# Keep only the links that were shared more than 100 times
totalUrls = [
    {"url": str(link), "count": count}
    for link, count in linksCounter.items()
    if count > 100
]

# Export the frequently shared links
with open("urls.json", 'w') as outfile:
    json.dump(totalUrls, outfile)

In [None]:
# How does a tweet location look?
# Tally the 'location' field of the user object attached to each tweet
locationCounter = Counter(tweet['user']['location'] for tweet in tweets)
print(locationCounter)

# Number of tweets whose 'coordinates' field is filled in; this is a per-tweet
# count, so identical coordinates are not collapsed
coordsCount = sum(1 for tweet in tweets if tweet['coordinates'] is not None)
print(coordsCount)

In [None]:
# Which headers are in a tweet? List the top-level keys of the first record.
tweetHeaders = tweets[0].keys()
print(tweetHeaders)

In [None]:
# Which headers are in the user object? List the keys of the first tweet's user.
userHeaders = tweets[0]['user'].keys()
print(userHeaders)

# Print the text of every tweet posted under the screen name "VVD"
for tweet in tweets:
    if tweet['user']['screen_name'] == "VVD":
        print(tweet['text'])

In [None]:
# Read the Dutch noise (stop) words file: one word per line, trailing
# whitespace and newline removed.
# The with-block closes the file handle once the list is built. The explicit
# encoding keeps the read independent of the machine's locale.
# NOTE(review): assumes the list is stored as UTF-8 -- confirm.
with open('stopwoordenlijst.txt', 'r', encoding='utf-8') as f:
    noiseWords = [line.rstrip() for line in f]
print(noiseWords)

In [None]:
# Filter uninteresting words: lower-case every token of a tweet, strip all
# characters that are not ASCII letters and drop the Dutch noise words.
# NOTE(review): the pattern removes everything outside a-z/A-Z, so accented
# letters disappear as well (e.g. 'oekraïne' becomes 'oekrane') -- confirm
# that this is intended.
pattern = re.compile('[^a-zA-Z]+')

# Set copy of the noise words: constant-time membership tests instead of
# scanning the whole list for every token of every tweet.
noiseWordSet = set(noiseWords)


def cleanTweetText(text):
    """Return the cleaned, space-joined tokens of one tweet text.

    A token is dropped when it is a noise word either before or after
    stripping, so a noise word followed by punctuation ('de,') no longer
    slips through as 'de'. Tokens that are empty after stripping are
    dropped too.
    """
    kept = []
    for rawWord in text.split():
        lowered = rawWord.lower()
        stripped = pattern.sub('', lowered)
        if stripped and lowered not in noiseWordSet and stripped not in noiseWordSet:
            kept.append(stripped)
    return ' '.join(kept)


# One cleaned string per tweet, in the order of `tweets`
textFromTweets = [cleanTweetText(tweet['text']) for tweet in tweets]

In [None]:
# Count occurrences per word over all cleaned tweet texts
wordCounts = Counter(
    token
    for cleanedText in textFromTweets
    for token in cleanedText.split()
)

# Export the complete word frequency table
with open("wordcounts.json", 'w') as outfile:
    json.dump(wordCounts, outfile)

In [None]:
# Count occurrences per word, new format: a list of {"text", "size"} records
# for words that occur more than 200 times, are at least 2 characters long
# and are not noise words.
wordsList = [
    {"text": term, "size": frequency}
    for term, frequency in wordCounts.items()
    if frequency > 200 and len(term) >= 2 and term not in noiseWords
]

# Export the records to the public folder of the GeenPeilMonitor server
with open("GeenPeilMonitor/server/public/words.json", 'w') as outfile:
    json.dump(wordsList, outfile)

In [None]:
# Count tweets per user overall, and per user for each calendar day
userCounts = Counter()
dailyUserCounts = {}

for tweet in tweets:
    user = tweet['user']['screen_name']
    # Reduce the 'created_at' timestamp to its date part (YYYY-MM-DD)
    createdAt = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    ts = time.strftime('%Y-%m-%d', createdAt)
    userCounts[user] += 1
    # Create the day's bucket on first sight, then bump this user's tally
    countsForDay = dailyUserCounts.setdefault(ts, {})
    countsForDay[user] = countsForDay.get(user, 0) + 1

# Export both tables
with open("usercounts.json", 'w') as outfile:
    json.dump(userCounts, outfile)

with open("dailyusercounts.json", 'w') as outfile:
    json.dump(dailyUserCounts, outfile)

In [None]:
# Number of tweets per day, keyed by the date part (YYYY-MM-DD) of 'created_at'
dailyCounts = Counter()

for tweet in tweets:
    createdAt = time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    dailyCounts[time.strftime('%Y-%m-%d', createdAt)] += 1

# Export the per-day totals
with open("dailycounts.json", 'w') as outfile:
    json.dump(dailyCounts, outfile)

In [None]:
# Basis for occurrences of words per day: group the cleaned tweet texts by the
# date part (YYYY-MM-DD) of 'created_at'.
# Set copy of the noise words: constant-time membership tests instead of
# scanning the whole list for every token of every tweet.
noiseWordSet = set(noiseWords)
textFromTweetsPerDay = {}

for tweet in tweets:
    ts = time.strftime('%Y-%m-%d', time.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
    kept = []
    for rawWord in tweet['text'].split():
        lowered = rawWord.lower()
        stripped = pattern.sub('', lowered)
        # Test against the noise words before AND after stripping, so a noise
        # word followed by punctuation ('de,') no longer slips through as 'de'.
        # Tokens that are empty after stripping are dropped as well.
        if stripped and lowered not in noiseWordSet and stripped not in noiseWordSet:
            kept.append(stripped)
    textFromTweetsPerDay.setdefault(ts, []).append(' '.join(kept))

# Count the words of each day. Every day occurs exactly once as a key of
# textFromTweetsPerDay, so its entry can be assigned directly; the counts stay
# nested under 'text' to keep the layout of the exported JSON unchanged.
wordCountsFromTweetsPerDay = {}
for day, dayTexts in textFromTweetsPerDay.items():
    dailyWordCounts = Counter()
    for cleanedText in dayTexts:
        dailyWordCounts.update(cleanedText.split())
    wordCountsFromTweetsPerDay[day] = {'text': dict(dailyWordCounts)}

# Spot-check a single day; guarded so that a dump without this date does not
# abort the cell with a KeyError before the export below.
sampleDay = "2016-02-27"
if sampleDay in wordCountsFromTweetsPerDay:
    print(wordCountsFromTweetsPerDay[sampleDay]['text'])

with open("wordcountsperday.json", 'w') as outfile:
    json.dump(wordCountsFromTweetsPerDay, outfile)

In [None]:
# Per day: the date, the total number of tweets and the words that occur more
# than 60 times that day, are at least 2 characters long and are not noise words
dailyWordsList = []

for day, dayEntry in wordCountsFromTweetsPerDay.items():
    frequentWords = [
        {"text": term, "size": frequency}
        for term, frequency in dayEntry['text'].items()
        if frequency > 60 and len(term) >= 2 and term not in noiseWords
    ]
    dailyWordsList.append({"date": day, "total": dailyCounts[day], "words": frequentWords})

# Export the records to the public folder of the GeenPeilMonitor server
with open("GeenPeilMonitor/server/public/dailywords.json", 'w') as outfile:
    json.dump(dailyWordsList, outfile)

In [None]:
# Reshape json for convenience: one entry per day that bundles the tweet total,
# the per-user counts and the per-word counts of that day
totalData = {
    day: {
        'count': dailyCounts[day],
        'user_counts': dailyUserCounts[day],
        'word_counts': dayWordCounts,
    }
    for day, dayWordCounts in wordCountsFromTweetsPerDay.items()
}

with open("aggregatedData.json", 'w') as outfile:
    json.dump(totalData, outfile)

In [None]:
# Check out the most common users: show only the top of the ranking as
# (screen_name, tweet count) pairs instead of dumping one entry per user
print(userCounts.most_common(20))