### Twitter data
Many social networks, including Facebook, Twitter, and OkCupid, record geolocation information. 
Using geolocation data, one can observe or infer what is happening around the world. This notebook was inspired by earlier work on mapping surfers' locations. 

As sources of data and ideas we can use:
1. Openhumans.org https://exploratory.openhumans.org/notebook/1/
2. Public repositories: https://gwu-libraries.github.io/sfm-ui/posts/2017-09-14-twitter-data and networkrepository.org 
3. DOLLY (Digital Online Life and You) data http://www.floatingsheep.org/2010/01/googles-geographies-of-religion.html  


<img src="Where_Surfers_Travel.png" alt="Drawing" style="width: 900px;"/>


*Notebook dedicated to the visit of Lisa from CorrelAid*

### Main questions of this notebook 

1. Using open Twitter data, we infer information about major emergency events around the world. 
2. We also try to infer information about people's travels. 
3. Using Twitter data, we can also show that the usual borders drawn on our maps do not really exist.

In [2]:

import os
import json
import requests
from datetime import datetime
from collections import defaultdict
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tempfile
import zipfile
import pytz
import io
import sys
from textblob import TextBlob
#import emoji

# time window for the twitter data
DATARANGE_START = "2016-06-01"
DATARANGE_END = "2018-05-08"

# presumably an API key for the Dark Sky service; left empty and not used by
# the code visible in this notebook -- TODO confirm
DARKSKY_KEY = ''


# seaborn/matplotlib font sizes (same values as in Bastian's setup):
# one entry per rc parameter so single sizes are easy to tweak
rc = {
    'font.size': 14,
    'axes.labelsize': 14,
    'legend.fontsize': 14.0,
    'axes.titlesize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
}
sns.set(rc=rc)


# THIS CODE BELOW IS COPIED FROM TWARXIV.ORG AS IT ALREADY DOES EXACTLY WHAT WE WANT FOR READING IN THE DATA
# This code is also using some Bastian code  https://exploratory.openhumans.org/notebook/1/ 
# Other authors: Liuba, Lisa, Jon

# READ JSON FILES FROM TWITTER ARCHIVE!

def check_hashtag(single_tweet):
    '''
    tell whether the tweet carries at least one hashtag.
    reads single_tweet['entities']['hashtags'] and returns a bool
    '''
    hashtags = single_tweet['entities']['hashtags']
    return len(hashtags) != 0


def check_media(single_tweet):
    '''
    check whether tweet has any media attached.

    reads single_tweet['entities']['media'] and returns True when that list
    is non-empty. A tweet whose entities have no 'media' key at all (or a
    null one) is treated as having no media instead of raising KeyError.
    '''
    # 'media' may be absent when nothing is attached, so fall back to an
    # empty list rather than indexing the key directly
    media = single_tweet['entities'].get('media') or []
    return len(media) > 0


def check_url(single_tweet):
    '''
    tell whether the tweet links to at least one url.
    reads single_tweet['entities']['urls'] and returns a bool
    '''
    entities = single_tweet['entities']
    url_count = len(entities['urls'])
    return url_count > 0


def check_retweet(single_tweet):
    '''
    find out whom a tweet retweets.
    for a RT (tweet has a 'retweeted_status' entry) return the tuple
    (screen_name, name) of the user that was retweeted.
    for anything else return (None, None)
    '''
    # guard clause: ordinary tweets have no retweeted_status at all
    if 'retweeted_status' not in single_tweet:
        return (None, None)
    original_author = single_tweet['retweeted_status']['user']
    return (original_author['screen_name'], original_author['name'])


def check_coordinates(single_tweet):
    '''
    check whether tweet has coordinates of location attached.

    returns the first two entries of single_tweet['geo']['coordinates'] as a
    tuple (create_dataframe stores them as latitude and longitude, in that
    order). returns (None, None) when the tweet has no usable location:
    'geo' missing, null or empty, or 'coordinates' missing, null or shorter
    than two entries.
    '''
    # 'geo' can be an empty dict, null, or absent altogether; treat all of
    # these the same way instead of failing on None.keys()
    geo = single_tweet.get('geo') or {}
    coordinates = geo.get('coordinates')
    if not coordinates or len(coordinates) < 2:
        return (None, None)
    return (coordinates[0], coordinates[1])


def check_reply_to(single_tweet):
    '''
    find out whom a tweet replies to.
    for a reply (tweet has an 'in_reply_to_screen_name' entry) return the
    tuple (screen_name, name) of the user replied to; the name is looked up
    in the tweet's user mentions and is None if no mention matches.
    for anything else return (None, None)
    '''
    if 'in_reply_to_screen_name' not in single_tweet:
        return (None, None)
    mentions = single_tweet['entities']['user_mentions']
    # first mention whose screen name equals the reply target, if any
    display_name = next(
        (mention['name'] for mention in mentions
         if mention['screen_name'] == single_tweet['in_reply_to_screen_name']),
        None)
    return (single_tweet['in_reply_to_screen_name'], display_name)


def create_dataframe(tweets):
    '''
    turn a list of tweet jsons into a pandas dataframe, one row per tweet.

    columns (in this order): utc_time, latitude, longitude, hashtag, media,
    url, retweet_user_name, retweet_name, reply_user_name, reply_name, text.
    utc_time is parsed from 'created_at' with the format
    '%Y-%m-%d %H:%M:%S %z'; the other columns come from the check_* helpers
    and the raw tweet text.
    '''
    column_names = ('utc_time', 'latitude', 'longitude', 'hashtag', 'media',
                    'url', 'retweet_user_name', 'retweet_name',
                    'reply_user_name', 'reply_name', 'text')
    # one growing list per column, keyed by column name
    columns = {column: [] for column in column_names}

    for tweet in tweets:
        created = datetime.strptime(tweet['created_at'],
                                    '%Y-%m-%d %H:%M:%S %z')
        columns['utc_time'].append(created)

        first_coordinate, second_coordinate = check_coordinates(tweet)
        columns['latitude'].append(first_coordinate)
        columns['longitude'].append(second_coordinate)

        columns['hashtag'].append(check_hashtag(tweet))
        columns['media'].append(check_media(tweet))
        columns['url'].append(check_url(tweet))

        rt_screen_name, rt_name = check_retweet(tweet)
        columns['retweet_user_name'].append(rt_screen_name)
        columns['retweet_name'].append(rt_name)

        reply_screen_name, replied_name = check_reply_to(tweet)
        columns['reply_user_name'].append(reply_screen_name)
        columns['reply_name'].append(replied_name)

        columns['text'].append(tweet['text'])

    # dict insertion order fixes the column order of the dataframe
    return pd.DataFrame(data=columns)


def read_files(zip_url):
    '''
    download a twitter archive zip and parse every tweet file listed in it.

    The archive's 'data/js/tweet_index.js' names the per-period tweet files;
    each of those is parsed and handed to create_dataframe.

    zip_url: URL of the zipped twitter archive.
    returns: list of dataframes, one per file named in the index.
    raises: requests.HTTPError when the download answers with an error
            status (instead of failing later while opening the zip).
    '''
    print('downloading files')
    response = requests.get(zip_url)
    response.raise_for_status()
    data_frames = []
    # The downloaded bytes are already in memory, so open the zip straight
    # from them: no temp file is left open, and nothing has to be reopened
    # by name (which does not work on Windows while the file is still open).
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        print('reading index')
        # decode explicitly as UTF-8 rather than the platform default
        with io.TextIOWrapper(zf.open('data/js/tweet_index.js', 'r'),
                              encoding='utf-8') as f:
            index_lines = f.readlines()[1:]
        # the first line holds the javascript assignment together with the
        # opening "[ {", so it is dropped and the brackets are put back
        json_files = json.loads("[{" + "".join(index_lines))
        print('iterate over individual files')
        for single_file in json_files:
            print('read ' + single_file['file_name'])
            with io.TextIOWrapper(zf.open(single_file['file_name']),
                                  encoding='utf-8') as f:
                # first line is the javascript assignment, the rest is JSON
                tweet_lines = f.readlines()[1:]
            tweets = json.loads("".join(tweet_lines))
            data_frames.append(create_dataframe(tweets))
    return data_frames


def create_main_dataframe(zip_url='http://ruleofthirds.de/test_archive.zip'):
    '''
    build the single, time-indexed dataframe for a whole twitter archive.

    Downloads and parses the archive via read_files, concatenates the
    per-file dataframes, sorts them newest-first by 'utc_time', makes
    'utc_time' the index and replaces False with None in the 'url',
    'hashtag' and 'media' columns.
    '''
    print('reading files')
    per_file_frames = read_files(zip_url)
    print('concatenating...')
    # a separate {False: None} mapping for each of the boolean flag columns
    false_to_none = {column: {False: None}
                     for column in ('url', 'hashtag', 'media')}
    return (
        pd.concat(per_file_frames)
        .sort_values('utc_time', ascending=False)
        .set_index('utc_time')
        .replace(to_replace=false_to_none)
    )

### Loading data and analyzing locations

Let us plot the coordinates of a Twitter user's tweets using the function *check_coordinates(single_tweet)*. 
Each user then has their own trajectory of the places they tweeted from.

The later goal is to get the geographical locations of tweets whose hashtags mention conferences.

In [1]:



import requests
import json
import os 
# Disabled draft kept as a bare string literal, so none of it is executed.
# It would request the Open Humans exchange-member endpoint and download each
# listed entry's 'download_url' into public_data, keyed by username.
# NOTE(review): the loop indexes response['results'] on the requests response
# object rather than on the parsed `user` payload -- fix before re-enabling.
'''

# read data from online file of OpenHumans
response = requests.get("https://www.openhumans.org/api/direct-sharing/project/exchange-member/?access_token={}".format(os.environ.get('OH_ACCESS_TOKEN')))
user = json.loads(response.content)
has_twitter = False

public_data = {}
for i in response['results']:
    data = requests.get(i['download_url']).json()
    public_data[i['user']['username']] = data
        
''' 

# Ask Open Humans for the data shared by this member. The access token comes
# from the OH_ACCESS_TOKEN environment variable.
oh_access_token = os.environ.get('OH_ACCESS_TOKEN')
if not oh_access_token:
    # Previously an unset variable was formatted into the URL as the text
    # "None", and the only symptom was an "Invalid token." payload.
    print("WARNING: OH_ACCESS_TOKEN is not set; the request below will most likely be rejected")
# `params` lets requests build and URL-encode the query string itself
response = requests.get(
    "https://www.openhumans.org/api/direct-sharing/project/exchange-member/",
    params={'access_token': oh_access_token})
if not response.ok:
    # keep going so the error payload is still printed below, but say so
    print("WARNING: Open Humans request failed with HTTP status {}".format(response.status_code))
user = json.loads(response.content)


#printing data 
print(json.dumps(user, indent=4, sort_keys=True))

# flags for the data sources found in the member's account; nothing in the
# active code sets them to True (the code that would is disabled below)
has_twitter = False
has_moves = False
# Disabled code kept as a bare string literal, so none of it is executed.
# Being the last expression of the cell, the string itself is what the
# notebook echoes as the cell's output.
# The disabled code would pick the twitter / moves download URLs out of
# user['data'] (sources "direct-sharing-70" and "direct-sharing-138") and then
# parse a local tweet_index.js file.
# NOTE(review): twitter_file is an absolute path on one particular machine;
# make it configurable before re-enabling.
'''
# get our download URLs
for entry in user['data']:
    if entry['source'] == "direct-sharing-70":
        twitter_data_url = entry['download_url']
        has_twitter = True
    if entry['source'] == "direct-sharing-138":
        moves_data_url = entry['download_url']
        has_moves = True
if not has_twitter:
    print("YOU NEED TO HAVE SOME TWITTER DATA IN YOUR ACCOUNT TO USE THIS NOTEBOOK")
    print("GO TO http://twarxiv.org TO UPLOAD IT")



# read the twitter data
#twitter_data = create_main_dataframe(zip_url=twitter_data_url)

# load json file: convert js file to json and then read it
twitter_file = 'C:/Users/lyubo/Documents/PYTHON/jupiter_notebook/twitter_data_analysis/account1/data/js/tweet_index.js'



with open(twitter_file) as dataFile:
    data = dataFile.read()
    obj = data[data.find('{') : data.rfind('}')+1]
    jsonObj = json.loads(obj)
'''


{
    "detail": "Invalid token."
}


'\n# get our download URLs\nfor entry in user[\'data\']:\n    if entry[\'source\'] == "direct-sharing-70":\n        twitter_data_url = entry[\'download_url\']\n        has_twitter = True\n    if entry[\'source\'] == "direct-sharing-138":\n        moves_data_url = entry[\'download_url\']\n        has_moves = True\nif not has_twitter:\n    print("YOU NEED TO HAVE SOME TWITTER DATA IN YOUR ACCOUNT TO USE THIS NOTEBOOK")\n    print("GO TO http://twarxiv.org TO UPLOAD IT")\n\n\n\n# read the twitter data\n#twitter_data = create_main_dataframe(zip_url=twitter_data_url)\n\n# load json file: convert js file to json and then read it\ntwitter_file = \'C:/Users/lyubo/Documents/PYTHON/jupiter_notebook/twitter_data_analysis/account1/data/js/tweet_index.js\'\n\n\n\nwith open(twitter_file) as dataFile:\n    data = dataFile.read()\n    obj = data[data.find(\'{\') : data.rfind(\'}\')+1]\n    jsonObj = json.loads(obj)\n'