# Putting all of the pieces together, this Notebook handles GNIP GeoJSONL files

In [1]:
import json, pprint, os, io, sys, PIL,urllib.request
import pandas as pd; import numpy as np
from IPython.display import display, Image
from urllib.request import urlopen

Where is the configuration file?

In [2]:
config_file = "hajj.config"

In [3]:
config = json.load(open(config_file,'r'))
pprint.pprint(config)

{'start_date': '2016-09-07',
 'tweets': '/data/hajj/gnip_tweets_full.jsonl',
 'web_root': '/data/www/tweetsonamap/hajj'}


In [4]:
start_date = pd.Timestamp(config['start_date'] + "T00:00:000Z")
start_date # Ensure this is UTC

Timestamp('2016-09-07 00:00:00+0000', tz='UTC')

## Step 1: Read in jsonl file.

In [5]:
tweets = []
with open(config['tweets'],'r') as inFile:
    for line in inFile:
        tweets.append(json.loads(line.strip()))
df = pd.DataFrame(tweets)
df = pd.DataFrame(df[df['twitter_extended_entities'].notnull()])
df['id'] = df.id.apply(lambda x: x.split(":")[2])

Two types of tweets: geo-tagged and geolocated. Identify these

In [146]:
geotagged_df  = df[df.geo.notnull()].copy()
geolocated_df = df[df.geo.isnull()].copy()

"Identified {0} geotagged tweets and {1} geolocated tweets ({2} total)".format(len(geotagged_df),len(geolocated_df), len(geolocated_df)+len(geotagged_df))

'Identified 86 geotagged tweets and 1962 geolocated tweets (2048 total)'

## Step 2: Process all of the images

First, get the raw img_url for all of the tweets

In [138]:
df['img_url'] = df.twitter_extended_entities.apply(lambda r: r['media'][0]['media_url'])

In [139]:
if not os.path.exists(config['web_root']+"/map_images"):
    os.makedirs(config['web_root']+"/map_images")

map_image_dir = config['web_root']+"/map_images/"

In [140]:
sizes = {"thumb":60,
         "small":150,
         "medium":300,
         "large":400,
         "original":-1 #basewidth value won't be used
        }

def download_and_resize(url, tweet_id):
    """
        Input: URL to image
        Returns: None
    """
    
    img_name = tweet_id + ".jpg"
    
    
    try:
        with urllib.request.urlopen(url) as img:
            f = io.BytesIO(img.read())
            orig_img = PIL.Image.open(f)
    except:
        e = sys.exc_info()[0]
        print(e,"Error on:",url)
        print(sys.exc_info())

    # Iterate over each size/basewidth
    for size,basewidth in sizes.items():

        # Make directory within map_image_dir for each size
        if not os.path.exists(map_image_dir+size):
            os.makedirs(map_image_dir+size)
            print("making directory for",size)

        # Set filename to save resized image and break to the next image if it already exists
        new_path = map_image_dir+size+"/"+img_name
        if os.path.isfile(new_path):
            break

        # Don't resize images in original folder
        if size == "original":
            orig_img.save(new_path)
        else:
            try:
                # Set height proportional to fixed basewidth from 
                # https://opensource.com/life/15/2/resize-images-python
                wpercent = (basewidth / float(orig_img.size[0]))
                hsize = int((float(orig_img.size[1]) * float(wpercent))) 
                resized_img = orig_img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
                resized_img.save(new_path)
#                     display(Image(filename=orig_path, width=basewidth, height=hsize)) # display resized image
            except:
                e = sys.exc_info()[0]
                print(e,"Error on:",img_name)
                break

In [141]:
for idx, tweet in df.iterrows():
    download_and_resize(tweet.img_url, tweet.id)
    sys.stderr.write("\r {0}: {1}, {2}".format(idx, tweet.id, tweet.img_url))

 37: 773490012331577344, http://pbs.twimg.com/media/CrXkDEuXYAQZg8Q.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CrXkDEuXYAQZg8Q.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d3d0de48>)
<class 'UnboundLocalError'> Error on: 773490012331577344.jpg


 798: 774730112566697984, http://pbs.twimg.com/media/CsA87z3XYAA8X4z.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CsA87z3XYAA8X4z.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d8507948>)
<class 'UnboundLocalError'> Error on: 774730112566697984.jpg


 1800: 775772519324381184, http://pbs.twimg.com/media/CsPu6HwXgAAtX_O.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CsPu6HwXgAAtX_O.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d87a2108>)
<class 'UnboundLocalError'> Error on: 775772519324381184.jpg


 2012: 776121452760367104, http://pbs.twimg.com/media/CsVV2IXWgAAVbUS.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CsVV2IXWgAAVbUS.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d3deaa88>)
<class 'UnboundLocalError'> Error on: 776121452760367104.jpg


 2425: 777198381890502656, http://pbs.twimg.com/media/CsjlXHPWgAAETbA.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CsjlXHPWgAAETbA.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d3e22048>)
<class 'UnboundLocalError'> Error on: 777198381890502656.jpg


 2501: 778019121069953026, http://pbs.twimg.com/media/CswUQfkWgAAEaTw.jpg

## Step 2: Write these geojson files

### First, the geotagged tweets

In [168]:
feats = []
for idx, tweet in geotagged_df.iterrows():
    if os.path.exists(map_image_dir+"/thumb/"+tweet.id+".jpg"):
        jsonObj = {  "type"       : "Feature",
                     "geometry"   : {  "type" : "Point", 
                                       "coordinates" : list(reversed(tweet['geo']['coordinates']))},
                     "properties" : {  "id" : tweet.id,
                                       "user" : tweet['actor']['preferredUsername'],
                                       "text" : tweet['body'],
                                       "day" : int( ( pd.Timestamp(tweet['postedTime']) - start_date ).total_seconds() /(3600*24) )
           }
         }
        feats.append(jsonObj)

In [170]:
with open(config['web_root']+'/geotagged-tweets.geojson','w') as outFile:
    json.dump({'type':"FeatureCollection","features":feats},outFile)

### Second, the geolocated tweets

In [171]:
geolocated_df['location_string'] = geolocated_df.location.apply(lambda x: str(x['geo']['coordinates']))
#TODO: Should filter this based on whether or not the image was downloaded?

In [173]:
gb_loc = geolocated_df.groupby('location_string').aggregate({
        'id' : {
            'Tweets' : 'count',
            'IDs'    : lambda x: [y for y in x.values]
         },
        'body': {
            'text'   : lambda x: [y for y in x.values]
         },
        'postedTime' : {
            'day'   : lambda x: [int( ( pd.Timestamp(y) - start_date ).total_seconds() /(3600*24) ) for y in x.values]
         },
        'actor'      : {
            'user'   : lambda x: [y['preferredUsername'] for y in x.values]
         },
        'location' : {
            'geojson'     : lambda x: x.values[0]['geo'],
            'displayName' : lambda x: x.values[0]['displayName']
         }
    })
gb_loc.columns = gb_loc.columns.droplevel()

In [174]:
gb_loc.sort_values(by='Tweets', ascending=False).head(2)

Unnamed: 0_level_0,text,user,day,Tweets,IDs,geojson,displayName
location_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"[[[42.010156, 19.471337], [42.010156, 27.574819], [48.217206, 27.574819], [48.217206, 19.471337]]]",[حج مبرور و سعي مشكور \nاهلا ضيوف الرحمن https...,"[bashry_bkb, zozo_2432, riyadiyatv, awtannews,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",502,"[773401676841648128, 773417749796225024, 77343...","{'coordinates': [[[42.010156, 19.471337], [42....","الرياض, المملكة العربية السعودية"
"[[[39.571438, 21.165831], [39.571438, 21.613835], [40.031935, 21.613835], [40.031935, 21.165831]]]","[SS1:I WANT TO LOSE#WEIGHT,BY CYCLE OR GO TO F...","[paradiseticket1, vip50133, 1412Mshmsh, alshad...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, ...",209,"[773314636242378752, 773326875024326656, 77332...","{'coordinates': [[[39.571438, 21.165831], [39....","مكة المكرمة, المملكة العربية السعودية"


In [175]:
regions = []
for idx, row in gb_loc.copy().iterrows():
    
    json_obj = {'type':'Feature',
                'properties' : {
                        'count'       : row['Tweets'],
                        'tweets'      : [],
                        'displayName' : row['displayName']
                },
                'geometry'  : row['geojson']
               }
    
    #Stupid Twitter --> They only have 4 coordinates, need to make it a valid polygon
    json_obj['geometry']['coordinates'][0].append(json_obj['geometry']['coordinates'][0][0])
    
    for idx, tweet_id in enumerate(row['IDs']):
        
        json_obj['properties']['tweets'].append({
                 'id': tweet_id,
                 'text': row['text'][idx],
                 'user': row['user'][idx],
                 'day':  row['day'][idx]
            })
        
    regions.append(json_obj)
print(len(regions))

291


In [176]:
geojson_feature_collection = {'type':"FeatureCollection", "features" : regions}

In [177]:
with open(config['web_root']+'/image-tweets-by-polygon.geojson','w') as out:
    json.dump(geojson_feature_collection,out)

### Third, Write tweets per day CSV

In [6]:
# Format: 2016-09-25,140008                                                                                                                   

df['day'] = df.postedTime.apply(lambda x: pd.Timestamp(x).date())
gb_day = df.groupby('day').agg({'id': 'count'})
gb_day.head()

Unnamed: 0_level_0,id
day,Unnamed: 1_level_1
2016-09-07,91
2016-09-08,99
2016-09-09,199
2016-09-10,299
2016-09-11,449


In [8]:
with open(config['web_root']+"/tweets_per_day.csv",'w') as tweetsPerDayOut:
    tweetsPerDayOut.write("postedDate2,count\n")
    for idx, r in gb_day.iterrows():
        tweetsPerDayOut.write(str(r.name)+","+str(r.id)+"\n")

# Almost Done... Now go run Stage 2!
Stage 2 handles the geo metadata and creates more point geojsons with turf